#!/usr/bin/perl
# File  : fixhw.pl
# Author: Lyndon Hill, http://www.lyndonhill.com
# Note  : This script is free to use. If you make significant improvements please
#         send me a copy.
#
# Script to fix headwords
#
# If a headword is defined in one sense then bedic repeats it in the next sense,
# however, in my opinion this is wrong - it should be limited to the sense it is defined in.
# This really screws up my duplicate fix script because the headword gets propagated to the
# next sense.
#
# This script reads the dictionary; when it finds an entry it looks for a sense and
# headword. If there is no headword then it immediately outputs the sense, but if there
# is a headword then it stores it. At the end of the entry, all stored senses are output.
# The result is that all headword-less senses should be at the start of the entry.
#
# The requirement is that your dictionary is formatted nicely, e.g.:
#
# entry
# {s}
# {hw}headword{/hw}
# definition 1
# {/s}
# {s}
# definition 2
# {/s}
#
# will be corrected to:
#
# entry
# {s}
# definition 2
# {/s}
# {s}
# {hw}headword{/hw}
# definition 1
# {/s}
#
# If it is important to you that definition 1 comes before definition 2 (in most dictionaries
# I'm guessing it doesn't matter because you wouldn't define a headword unless it's a variant),
# then you should define a headword for definition 2. However, if you always define a headword
# for every sense then your dictionary will take up more disk space.
use strict;
use warnings;

# Here are some variables you can change

# input dictionary file
my $dicfile = "en-ka.bedic-new";

# output file
my $output = "en-ka.bedic-hwfix";

# Start of script

# Three-argument open with lexical handles; stop immediately if either file
# cannot be opened instead of silently producing an empty result.
open(my $in, '<', $dicfile)
    or die "Cannot open '$dicfile' for reading: $!\n";
open(my $out, '>', $output)
    or die "Cannot open '$output' for writing: $!\n";

# The header is everything up to and including the first blank line.
# It is copied to the output unchanged.
while (defined(my $line = <$in>)) {
    chomp $line;
    print {$out} "$line\n";
    last if $line eq "";
}

my $ecount = 0;    # total number of entries

# After the header, every line reached by this loop is the first line of an
# entry: reorder_senses() consumes the rest of the entry up to and including
# the blank line that terminates it.
while (defined(my $entry = <$in>)) {
    chomp $entry;
    print {$out} "$entry\n";
    $ecount++;
    reorder_senses($in, $out);
}

close($in);

# Buffered write errors only surface when the handle is closed.
close($out) or die "Error while writing '$output': $!\n";

print "$ecount entries.\n";
print "DONE!\n";

exit;

# reorder_senses($in, $out)
#
# Reads the body of one entry from $in (the entry line itself has already
# been consumed by the caller) and writes it to $out.
#
# A sense starts with a "{s}" line. If the line directly after "{s}" contains
# "{hw}", the whole sense is held back; otherwise it is written straight
# through. When the blank line that ends the entry is reached, the held-back
# senses are written, followed by the blank line. The net effect is that
# senses without a headword come first within the entry.
#
# If the input ends before a terminating blank line is seen, the held-back
# senses are still written so that no data is lost; no blank line is added
# in that case.
sub reorder_senses {
    my ($in, $out) = @_;

    my @stored;         # lines of senses that define a headword
    my $storing = 0;    # true while the current sense is being held back

    while (defined(my $line = <$in>)) {
        chomp $line;

        if ($line eq "") {
            # end of entry, dump stored sense lines
            print {$out} "$_\n" for @stored;
            print {$out} "\n";
            return;
        }

        if ($line eq "{s}") {
            # The line after "{s}" decides whether this sense has a headword.
            my $first = <$in>;
            if (!defined $first) {
                # Input ended right after "{s}": there is no headword line,
                # so the sense marker is written straight through.
                print {$out} "{s}\n";
                last;
            }
            chomp $first;

            if ($first =~ /\{hw\}/) {
                push @stored, "{s}", $first;
                $storing = 1;
            }
            else {
                # no headword - output immediately
                print {$out} "{s}\n";
                print {$out} "$first\n";
                $storing = 0;
            }
            next;
        }

        # Not the start of a sense: the line follows the fate of its sense.
        if ($storing) {
            push @stored, $line;
        }
        else {
            print {$out} "$line\n";
        }
    }

    # End of input without a terminating blank line: flush what was held back.
    print {$out} "$_\n" for @stored;
    return;
}