transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit 186074f5c746c6f3452da864722a9fd7da2dd964
parent 1adf70462c0ec838b8611ce505c139862a768e6d
Author: lumidify <nobody@lumidify.org>
Date:   Mon, 30 Mar 2020 13:04:56 +0200

Fix 'Add replacement' with words requiring expansions in the GUI; tests currently broken

Diffstat:
Atests/data/endings2.txt | 2++
Mtests/data/words.txt | 1+
Atests/test4/config | 18++++++++++++++++++
Atests/test4/descr.txt | 1+
Atests/test4/err.txt | 3+++
Atests/test4/expected.txt | 3+++
Atests/test4/input.txt | 1+
Mtransliterate.pl | 196++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
8 files changed, 163 insertions(+), 62 deletions(-)

diff --git a/tests/data/endings2.txt b/tests/data/endings2.txt @@ -0,0 +1,2 @@ +end4 end4_replaced +end5 end5_replaced diff --git a/tests/data/words.txt b/tests/data/words.txt @@ -8,3 +8,4 @@ word6 word6_replaced word7 word7_replaced word8 word8_replaced word9 word9_replaced +word10 word10_replaced diff --git a/tests/test4/config b/tests/test4/config @@ -0,0 +1,18 @@ +split "\s+" +beforeword "\s" +afterword "\s" + +ignore "../data/ignore.txt" +table words "../data/words.txt" +table words1 "../data/words.txt" +table endings "../data/endings.txt" +table endings2 "../data/endings2.txt" + +expand words endings noroot +expand words endings2 noroot +expand words1 endings2 noroot + +group beginword endword +replace words +replace words1 +endgroup diff --git a/tests/test4/descr.txt b/tests/test4/descr.txt @@ -0,0 +1 @@ +matchignore only endword; expand noroot diff --git a/tests/test4/err.txt b/tests/test4/err.txt @@ -0,0 +1,3 @@ +Unknown word: "word0" +Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices. +Unknown word: "-dword9end2" diff --git a/tests/test4/expected.txt b/tests/test4/expected.txt @@ -0,0 +1,3 @@ +ignore +word0 word1_replacedend1r1|word1_replacedend1r2 +-dword9end2 word9_replacedend2r-d diff --git a/tests/test4/input.txt b/tests/test4/input.txt @@ -0,0 +1 @@ +word10end1end5 word10end5 diff --git a/transliterate.pl b/transliterate.pl @@ -561,28 +561,7 @@ sub load_ignore_table { # 0 - an error occurred # 1 - everything's fine sub expand_table { - my ($cmd, $tables, $config) = @_; - my $table_name = $cmd->[1]->{"value"}; - my $new_table_name = $cmd->[2]->{"value"}; - my $forms_name = $cmd->[3]->{"value"}; - if ($new_table_name eq "same") { - $new_table_name = $table_name; - } - if (!exists($tables->{$table_name})) { - warn "expand_table: table \"$table_name\" does not exist.\n"; - return 0; - } - if (!exists($tables->{$forms_name})) { - warn "expand_table: table \"$forms_name\" does not exist.\n"; - return 0; - } - my $table = $tables->{$table_name}; - my $forms = $tables->{$forms_name}; - - my $noroot = 0; - if ($#$cmd >= 4 && $cmd->[4]->{"value"} eq "noroot") { - $noroot = 1; - } + my ($table, $forms, $noroot, $config) = @_; my %new_table; foreach my $word (keys %$table) { @@ -603,8 +582,7 @@ sub expand_table { $new_table{$word} = $table->{$word}; } } - $tables->{$new_table_name} = \%new_table; - return 1; + return \%new_table; } # Check if the number and types of arguments given to a config command are right @@ -648,9 +626,19 @@ sub check_args { # $config_list - the list returned by parse_config sub interpret_config { my ($config_list, $args) = @_; + my %tables; my %config; + # table_paths stores a list of all table and replacement ids that are + # impacted by the path so the replacement can be added on the fly when + # a new replacement is added from the GUI + # the "replacement id" is just the number of the replacement group, + # starting at 0 with the first group in the config $config{"table_paths"} = {}; - $config{"tables"} = {}; + # a mapping between the table ids and tables for all tables used as + # ending tables in expand statements - so expansions can be done + # on the fly when adding a replacement word from the GUI + $config{"ending_tables"} = {}; + # ignore is the path to the ignore file, ignore_words the actual table $config{"ignore"} = ""; $config{"ignore_words"} = {}; $config{"split"} = "\\s"; @@ -658,12 +646,16 @@ sub interpret_config { $config{"afterword"} = "\\s"; $config{"tablesep"} = "\t"; $config{"choicesep"} = "\$"; + # a list of "replacement configs", which specify the type and any + # other arguments (this is given to replace_match, etc. $config{"replacements"} = []; - my %tmp_table_paths; + # these are temporary mappings used while loading the config + my %path_to_table; + my %table_id_to_path; my %mandatory_args = ( "ignore" => [$STRING], "table" => [$ID], - "expand" => [$ID, $ID, $ID], + "expand" => [$ID, $ID], "match" => [$STRING, $STRING], "matchignore" => [$STRING], "replace" => [$ID], @@ -688,17 +680,42 @@ sub interpret_config { return undef; } if ($cmd->[0]->{"value"} eq "table") { - my $table = load_table $cmd->[2]->{"value"}, $args, \%config; - if (!$table) { - return undef; + my $table_path = $cmd->[2]->{"value"}; + my $table; + # add to temporary path-to-table mapping so tables aren't + # loaded unnecessarily + if (exists $path_to_table{$table_path}) { + $table = $path_to_table{$table_path}; + } else { + $table = load_table $table_path, $args, \%config; + $path_to_table{$table_path} = $table; + return if !$table; } - $config{tables}->{$cmd->[1]->{"value"}} = $table; - $tmp_table_paths{$cmd->[1]->{"value"}} = $cmd->[2]->{"value"}; + my $table_id = $cmd->[1]->{"value"}; + $tables{$table_id} = $table; + $table_id_to_path{$table_id} = $table_path; } elsif ($cmd->[0]->{"value"} eq "expand") { - # FIXME: need to handle table paths when a new table name is used for the expansion - if (!expand_table($cmd, $config{tables}, \%config)) { - return undef; + my $orig_table_id = $cmd->[1]->{"value"}; + my $ending_table_id = $cmd->[2]->{"value"}; + my $noroot = 0; + if ($#$cmd >= 3 && $cmd->[3]->{"value"} eq "noroot") { + $noroot = 1; + } + if (!exists $tables{$orig_table_id}) { + warn "expand: table \"$orig_table_id\" doesn't exist\n"; + return; + } elsif (!exists $tables{$ending_table_id}) { + warn "expand: table \"$ending_table_id\" doesn't exist\n"; + return } + + $config{"ending_tables"}->{$ending_table_id} = $tables{$ending_table_id}; + $config{"expands"}->{$orig_table_id} = [] if !exists $config{"expands"}->{$orig_table_id}; + push @{$config{"expands"}->{$orig_table_id}}, [$ending_table_id, $noroot]; + + my $new_table = expand_table($tables{$orig_table_id}, $tables{$ending_table_id}, $noroot, \%config); + return if !defined $new_table; + $tables{$orig_table_id} = $new_table; } elsif ($cmd->[0]->{"value"} eq "group") { if ($in_group) { warn "ERROR: New group started without ending last one in config\n"; @@ -725,6 +742,7 @@ sub interpret_config { "search" => NFD($cmd->[1]->{"value"}), "replace" => $cmd->[2]->{"value"}}; for (3..$#$cmd) { + # add optional arguments as keys in replacement config $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; } } elsif ($cmd->[0]->{"value"} eq "matchignore") { @@ -742,17 +760,24 @@ sub interpret_config { return undef; } my $table = $cmd->[1]->{"value"}; - if (!exists($config{tables}->{$table})) { + if (!exists($tables{$table})) { warn "ERROR: nonexistent table \"$table\" in replace statement.\n"; return undef; } - if (exists($tmp_table_paths{$table})) { - $config{"table_paths"}->{$tmp_table_paths{$table}} = [$#{$config{"replacements"}}, $table]; - } + + # make a list of all replacements that are affected by this + # file so that they can be updated when a word is added + # through the gui + my $table_path = $table_id_to_path{$table}; + my $replacement_id = $#{$config{"replacements"}}; + $config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path}; + push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table]; + # Note: we don't need to check if $table{"choicesep"} was defined # here since we can't ever get this far without first having # loaded a table anyways - add_to_trie($table, $config{"replacements"}->[-1]->{"words"}, $config{tables}->{$table}, $args, \%config); + my $trie_root = $config{"replacements"}->[-1]->{"words"}; + add_to_trie($table, $trie_root, $tables{$table}, $args, \%config); } elsif ($cmd->[0]->{"value"} eq "split") { $config{"split"} = $cmd->[1]->{"value"}; } elsif ($cmd->[0]->{"value"} eq "beforeword") { @@ -857,9 +882,21 @@ sub handle_unknown_word_action { } print($fh $word . $config->{tablesep} . $replace_word . "\n"); close($fh); - my $replacement_id = $config->{"table_paths"}->{$table_path}->[0]; - my $trie = $config->{"replacements"}->[$replacement_id]->{"words"}; - add_to_trie($config->{"table_paths"}->{$table_path}->[1], $trie, {$word => $replace_word}, $args, $config); + # loop over all table ids that are impacted by this file + foreach my $replacement (@{$config->{"table_paths"}->{$table_path}}) { + my $replacement_id = $replacement->[0]; + my $table_id = $replacement->[1]; + my $trie = $config->{"replacements"}->[$replacement_id]->{"words"}; + my $final_table = {$word => $replace_word}; + # handle expansions + foreach my $expand (@{$config->{"expands"}->{$table_id}}) { + my $ending_table_id = $expand->[0]; + my $noroot = $expand->[1]; + my $endings_table = $config->{"ending_tables"}->{$ending_table_id}; + $final_table = expand_table $final_table, $endings_table, $noroot, $config; + } + add_to_trie($table_id, $trie, $final_table, $args, $config); + } return 1; } elsif ($action->[0] eq "reload") { my $new_config = load_config $args; @@ -1451,18 +1488,15 @@ pressed. The filtering for which table files are actually shown here is currently a bit rudimentary. First, all paths that are used in the -B<table> statements in the config are put into a list. Then, the -paths corresponding to any tables used for word endings in the -B<expand> statements are removed from the list, and that is what -is shown in this window. The reason for removing those paths from -the list is that it gets somewhat confusing when all the tables -that are only used for word endings are also in the list, and it -is very unlikely that an unknown word would need to be written to -one of those files. If necessary, the word can always be added -manually and the config reloaded. Note also that only actual -table paths are shown here, not the tables themselves - this is -not necessarily a one-to-one mapping since new tables can be -generated with the B<expand> statements. +B<table> statements in the config are put into a list. Then, only +the paths corresponding to table IDs actually used in B<replace> +statements are added to the list that gets shown in the GUI. The +reason for not showing all table paths in the list is that it gets +somewhat confusing when all the tables that are only used for word +endings are also in the list, and it is very unlikely that an +unknown word would need to be written to one of those files. If +necessary, the word can always be added manually and the config +reloaded. =item Reload config @@ -1570,6 +1604,17 @@ cannot be used as part of the replacement string in B<match> statements. Lookaheads and lookbehinds are fine, though, and could be useful in certain cases. +All tables must be loaded before they are used, or there will be an +error that the table does not exist. + +Warning: If a B<replace> statement is located before an B<expand> +statement that would have impacted the table used, there will be no +error but the expand statement won't have any impact. + +Basic rule of thumb: Always put the B<table> statements before the +B<expand> statements and the B<expand> statements before the B<replace> +statements. + =over 8 =item B<split> <regex string> @@ -1643,12 +1688,21 @@ original script first and the replacement word second. The replacement word can optionally have several parts separated by B<choicesep>, which will cause the user to be prompted to choose one of the options. -=item B<expand> <table identifier> <new table identifier> <word ending table> [noroot] +If, for whatever reason, the same table is needed twice, but with different endings, +the table can simply be loaded twice with different IDs. If the same path is loaded, +the table that has already been loaded will be reused. + +=item B<expand> <table identifier> <word ending table> [noroot] Expand the table C<< <table identifier> >>, i.e. generate all the word forms using the word endings in C<< <word ending table> >>, saving the result as a table with the -identifier C<< <new table identifier> >>. If C<same> is specified as C<< <new table -identifier> >>, the original C<< <table identifier> >> is used instead. +identifier C<< <new table identifier> >>. + +Note: There used to be a C<< <new table identifier> >> argument to create a new +table in case one table had to be expanded with different endings. This has been +removed because it was a bit ugly, especially since there wasn't a proper mapping +from table IDs to filenames anymore. If this functionality is needed, the same table +file can simply be loaded multiple times. See the B<table> section above. If B<noroot> is set, the root forms of the words are not kept. @@ -1660,10 +1714,6 @@ Note that each of the root words is also split into its choices (if necessary) during the expanding, so it is possible to use B<choicesep> in both the endings and root words. -Note that the paths of all tables used for C<< <word ending table> >> are removed from the -list of paths that is later shown in the unknown word window. See L</"UNKNOWN WORD -WINDOW"> for details. - =item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal] Perform a RegEx match using the given C<< <regex string> >>, replacing it with @@ -1701,6 +1751,28 @@ End a replacement group. =back +=head1 BUGS + +Although it may not seem like it, one of the ugliest parts of the program is the +GUI functionality that allows the user to add a replacement word. The problem is +that all information about the B<expand> and B<replace> statements has to be kept +in order to properly handle adding a word to one of the files and simultaneously +adding it to the currently loaded tables I<without reloading the entire config>. +The way it currently works, the replacement word is directly written to the file, +then all B<expand> statements that would have impacted the words from this file +are redone (just for the newly added word) and the resulting words are added to +the appropriate tables (or, technically, the appropriate 'trie'). Since a file +can be mapped to multiple table IDs and a table ID can occur in multiple replace +statements, this is more complicated than it sounds, and thus it is very likely +that there are bugs lurking here somewhere. Do note that "Reload config" will +B<always> reload the entire configuration, so that's safe to do even if the +on-the-fly replacing doesn't work. + +In general, I have tested the GUI code much less than the rest since you can't +really test it automatically very well. + +Tell me if you find any bugs. + =head1 SEE ALSO perlre, perlretut