transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit be38d9c29ac0a5622e4a7e1a2c7ee10d50b37c84
parent 245d58622d19538e858392c22c9230e4b018962e
Author: lumidify <nobody@lumidify.org>
Date:   Mon,  6 Apr 2020 18:01:22 +0200

Remove choiceoverride and add override option to replace statements

Diffstat:
M tests/data/override.txt | 2 +-
M tests/test6/config | 3 ++-
M transliterate.pl | 79 ++++++++++++++++++++++++++++++++-----------------------------------------------
3 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/tests/data/override.txt b/tests/data/override.txt @@ -1 +1 @@ -word0_replaced$word0_replaced2 word0_replaced2 +word0 word0_replaced2 diff --git a/tests/test6/config b/tests/test6/config @@ -5,7 +5,7 @@ afterword "[ \n]" ignore "../data/ignore.txt" table words "../data/words.txt" table endings "../data/endings.txt" -choiceoverride "../data/override.txt" +table overrides "../data/override.txt" expand words endings @@ -13,4 +13,5 @@ match "\d+" "num_replaced" beginword group beginword endword replace words +replace overrides override endgroup diff --git a/transliterate.pl b/transliterate.pl @@ -36,7 +36,7 @@ sub get_unique_words { # Adds all words in $words to $trie # Automatically combines duplicate words with "$config->{choicesep}" inbetween sub add_to_trie { - my ($table_name, $trie, $words, $args, $config) = @_; + my ($table_name, $trie, $words, $args, $config, $override) = @_; foreach my $word (keys %$words) { my $cur_node = $trie; foreach my $char (split //, $word) { @@ -52,6 +52,10 @@ sub add_to_trie { $cur_node = $cur_node->{$char}; } if (exists($cur_node->{"final"})) { + if ($override) { + $cur_node->{"final"} = $words->{$word}; + next; + } if ($args->{"checkduplicates"}) { warn "WARNING: Duplicate word \"$word\". Last occurrence as " . "\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " . 
@@ -251,11 +255,6 @@ sub prompt_choose_word { my @replacements; foreach (0..$#$substrings) { if ($substrings->[$_]->[0] && $substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) { - if (exists $config->{"choiceoverride"} && - exists $config->{"choiceoverride"}->{$substrings->[$_]->[1]}) { - $substrings->[$_]->[1] = $config->{"choiceoverride"}->{$substrings->[$_]->[1]}; - next; - } # This ugly bit of code is here as a special case for transliterating # Hindi to Urdu text - if there are *exactly* two choices and one # contains diacritics but the other one doesn't, the one with diacritics @@ -264,6 +263,7 @@ sub prompt_choose_word { my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1]; my @diacritics = @{$config->{"targetdiacritics"}}; if (@choices == 2) { + @choices = map {NFD($_)} @choices; my $first_matches = grep {$choices[0] =~ /$_/} @diacritics; my $second_matches = grep {$choices[1] =~ /$_/} @diacritics; if ($first_matches && !$second_matches) { @@ -377,7 +377,7 @@ sub prompt_choose_word { foreach my $choice (@choices) { $choice_nums{$choice} = 0; foreach my $diacritic (@{$config->{"targetdiacritics"}}) { - my @matches = $choice =~ /$diacritic/; + my @matches = NFD($choice) =~ /$diacritic/; $choice_nums{$choice} += scalar @matches if @matches; } } @@ -530,7 +530,7 @@ sub open_file_rel_abs { # Load a file of replacement words into a hash table sub load_table { - my ($filename, $args, $config, $src_verbatim, $revert) = @_; + my ($filename, $args, $config, $revert) = @_; my $fh = open_file_rel_abs $filename, $args->{"config"}; return if !$fh; my %table; @@ -547,12 +547,12 @@ sub load_table { my $replacement; if ($revert) { $word = NFD $words[1]; - $replacement = NFD $words[0]; + $replacement = $words[0]; } else { $word = NFD $words[0]; - $replacement = NFD $words[1]; + $replacement = $words[1]; } - my @word_choices = $src_verbatim ? 
($word) : split /\Q$config->{choicesep}\E/, $word; + my @word_choices = split /\Q$config->{choicesep}\E/, $word; foreach my $word_choice (@word_choices) { if (exists $table{$word_choice}) { if ($args->{"checkduplicates"}) { @@ -701,8 +701,7 @@ sub interpret_config { "group" => [], "endgroup" => [], "diacritics" => [$STRING], - "targetdiacritics" => [$STRING], - "choiceoverride" => [$STRING] + "targetdiacritics" => [$STRING] ); my $in_group = 0; foreach my $cmd (@$config_list) { @@ -727,7 +726,7 @@ sub interpret_config { if (exists $path_to_table{$table_path}) { $table = $path_to_table{$table_path}; } else { - $table = load_table $table_path, $args, \%config, 0, $table_args{"revert"}; + $table = load_table $table_path, $args, \%config, $table_args{"revert"}; return if !defined $table; $path_to_table{$table_path} = $table; } @@ -740,18 +739,6 @@ sub interpret_config { # this is a hash to avoid duplicates if the same file # is loaded multiple times $config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"}; - } elsif ($cmd_name eq "choiceoverride") { - my $table_path = $cmd->[1]->{"value"}; - # argument $src_verbatim to load_table forces it to take the entire - # source word without splitting it up with "choicesep", since here, - # we explicitly want to replace multiple choices with one - my $table = load_table $table_path, $args, \%config, 1; - return if !defined $table; - if (exists $config{"choiceoverride"}) { - warn "Duplicate specification of \"choiceoverride\" option.\n"; - return; - } - $config{"choiceoverride"} = $table; } elsif ($cmd_name eq "expand") { my $orig_table_id = $cmd->[1]->{"value"}; my $ending_table_id = $cmd->[2]->{"value"}; @@ -798,7 +785,7 @@ sub interpret_config { push @{$config{"replacements"}}, { "type" => "match", "search" => NFD($cmd->[1]->{"value"}), - "replace" => NFD($cmd->[2]->{"value"})}; + "replace" => $cmd->[2]->{"value"}}; for (3..$#$cmd) { # add optional arguments as keys in replacement config 
$config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; @@ -835,7 +822,8 @@ sub interpret_config { # here since we can't ever get this far without first having # loaded a table anyways my $trie_root = $config{"replacements"}->[-1]->{"words"}; - add_to_trie($table, $trie_root, $tables{$table}, $args, \%config); + my $override = $#$cmd >= 2 && $cmd->[2]->{"value"} eq "override"; + add_to_trie($table, $trie_root, $tables{$table}, $args, \%config, $override); } elsif ($cmd_name eq "diacritics" || $cmd_name eq "targetdiacritics") { if (!exists $config{$cmd_name}) { $config{$cmd_name} = []; @@ -1322,7 +1310,7 @@ sub replace { } foreach (@$substrings) { - print $outputfh NFC($_->[1]); + print $outputfh $_->[1]; } } close $fh; @@ -1805,6 +1793,9 @@ replacement word can optionally have several parts separated by B<choicesep>. If original word has multiple parts, it is separated and each of the parts is added to the table with the replacement. If the replacement has multiple parts, the user will be prompted to choose one of the options during the transliteration process. +If the same word occurs multiple times in the same table with different replacements, +the replacements are automatically added as choices that will be handled by the +L<word choice window|/"WORD CHOICE WINDOW">. If, for whatever reason, the same table is needed twice, but with different endings, the table can simply be loaded twice with different IDs. If the same path is loaded, @@ -1823,21 +1814,6 @@ cannot currently think of any reason why someone would want to load a file both with and without B<revert> in the same config, but I still wanted to add this warning just in case. -=item B<choiceoverride> <table path> - -Reads the mapping in the table at C<< <table path> >> and uses it to override -the choice mechanism. - -The table contains a mapping of choices (separated by B<choicesep>) to single -replacements. 
This was added to help in Urdu<->Hindi transliteration with the -same database, since sometimes words with and without diacritics that actually -mean the same thing are added for one direction but should default to one of -them in the other direction. - -Note that this does not sort the choices before comparison and they have to -be matched exactly, so when a new choice is added, that needs to be added to -this mapping as well, in exactly the same order. - Like B<targetdiacritics>, this has no effect if B<--nochoices> is set. To clarify the order in which choices are added (if they are not explicitly @@ -1897,13 +1873,22 @@ and B<endgroup>, since they are then grouped together and replaced in one go. B<beginword> and B<endword> act in the same way as specified for B<match> and apply to all B<replace> statements in this group. -=item B<replace> <table identifier> +=item B<replace> <table identifier> [override] Replace all words in the table with the identifier C<< <table identifier> >>, using the B<beginword> and B<endword> settings specified by the current group. -Note that a table must have been loaded (or generated using B<expand>) -before being used in a B<replace> statement. +Unless B<override> is set on the latter table, if the same word occurs in two +tables with different replacements, both are automatically added as choices. +See L</"WORD CHOICE WINDOW">. + +B<override> can be useful if the same database is used for both directions and +one direction maps multiple words to one word, but in the other direction this +word should always default to one of the choices. In that case, a small table +with these special cases can be created and put at the end of the main B<group> +statement with B<override> set. + +Note that a table must have been loaded before being used in a B<replace> statement. =item B<endgroup>