transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit c3b7a4e83d73104c627e8e6eafaf9e17c47e51ce
parent 468067a8fdf246671997930166c26957b052d8cf
Author: lumidify <nobody@lumidify.org>
Date:   Mon,  6 Apr 2020 09:43:30 +0200

Add choiceoverride option

Diffstat:
M transliterate.pl | 47 ++++++++++++++++++++++++++++++++++++++---------
1 file changed, 38 insertions(+), 9 deletions(-)

diff --git a/transliterate.pl b/transliterate.pl @@ -248,6 +248,11 @@ sub prompt_choose_word { my @replacements; foreach (0..$#$substrings) { if ($substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) { + if (exists $config->{"choiceoverride"} && + exists $config->{"choiceoverride"}->{$substrings->[$_]->[1]}) { + $substrings->[$_]->[1] = $config->{"choiceoverride"}->{$substrings->[$_]->[1]}; + next; + } # This ugly bit of code is here as a special case for transliterating # Hindi to Urdu text - if there are *exactly* two choices and one # contains diacritics but the other one doesn't, the one with diacritics @@ -256,7 +261,6 @@ sub prompt_choose_word { my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1]; my @diacritics = @{$config->{"targetdiacritics"}}; if (@choices == 2) { - @choices = map {NFD($_)} @choices; my $first_matches = grep {$choices[0] =~ /$_/} @diacritics; my $second_matches = grep {$choices[1] =~ /$_/} @diacritics; if ($first_matches && !$second_matches) { @@ -370,7 +374,7 @@ sub prompt_choose_word { foreach my $choice (@choices) { $choice_nums{$choice} = 0; foreach my $diacritic (@{$config->{"targetdiacritics"}}) { - my @matches = NFD($choice) =~ /$diacritic/; + my @matches = $choice =~ /$diacritic/; $choice_nums{$choice} += scalar @matches if @matches; } } @@ -538,10 +542,10 @@ sub load_table { my $replacement; if ($revert) { $word = NFD $words[1]; - $replacement = $words[0]; + $replacement = NFD $words[0]; } else { $word = NFD $words[0]; - $replacement = $words[1]; + $replacement = NFD $words[1]; } my @word_choices = split /\Q$config->{choicesep}\E/, $word; foreach my $word_choice (@word_choices) { @@ -692,7 +696,8 @@ sub interpret_config { "group" => [], "endgroup" => [], "diacritics" => [$STRING], - "targetdiacritics" => [$STRING] + "targetdiacritics" => [$STRING], + "choiceoverride" => [$STRING] ); my $in_group = 0; foreach my $cmd (@$config_list) { @@ -718,7 +723,7 @@ sub interpret_config { $table = 
$path_to_table{$table_path}; } else { $table = load_table $table_path, $args, \%config, $table_args{"revert"}; - return if !$table; + return if !defined $table; $path_to_table{$table_path} = $table; } if ($table_args{"revert"}) { @@ -730,6 +735,15 @@ sub interpret_config { # this is a hash to avoid duplicates if the same file # is loaded multiple times $config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"}; + } elsif ($cmd_name eq "choiceoverride") { + my $table_path = $cmd->[1]->{"value"}; + my $table = load_table $table_path, $args, \%config; + return if !defined $table; + if (exists $config{"choiceoverride"}) { + warn "Duplicate specification of \"choiceoverride\" option.\n"; + return; + } + $config{"choiceoverride"} = $table; } elsif ($cmd_name eq "expand") { my $orig_table_id = $cmd->[1]->{"value"}; my $ending_table_id = $cmd->[2]->{"value"}; @@ -776,7 +790,7 @@ sub interpret_config { push @{$config{"replacements"}}, { "type" => "match", "search" => NFD($cmd->[1]->{"value"}), - "replace" => $cmd->[2]->{"value"}}; + "replace" => NFD($cmd->[2]->{"value"})}; for (3..$#$cmd) { # add optional arguments as keys in replacement config $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; @@ -834,7 +848,7 @@ sub interpret_config { } elsif ($cmd_name eq "ignore") { $config{"ignore"} = $cmd->[1]->{"value"}; my $table = load_ignore_table $cmd->[1]->{"value"}, $args; - return if !$table; + return if !defined $table; $config{"ignore_words"} = $table; } else { warn "ERROR: unknown command \"" . $cmd_name . "\" in config.\n"; @@ -1300,7 +1314,7 @@ sub replace { } foreach (@$substrings) { - print $outputfh $_->[1]; + print $outputfh NFC($_->[1]); } } close $fh; @@ -1796,6 +1810,21 @@ cannot currently think of any reason why someone would want to load a file both with and without B<revert> in the same config, but I still wanted to add this warning just in case. 
+=item B<choiceoverride> <table path> + +Reads the mapping in the table at C<< <table path> >> and uses it to override +the choice mechanism. + +The table contains a mapping of choices (separated by B<choicesep>) to single +replacements. This was added to help in Urdu<->Hindi transliteration with the +same database, since sometimes words with and without diacritics that actually +mean the same thing are added for one direction but should default to one of +them in the other direction. + +Note that this does not sort the choices before comparison and they have to +be matched exactly, so when a new choice is added, that needs to be added to +this mapping as well, in exactly the same order. + =item B<expand> <table identifier> <word ending table> [noroot] Expand the table C<< <table identifier> >>, i.e. generate all the word forms using