commit c3b7a4e83d73104c627e8e6eafaf9e17c47e51ce
parent 468067a8fdf246671997930166c26957b052d8cf
Author: lumidify <nobody@lumidify.org>
Date: Mon, 6 Apr 2020 09:43:30 +0200
Add choiceoverride option
Diffstat:
1 file changed, 38 insertions(+), 9 deletions(-)
diff --git a/transliterate.pl b/transliterate.pl
@@ -248,6 +248,11 @@ sub prompt_choose_word {
my @replacements;
foreach (0..$#$substrings) {
if ($substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
+ if (exists $config->{"choiceoverride"} &&
+ exists $config->{"choiceoverride"}->{$substrings->[$_]->[1]}) {
+ $substrings->[$_]->[1] = $config->{"choiceoverride"}->{$substrings->[$_]->[1]};
+ next;
+ }
# This ugly bit of code is here as a special case for transliterating
# Hindi to Urdu text - if there are *exactly* two choices and one
# contains diacritics but the other one doesn't, the one with diacritics
@@ -256,7 +261,6 @@ sub prompt_choose_word {
my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1];
my @diacritics = @{$config->{"targetdiacritics"}};
if (@choices == 2) {
- @choices = map {NFD($_)} @choices;
my $first_matches = grep {$choices[0] =~ /$_/} @diacritics;
my $second_matches = grep {$choices[1] =~ /$_/} @diacritics;
if ($first_matches && !$second_matches) {
@@ -370,7 +374,7 @@ sub prompt_choose_word {
foreach my $choice (@choices) {
$choice_nums{$choice} = 0;
foreach my $diacritic (@{$config->{"targetdiacritics"}}) {
- my @matches = NFD($choice) =~ /$diacritic/;
+ my @matches = $choice =~ /$diacritic/;
$choice_nums{$choice} += scalar @matches if @matches;
}
}
@@ -538,10 +542,10 @@ sub load_table {
my $replacement;
if ($revert) {
$word = NFD $words[1];
- $replacement = $words[0];
+ $replacement = NFD $words[0];
} else {
$word = NFD $words[0];
- $replacement = $words[1];
+ $replacement = NFD $words[1];
}
my @word_choices = split /\Q$config->{choicesep}\E/, $word;
foreach my $word_choice (@word_choices) {
@@ -692,7 +696,8 @@ sub interpret_config {
"group" => [],
"endgroup" => [],
"diacritics" => [$STRING],
- "targetdiacritics" => [$STRING]
+ "targetdiacritics" => [$STRING],
+ "choiceoverride" => [$STRING]
);
my $in_group = 0;
foreach my $cmd (@$config_list) {
@@ -718,7 +723,7 @@ sub interpret_config {
$table = $path_to_table{$table_path};
} else {
$table = load_table $table_path, $args, \%config, $table_args{"revert"};
- return if !$table;
+ return if !defined $table;
$path_to_table{$table_path} = $table;
}
if ($table_args{"revert"}) {
@@ -730,6 +735,15 @@ sub interpret_config {
# this is a hash to avoid duplicates if the same file
# is loaded multiple times
$config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"};
+ } elsif ($cmd_name eq "choiceoverride") {
+ my $table_path = $cmd->[1]->{"value"};
+ my $table = load_table $table_path, $args, \%config;
+ return if !defined $table;
+ if (exists $config{"choiceoverride"}) {
+ warn "Duplicate specification of \"choiceoverride\" option.\n";
+ return;
+ }
+ $config{"choiceoverride"} = $table;
} elsif ($cmd_name eq "expand") {
my $orig_table_id = $cmd->[1]->{"value"};
my $ending_table_id = $cmd->[2]->{"value"};
@@ -776,7 +790,7 @@ sub interpret_config {
push @{$config{"replacements"}}, {
"type" => "match",
"search" => NFD($cmd->[1]->{"value"}),
- "replace" => $cmd->[2]->{"value"}};
+ "replace" => NFD($cmd->[2]->{"value"})};
for (3..$#$cmd) {
# add optional arguments as keys in replacement config
$config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
@@ -834,7 +848,7 @@ sub interpret_config {
} elsif ($cmd_name eq "ignore") {
$config{"ignore"} = $cmd->[1]->{"value"};
my $table = load_ignore_table $cmd->[1]->{"value"}, $args;
- return if !$table;
+ return if !defined $table;
$config{"ignore_words"} = $table;
} else {
warn "ERROR: unknown command \"" . $cmd_name . "\" in config.\n";
@@ -1300,7 +1314,7 @@ sub replace {
}
foreach (@$substrings) {
- print $outputfh $_->[1];
+ print $outputfh NFC($_->[1]);
}
}
close $fh;
@@ -1796,6 +1810,21 @@ cannot currently think of any reason why someone would want to load a file both
with and without B<revert> in the same config, but I still wanted to add this
warning just in case.
+=item B<choiceoverride> <table path>
+
+Reads the mapping in the table at C<< <table path> >> and uses it to override
+the choice mechanism.
+
+The table maps lists of choices (separated by B<choicesep>) to a single
+replacement each. This was added to help with Urdu<->Hindi transliteration
+using the same database for both directions: sometimes words with and without
+diacritics that actually mean the same thing are both added for one direction,
+but the other direction should simply default to one of them.
+
+Note that the choices are not sorted before comparison and have to match
+exactly, so whenever a new choice is added to a word, it also has to be added
+to that word's entry in this mapping, in exactly the same order.
+
=item B<expand> <table identifier> <word ending table> [noroot]
Expand the table C<< <table identifier> >>, i.e. generate all the word forms using