transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit 92c2f7acbbef5624803f1d296dce093edbe13dd3
parent f5414cb7c164023b787fa56d4ab59313df8961e1
Author: lumidify <nobody@lumidify.org>
Date:   Thu,  2 Apr 2020 10:10:55 +0200

Add targetdiacritics option

Diffstat:
M transliterate.pl | 35 ++++++++++++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/transliterate.pl b/transliterate.pl @@ -379,6 +379,19 @@ sub prompt_choose_word { my $word = $replacements[$cur_replacement]->[1]; $wordlabel->set_text("Word \"$word\" has multiple replacement options:"); my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1]; + if (exists $config->{"targetdiacritics"}) { + # This nasty bit of code finds the number of diacritics in every + # choice and sorts the choice in descending order based on that + my %choice_nums; + foreach my $choice (@choices) { + $choice_nums{$choice} = 0; + foreach my $diacritic (@{$config->{"targetdiacritics"}}) { + my @matches = NFD($choice) =~ /$diacritic/; + $choice_nums{$choice} += scalar @matches if @matches; + } + } + @choices = sort {$choice_nums{$b} <=> $choice_nums{$a}} @choices; + } foreach my $word_choice (@choices) { my $button = Gtk2::Button->new($word_choice); $button->signal_connect( @@ -698,7 +711,8 @@ sub interpret_config { "choicesep" => [$STRING], "group" => [], "endgroup" => [], - "diacritics" => [$STRING] + "diacritics" => [$STRING], + "targetdiacritics" => [$STRING] ); my $in_group = 0; foreach my $cmd (@$config_list) { @@ -825,6 +839,13 @@ sub interpret_config { foreach (1..$#$cmd) { push @{$config{"diacritics"}}, $cmd->[$_]->{"value"}; } + } elsif ($cmd->[0]->{"value"} eq "targetdiacritics") { + if (!exists $config{"targetdiacritics"}) { + $config{"targetdiacritics"} = []; + } + foreach (1..$#$cmd) { + push @{$config{"targetdiacritics"}}, $cmd->[$_]->{"value"}; + } } elsif ($cmd->[0]->{"value"} eq "split") { $config{"split"} = $cmd->[1]->{"value"}; } elsif ($cmd->[0]->{"value"} eq "beforeword") { @@ -1858,6 +1879,18 @@ There are quite advanced Unicode algorithms that could be used to compare words while ignoring diacritics, but I do not know if it would be possible to use any of those with the current way this engine works. +=item B<targetdiacritics> <diacritic> [...] 
+ +This was only added to simplify transliteration from Hindi to Urdu with the +same database. When this is set, the choices in the +L<word choice window|/"WORD CHOICE WINDOW"> are sorted in descending order +based on the number of diacritics from this list that are matched in each +choice. This is so that when transliterating from Hindi to Urdu, the choice +with the most diacritics is always at the top. + +The attentive reader will notice at this point that most of the features +in this program were added specifically for dealing with Urdu and Hindi. + =back =head1 BUGS