transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit 9766b48d4a2f3e1350b3d9223148f95c437fc78f
parent be38d9c29ac0a5622e4a7e1a2c7ee10d50b37c84
Author: lumidify <nobody@lumidify.org>
Date:   Tue,  7 Apr 2020 13:21:22 +0200

Replace diacritics with retrywithout

Diffstat:
M tests/test5/config | 5 +++--
M tests/test5/input.txt | 1 +
M transliterate.pl | 121 +++++++++++++++++++++++++++++++++++++++++--------------------------------------
3 files changed, 67 insertions(+), 60 deletions(-)

diff --git a/tests/test5/config b/tests/test5/config @@ -10,11 +10,12 @@ table endings "../data/endings_choices.txt" expand words endings -match "w ord" "word" beginword nofinal +#match "w ord" "word" beginword nofinal matchignore "-d" endword group beginword endword replace words endgroup -diacritics "̈" +retrywithout diacritics "̈" +retrywithout space " " diff --git a/tests/test5/input.txt b/tests/test5/input.txt @@ -1,4 +1,5 @@ ignore +w ord0 wörd0 wörd0 word1end1 wörd1end1 -dword9end2 word9end2-d diff --git a/transliterate.pl b/transliterate.pl @@ -95,15 +95,15 @@ sub prompt_unknown_word { $window->signal_connect(destroy => sub { Gtk2->main_quit; }); $window->set_border_width(10); - my $vbox = Gtk2::VBox->new(FALSE, 0); + my $vbox = Gtk2::VBox->new(FALSE, 10); my $linelabel = Gtk2::Label->new("Current line: $cur_lineno"); - $vbox->pack_start($linelabel, FALSE, FALSE, 10); + $vbox->pack_start($linelabel, FALSE, FALSE, 0); $linelabel->show; my $wordlabel = Gtk2::Label->new("Word not found: $word"); $wordlabel->set_alignment(0.0, 0.0); - $vbox->pack_start($wordlabel, FALSE, FALSE, 10); + $vbox->pack_start($wordlabel, FALSE, FALSE, 0); $wordlabel->show; # Make a text box with the given left and right context and label @@ -112,7 +112,7 @@ sub prompt_unknown_word { # of the entire word that was not found has to be replaced my $make_context_box = sub { my ($ctxtl, $ctxtr, $lbl) = @_; - my $hbox = Gtk2::HBox->new(FALSE, 0); + my $hbox = Gtk2::HBox->new(FALSE, 5); my $label = Gtk2::Label->new($lbl); my $text = Gtk2::TextView->new; $text->set_wrap_mode("word"); @@ -132,16 +132,16 @@ sub prompt_unknown_word { } }, $window); $hbox->pack_start($label, FALSE, FALSE, 0); - $hbox->pack_start($text, TRUE, TRUE, 10); - $vbox->pack_start($hbox, FALSE, FALSE, 10); - $hbox = Gtk2::HBox->new(FALSE, 0); + $hbox->pack_start($text, TRUE, TRUE, 0); + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox = Gtk2::HBox->new(FALSE, 5); $hbox->pack_start($button, FALSE, FALSE, 0); 
$vbox->pack_start($hbox, FALSE, FALSE, 0); }; $make_context_box->($contextl, $contextr, "Context: "); $make_context_box->($contextl_orig, $contextr_orig, "Original: "); - my $hbox = Gtk2::HBox->new(FALSE, 0); + my $hbox = Gtk2::HBox->new(FALSE, 5); my $label = Gtk2::Label->new("Ignore: "); $hbox->pack_start($label, FALSE, FALSE, 0); my $button = Gtk2::Button->new("This run"); @@ -157,14 +157,14 @@ sub prompt_unknown_word { $action = ["ignore", "permanent", $word]; $window->destroy; }, $window); - $hbox->pack_start($button, FALSE, FALSE, 5); - $vbox->pack_start($hbox, FALSE, FALSE, 10); + $hbox->pack_start($button, FALSE, FALSE, 0); + $vbox->pack_start($hbox, FALSE, FALSE, 0); # AHHHH! IT BURNS!!! THE CODE IS SO HORRIBLE! # Take note, kids - this is what happens when you keep adding # features without rethinking your basic design. - $hbox = Gtk2::HBox->new(FALSE, 0); + $hbox = Gtk2::HBox->new(FALSE, 5); $label = Gtk2::Label->new("Add to list: "); $hbox->pack_start($label, FALSE, FALSE, 0); my $path_list = Gtk2::ComboBox->new_text; @@ -172,25 +172,25 @@ sub prompt_unknown_word { $path_list->append_text($path); } $hbox->pack_start($path_list, FALSE, FALSE, 0); - $vbox->pack_start($hbox, FALSE, FALSE, 10); + $vbox->pack_start($hbox, FALSE, FALSE, 0); - $hbox = Gtk2::HBox->new(FALSE, 0); + $hbox = Gtk2::HBox->new(FALSE, 5); $label = Gtk2::Label->new("Replacement: "); $hbox->pack_start($label, FALSE, FALSE, 0); my $replace_entry = Gtk2::Entry->new; $hbox->pack_start($replace_entry, TRUE, TRUE, 0); $vbox->pack_start($hbox, FALSE, FALSE, 0); - $hbox = Gtk2::HBox->new(FALSE, 0); - if (exists $config->{"diacritics"}) { - $button = Gtk2::Button->new("Retry without diacritics"); + $hbox = Gtk2::HBox->new(FALSE, 5); + foreach my $without (@{$config->{"retrywithout"}}) { + $button = Gtk2::Button->new("Retry without $without->[0]"); $button->signal_connect( clicked => sub { - my $stripped = replace_strip_diacritics($config, $word); + my @chars = @{$without}[1..$#$without]; + my 
$stripped = replace_strip_chars($config, \@chars, $word); + # recombine substrings my $repl_text = ""; - foreach (@$stripped) { - $repl_text .= $_->[1]; - } + $repl_text .= $_->[1] foreach @$stripped; $replace_entry->set_text($repl_text); }, $window); $hbox->pack_start($button, FALSE, FALSE, 0); @@ -204,10 +204,10 @@ sub prompt_unknown_word { $window->destroy; } }, $window); - $hbox->pack_start($button, FALSE, FALSE, 5); - $vbox->pack_start($hbox, FALSE, FALSE, 5); + $hbox->pack_start($button, FALSE, FALSE, 0); + $vbox->pack_start($hbox, FALSE, FALSE, 0); - $hbox = Gtk2::HBox->new(FALSE, 0); + $hbox = Gtk2::HBox->new(FALSE, 5); $button = Gtk2::Button->new("Stop processing"); $button->signal_connect( clicked => sub { @@ -222,13 +222,13 @@ sub prompt_unknown_word { $action = ["reload"]; $window->destroy; }, $window); - $hbox->pack_start($button, FALSE, FALSE, 5); + $hbox->pack_start($button, FALSE, FALSE, 0); if ($config_error) { $label = Gtk2::Label->new("Error loading config; see terminal output for details"); $hbox->pack_start($label, FALSE, FALSE, 0); } - $vbox->pack_start($hbox, FALSE, FALSE, 5); + $vbox->pack_start($hbox, FALSE, FALSE, 0); $window->add($vbox); $window->show_all; @@ -683,6 +683,7 @@ sub interpret_config { # a list of "replacement configs", which specify the type and any # other arguments (this is given to replace_match, etc. 
$config{"replacements"} = []; + $config{"retrywithout"} = []; # these are temporary mappings used while loading the config my %path_to_table; my %table_id_to_path; @@ -700,7 +701,7 @@ sub interpret_config { "choicesep" => [$STRING], "group" => [], "endgroup" => [], - "diacritics" => [$STRING], + "retrywithout" => [$ID, $STRING], "targetdiacritics" => [$STRING] ); my $in_group = 0; @@ -824,7 +825,11 @@ sub interpret_config { my $trie_root = $config{"replacements"}->[-1]->{"words"}; my $override = $#$cmd >= 2 && $cmd->[2]->{"value"} eq "override"; add_to_trie($table, $trie_root, $tables{$table}, $args, \%config, $override); - } elsif ($cmd_name eq "diacritics" || $cmd_name eq "targetdiacritics") { + } elsif ($cmd_name eq "retrywithout") { + # first value is the display name + my @values = map {$_->{"value"}} @{$cmd}[1..$#$cmd]; + push @{$config{"retrywithout"}}, \@values; + } elsif ($cmd_name eq "targetdiacritics") { if (!exists $config{$cmd_name}) { $config{$cmd_name} = []; } @@ -1178,11 +1183,11 @@ sub replace_group { } # Perform all replacements on $word, first removing all -# diacritics specified in the config -sub replace_strip_diacritics { - my ($config, $word) = @_; - foreach my $diacritic (@{$config->{"diacritics"}}) { - $word =~ s/$diacritic//g; +# characters specified in $chars +sub replace_strip_chars { + my ($config, $chars, $word) = @_; + foreach my $char (@$chars) { + $word =~ s/\Q$char\E//g; } return replace_line($config, $word); } @@ -1543,9 +1548,10 @@ The possible actions are: "Permanently" saves the word in the ignore file specified in the configuration. -=item Retry without diacritics +=item Retry without <display name> -Removes all diacritics specified in the L<config|/"CONFIGURATION"> +Removes all characters specified in the corresponding B<retrywithout> +statement in the L<config|/"CONFIGURATION"> from the currently selected word and re-transliterates just that word. 
The result is then pasted into the text box beside "Add replacement" so it can be added to a table. This is only a @@ -1555,7 +1561,7 @@ diacritics is already in the tables, this button can be used to quickly find the transliteration instead of having to type it out again. Any part of the word that couldn't be transliterated is just pasted verbatim into the text box (but after the -diacritics have been removed). +characters have been removed). Note that the selection can still be modified after this, before pressing "Add to list". This could potentially be useful if a word @@ -1566,8 +1572,8 @@ but only the stem should be added to the list. If that is the case, selected, but the ending could be removed before actually pressing "Add to list". -This is only shown if there are any diacritics specified in the -config file. +A separate button is shown for every B<retrywithout> statement +in the config. =item Add to list @@ -1814,20 +1820,6 @@ cannot currently think of any reason why someone would want to load a file both with and without B<revert> in the same config, but I still wanted to add this warning just in case. -Like B<targetdiacritics>, this has no effect if B<--nochoices> is set. - -To clarify the order in which choices are added (if they are not explicitly -specified): - -When a word has multiple replacements at different places in the same table -file, the later occurrence is appended to the earlier one. -When a word has multiple replacements in different tables that are replaced -within the same B<group>, the occurrence in the later B<replace> statement -is appended to the earlier one. - -The special sorting by number of diacritics when B<targetdiacritics> is -specificed does not affect the order used to find replacements in B<choiceoverride>. - =item B<expand> <table identifier> <word ending table> [noroot] Expand the table C<< <table identifier> >>, i.e. 
generate all the word forms using @@ -1894,18 +1886,31 @@ Note that a table must have been loaded before being used in a B<replace> statem End a replacement group. -=item B<diacritics> <diacritic> [...] +=item B<retrywithout> <display name> <character> [...] -Adds the given list of diacritics to the list of diacritics that will be removed -from a word when "Retry without diacritics" is pressed in the -L<unknown word window|/"UNKNOWN WORD WINDOW">. +Adds a button to the L<unknown word window|/"UNKNOWN WORD WINDOW"> to retry the +replacements on the selected word, first removing the given characters. +The button is named "Retry without <display name>". Whatever is found with the +replacements is pasted into the regular text box for the "Add replacement" +functionality. + +This can be used as an aid when, for instance, words can be written with or without +certain diacritics. If the actual word without diacritics is already in the +database and there is a B<retrywithout> statement for all the diacritics, the +button can be used to quickly find the replacement for the word instead of having +to type it out manually. The same goes for compound words that can be written +with or without a space. Note that all input text is first normalized to the unicode canonical decomposition form so that diacritics can be removed individually. -There are quite advanced Unicode algorithms that could be used to compare words -while ignoring diacritics, but I do not know if it would be possible to use any -of those with the current way this engine works. +Also note that all buttons are currently just dumped in the GUI without any +sort of wrapping, so they'll run off the screen if there are too many. +Tell me if this becomes a problem. I'm just too lazy to change it right now. + +Historical note: This was called B<diacritics> in a previous version and only +allowed removal of diacritics. 
This is exactly the same functionality, just +generalized to allow removal of any characters with different buttons. =item B<targetdiacritics> <diacritic> [...]