transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit 229f872be7b4a68a74d1c9924845438968190809
parent 49357ed325d344e830564abdaacb61976e1fca00
Author: lumidify <nobody@lumidify.org>
Date:   Tue, 31 Mar 2020 17:40:10 +0200

Modify diacritic handling

Diffstat:
Mtests/data/words1.txt | 1+
Mtransliterate.pl | 78+++++++++++++++++++++++++++++++++++++-----------------------------------------
2 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/tests/data/words1.txt b/tests/data/words1.txt @@ -8,3 +8,4 @@ word6,word6_replaced word7,word7_replaced word8,word8_replaced word9,word9_replaced +wörd0,word0_replaced|word0_replaced2 diff --git a/transliterate.pl b/transliterate.pl @@ -162,33 +162,45 @@ sub prompt_unknown_word { $vbox->pack_start($hbox, FALSE, FALSE, 10); $hbox->show; + # AHHHH! IT BURNS!!! THE CODE IS SO HORRIBLE! + # Take note, kids - this is what happens when you keep adding + # features without rethinking your basic design. + + # declare this here so it can already be used + my $path_list; if (exists $config->{"diacritics"}) { $hbox = Gtk2::HBox->new(FALSE, 0); - my $no_diacritic_label; - my $stripped; - my $accept_button = Gtk2::Button->new("Accept?"); + my $orig_entry; + my $repl_entry; + my $accept_button = Gtk2::Button->new("Add to table"); $accept_button->signal_connect( clicked => sub { - $action = ["replace", @$stripped]; - $window->destroy; + if ($path_list->get_active != -1) { + $action = ["add", $orig_entry->get_text, $repl_entry->get_text, $path_list->get_active_text]; + $window->destroy; + } }, $window); $button = Gtk2::Button->new("Retry without diacritics"); $button->signal_connect( clicked => sub { - $stripped = replace_strip_diacritics($config, $word); - my $tmp = ""; + my $stripped = replace_strip_diacritics($config, $word); + my $repl_text = ""; foreach (@$stripped) { - $tmp .= $_->[1]; + $repl_text .= $_->[1]; } - $no_diacritic_label->set_text($tmp); + $repl_entry->set_text($repl_text); + $orig_entry->set_text($word); + $repl_entry->show; + $orig_entry->show; $accept_button->show; }, $window); $hbox->pack_start($button, FALSE, FALSE, 0); $button->show; $hbox->pack_start($accept_button, FALSE, FALSE, 0); - $no_diacritic_label = Gtk2::Label->new(""); - $hbox->pack_start($no_diacritic_label, FALSE, FALSE, 10); - $no_diacritic_label->show; + $orig_entry = Gtk2::Entry->new; + $repl_entry = Gtk2::Entry->new; + $hbox->pack_start($orig_entry, TRUE, TRUE, 10); + $hbox->pack_start($repl_entry, TRUE, TRUE, 10); $vbox->pack_start($hbox, FALSE, FALSE, 0); $hbox->show; } @@ -197,7 +209,7 @@ sub prompt_unknown_word { $label = Gtk2::Label->new("Add to list: "); $hbox->pack_start($label, FALSE, FALSE, 0); $label->show; - my $path_list = Gtk2::ComboBox->new_text; + $path_list = Gtk2::ComboBox->new_text; foreach my $path (sort keys %{$config->{"display_tables"}}) { $path_list->append_text($path); } @@ -896,7 +908,7 @@ sub load_config { # 1 - the current line needs to be re-transliterated with the new config # 2 - an error occurred while reloading the config sub handle_unknown_word_action { - my ($substrings, $index, $action, $config, $args) = @_; + my ($action, $config, $args) = @_; if ($action->[0] eq "ignore") { $config->{"ignore_words"}->{$action->[2]} = ""; if ($action->[1] eq "permanent") { @@ -954,10 +966,7 @@ sub handle_unknown_word_action { } else { return 2; } - } elsif ($action->[0] eq "replace") { - splice @$substrings, $index, 1, @{$action}[1..$#$action]; } - return 0; } # Split $substrings based on the "split" regex in $config. @@ -1226,14 +1235,11 @@ sub replace_line { # $args - the command line args # $cur_lineno - display string to show the user the current line number # Returns: -# -1 - all done -# anything else - the substrings must be replaced again starting at the returned index +# 1 - the line needs to be re-transliterated +# 0 - all done sub get_unknown_words { my ($substrings, $config, $args, $cur_lineno) = @_; - my $i = 0; - # this is done so $substrings can be modified during the loop - # (instead of just foreach) - while ($i <= $#$substrings) { + foreach my $i (0 .. $#$substrings) { my $word = $substrings->[$i]; if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) { my $contextl = ""; @@ -1254,23 +1260,21 @@ sub get_unknown_words { ); # if $ret == 2, config could not be loaded # if $ret == 1, line must be redone with new config - my $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args); + my $ret = handle_unknown_word_action($action, $config, $args); # keep retrying until the user chooses an action which # didn't throw an error while ($ret == 2) { $action = prompt_unknown_word($contextl, $contextl_orig, $word->[1], $contextr, $contextr_orig, $config, "$cur_lineno", 1); - $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args); + $ret = handle_unknown_word_action($action, $config, $args); } # re-transliterate the line with the new config - if ($ret == 1) { - return $i; - } + return 1 if $ret == 1; } $i++; } - return -1; + return 0; } # Main replacement function @@ -1292,18 +1296,8 @@ sub replace { if (!$args->{"nounknowns"}) { # re-transliterate the string if the config was reloaded - my $start_index = 0; - while ($start_index != -1) { - $start_index = get_unknown_words($substrings, $config, $args, "$./$total_lines", $start_index); - if ($start_index != -1) { - my $str = ""; - foreach ($start_index..$#$substrings) { - $str .= $substrings->[$_]->[2]; - } - my $new_substrings = replace_line($config, $str); - splice @$substrings, $start_index; - push @$substrings, @$new_substrings; - } + while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) { + $substrings = replace_line($config, $nfd_line); } } elsif ($args->{"debug"}) { foreach my $s (@$substrings) { @@ -1849,6 +1843,8 @@ on-the-fly replacing doesn't work. In general, I have tested the GUI code much less than the rest since you can't really test it automatically very well. +The diacritic handling code is very rudimentary. + Tell me if you find any bugs. =head1 SEE ALSO