transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit ec431a30af78b0cc2f936c8064cb57acecff5414
parent 1679620c311e9d20a76f682e3207e9ba65d03b36
Author: lumidify <nobody@lumidify.org>
Date:   Tue, 31 Mar 2020 15:48:27 +0200

Add diacritic handling; documentation not updated yet

Diffstat:
Atests/test5/config | 20++++++++++++++++++++
Atests/test5/descr.txt | 1+
Atests/test5/err.txt | 3+++
Atests/test5/expected.txt | 3+++
Atests/test5/input.txt | 3+++
Mtransliterate.pl | 94++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
6 files changed, 109 insertions(+), 15 deletions(-)

diff --git a/tests/test5/config b/tests/test5/config @@ -0,0 +1,20 @@ +split "[ \n]+" +beforeword " " +afterword "[ \n]" +tablesep "," +choicesep "|" + +ignore "../data/ignore.txt" +table words "../data/words1.txt" +table endings "../data/endings_choices.txt" + +expand words endings + +match "w ord" "word" beginword nofinal +matchignore "-d" endword + +group beginword endword +replace words +endgroup + +diacritics "̈" diff --git a/tests/test5/descr.txt b/tests/test5/descr.txt @@ -0,0 +1 @@ +matchignore only endword; expand noroot diff --git a/tests/test5/err.txt b/tests/test5/err.txt @@ -0,0 +1,3 @@ +Unknown word: "word0" +Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices. +Unknown word: "-dword9end2" diff --git a/tests/test5/expected.txt b/tests/test5/expected.txt @@ -0,0 +1,3 @@ +ignore +word0 word1_replacedend1r1|word1_replacedend1r2 +-dword9end2 word9_replacedend2r-d diff --git a/tests/test5/input.txt b/tests/test5/input.txt @@ -0,0 +1,3 @@ +ignore +wörd0 word1end1 +-dword9end2 word9end2-d diff --git a/transliterate.pl b/transliterate.pl @@ -79,7 +79,7 @@ sub add_to_trie { # in the form ["<action name>", <optional args>]. # See `handle_unknown_word_action` for currently accepted values sub prompt_unknown_word { - my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $table_paths, $cur_lineno, $config_error) = @_; + my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $config, $cur_lineno, $config_error) = @_; my $action; my $stop = 0; @@ -162,12 +162,43 @@ sub prompt_unknown_word { $vbox->pack_start($hbox, FALSE, FALSE, 10); $hbox->show; + if (exists $config->{"diacritics"}) { + $hbox = Gtk2::HBox->new(FALSE, 0); + my $no_diacritic_label; + my $stripped; + my $accept_button = Gtk2::Button->new("Accept?"); + $accept_button->signal_connect( + clicked => sub { + $action = ["replace", @$stripped]; + $window->destroy; + }, $window); + $button = Gtk2::Button->new("Retry without diacritics"); + $button->signal_connect( + clicked => sub { + $stripped = replace_strip_diacritics($config, $word); + my $tmp = ""; + foreach (@$stripped) { + $tmp .= $_->[1]; + } + $no_diacritic_label->set_text($tmp); + $accept_button->show; + }, $window); + $hbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + $hbox->pack_start($accept_button, FALSE, FALSE, 0); + $no_diacritic_label = Gtk2::Label->new(""); + $hbox->pack_start($no_diacritic_label, FALSE, FALSE, 10); + $no_diacritic_label->show; + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox->show; + } + $hbox = Gtk2::HBox->new(FALSE, 0); $label = Gtk2::Label->new("Add to list: "); $hbox->pack_start($label, FALSE, FALSE, 0); $label->show; my $path_list = Gtk2::ComboBox->new_text; - foreach my $path (sort keys %$table_paths) { + foreach my $path (sort keys %{$config->{"display_tables"}}) { $path_list->append_text($path); } $hbox->pack_start($path_list, FALSE, FALSE, 0); @@ -667,7 +698,8 @@ sub interpret_config { "tablesep" => [$STRING], "choicesep" => [$STRING], "group" => [], - "endgroup" => [] + "endgroup" => [], + "diacritics" => [$STRING] ); my $in_group = 0; foreach my $cmd (@$config_list) { @@ -787,6 +819,13 @@ sub interpret_config { # loaded a table anyways my $trie_root = $config{"replacements"}->[-1]->{"words"}; add_to_trie($table, $trie_root, $tables{$table}, $args, \%config); + } elsif ($cmd->[0]->{"value"} eq "diacritics") { + if (!exists $config{"diacritics"}) { + $config{"diacritics"} = []; + } + foreach (1..$#$cmd) { + push @{$config{"diacritics"}}, $cmd->[$_]->{"value"}; + } } elsif ($cmd->[0]->{"value"} eq "split") { $config{"split"} = $cmd->[1]->{"value"}; } elsif ($cmd->[0]->{"value"} eq "beforeword") { @@ -857,7 +896,7 @@ sub load_config { # 1 - the current line needs to be re-transliterated with the new config # 2 - an error occurred while reloading the config sub handle_unknown_word_action { - my ($action, $config, $args) = @_; + my ($substrings, $index, $action, $config, $args) = @_; if ($action->[0] eq "ignore") { $config->{"ignore_words"}->{$action->[2]} = ""; if ($action->[1] eq "permanent") { @@ -915,7 +954,10 @@ sub handle_unknown_word_action { } else { return 2; } + } elsif ($action->[0] eq "replace") { + splice @$substrings, $index, 1, @{$action}[1..$#$action]; } + return 0; } # Split $substrings into single words based on the "split" option @@ -1131,6 +1173,14 @@ sub replace_group { @$substrings = @substrings_new; } +sub replace_strip_diacritics { + my ($config, $word) = @_; + foreach my $diacritic (@{$config->{"diacritics"}}) { + $word =~ s/$diacritic//g; + } + return replace_line($config, $word); +} + # Perform all replacements on $line based on $config # $substrings: array of arrays - each one has three elements: # first 0 or 1, indicating if the substring following it has already @@ -1166,11 +1216,14 @@ sub replace_line { # $args - the command line args # $cur_lineno - display string to show the user the current line number # Returns: -# 1 - the current line must be replaced again with the new config -# 0 - everything's fine +# -1 - all done +# anything else - the substrings must be replaced again starting at the returned index sub get_unknown_words { my ($substrings, $config, $args, $cur_lineno) = @_; - foreach my $i (0..$#$substrings) { + my $i = 0; + # this is done so $substrings can be modified during the loop + # (instead of just foreach) + while ($i <= $#$substrings) { my $word = $substrings->[$i]; if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) { my $contextl = ""; @@ -1187,26 +1240,27 @@ sub get_unknown_words { } my $action = prompt_unknown_word($contextl, $contextl_orig, $word->[1], $contextr, $contextr_orig, - $config->{"display_tables"}, "$cur_lineno" + $config, "$cur_lineno" ); # if $ret == 2, config could not be loaded # if $ret == 1, line must be redone with new config - my $ret = handle_unknown_word_action($action, $config, $args); + my $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args); # keep retrying until the user chooses an action which # didn't throw an error while ($ret == 2) { $action = prompt_unknown_word($contextl, $contextl_orig, $word->[1], $contextr, $contextr_orig, - $config->{"display_tables"}, "$cur_lineno", 1); - $ret = handle_unknown_word_action($action, $config, $args); + $config, "$cur_lineno", 1); + $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args); } # re-transliterate the line with the new config if ($ret == 1) { - return 1; + return $i; } } + $i++; } - return 0; + return -1; } # Main replacement function @@ -1228,8 +1282,18 @@ sub replace { if (!$args->{"nounknowns"}) { # re-transliterate the string if the config was reloaded - while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) { - $substrings = replace_line($config, $nfd_line); + my $start_index = 0; + while ($start_index != -1) { + $start_index = get_unknown_words($substrings, $config, $args, "$./$total_lines", $start_index); + if ($start_index != -1) { + my $str = ""; + foreach ($start_index..$#$substrings) { + $str .= $substrings->[$_]->[2]; + } + my $new_substrings = replace_line($config, $str); + splice @$substrings, $start_index; + push @$substrings, @$new_substrings; + } } } elsif ($args->{"debug"}) { foreach my $s (@$substrings) {