commit ec431a30af78b0cc2f936c8064cb57acecff5414
parent 1679620c311e9d20a76f682e3207e9ba65d03b36
Author: lumidify <nobody@lumidify.org>
Date: Tue, 31 Mar 2020 15:48:27 +0200
Add diacritic handling; documentation not updated yet
Diffstat:
6 files changed, 109 insertions(+), 15 deletions(-)
diff --git a/tests/test5/config b/tests/test5/config
@@ -0,0 +1,20 @@
+split "[ \n]+"
+beforeword " "
+afterword "[ \n]"
+tablesep ","
+choicesep "|"
+
+ignore "../data/ignore.txt"
+table words "../data/words1.txt"
+table endings "../data/endings_choices.txt"
+
+expand words endings
+
+match "w ord" "word" beginword nofinal
+matchignore "-d" endword
+
+group beginword endword
+replace words
+endgroup
+
+diacritics "̈"
diff --git a/tests/test5/descr.txt b/tests/test5/descr.txt
@@ -0,0 +1 @@
+matchignore only endword; expand noroot
diff --git a/tests/test5/err.txt b/tests/test5/err.txt
@@ -0,0 +1,3 @@
+Unknown word: "word0"
+Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices.
+Unknown word: "-dword9end2"
diff --git a/tests/test5/expected.txt b/tests/test5/expected.txt
@@ -0,0 +1,3 @@
+ignore
+word0 word1_replacedend1r1|word1_replacedend1r2
+-dword9end2 word9_replacedend2r-d
diff --git a/tests/test5/input.txt b/tests/test5/input.txt
@@ -0,0 +1,3 @@
+ignore
+wörd0 word1end1
+-dword9end2 word9end2-d
diff --git a/transliterate.pl b/transliterate.pl
@@ -79,7 +79,7 @@ sub add_to_trie {
# in the form ["<action name>", <optional args>].
# See `handle_unknown_word_action` for currently accepted values
sub prompt_unknown_word {
- my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $table_paths, $cur_lineno, $config_error) = @_;
+ my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $config, $cur_lineno, $config_error) = @_;
my $action;
my $stop = 0;
@@ -162,12 +162,43 @@ sub prompt_unknown_word {
$vbox->pack_start($hbox, FALSE, FALSE, 10);
$hbox->show;
+ if (exists $config->{"diacritics"}) {
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ my $no_diacritic_label;
+ my $stripped;
+ my $accept_button = Gtk2::Button->new("Accept?");
+ $accept_button->signal_connect(
+ clicked => sub {
+ $action = ["replace", @$stripped];
+ $window->destroy;
+ }, $window);
+ $button = Gtk2::Button->new("Retry without diacritics");
+ $button->signal_connect(
+ clicked => sub {
+ $stripped = replace_strip_diacritics($config, $word);
+ my $tmp = "";
+ foreach (@$stripped) {
+ $tmp .= $_->[1];
+ }
+ $no_diacritic_label->set_text($tmp);
+ $accept_button->show;
+ }, $window);
+ $hbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+ $hbox->pack_start($accept_button, FALSE, FALSE, 0);
+ $no_diacritic_label = Gtk2::Label->new("");
+ $hbox->pack_start($no_diacritic_label, FALSE, FALSE, 10);
+ $no_diacritic_label->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 0);
+ $hbox->show;
+ }
+
$hbox = Gtk2::HBox->new(FALSE, 0);
$label = Gtk2::Label->new("Add to list: ");
$hbox->pack_start($label, FALSE, FALSE, 0);
$label->show;
my $path_list = Gtk2::ComboBox->new_text;
- foreach my $path (sort keys %$table_paths) {
+ foreach my $path (sort keys %{$config->{"display_tables"}}) {
$path_list->append_text($path);
}
$hbox->pack_start($path_list, FALSE, FALSE, 0);
@@ -667,7 +698,8 @@ sub interpret_config {
"tablesep" => [$STRING],
"choicesep" => [$STRING],
"group" => [],
- "endgroup" => []
+ "endgroup" => [],
+ "diacritics" => [$STRING]
);
my $in_group = 0;
foreach my $cmd (@$config_list) {
@@ -787,6 +819,13 @@ sub interpret_config {
# loaded a table anyways
my $trie_root = $config{"replacements"}->[-1]->{"words"};
add_to_trie($table, $trie_root, $tables{$table}, $args, \%config);
+ } elsif ($cmd->[0]->{"value"} eq "diacritics") {
+ if (!exists $config{"diacritics"}) {
+ $config{"diacritics"} = [];
+ }
+ foreach (1..$#$cmd) {
+ push @{$config{"diacritics"}}, $cmd->[$_]->{"value"};
+ }
} elsif ($cmd->[0]->{"value"} eq "split") {
$config{"split"} = $cmd->[1]->{"value"};
} elsif ($cmd->[0]->{"value"} eq "beforeword") {
@@ -857,7 +896,7 @@ sub load_config {
# 1 - the current line needs to be re-transliterated with the new config
# 2 - an error occurred while reloading the config
sub handle_unknown_word_action {
- my ($action, $config, $args) = @_;
+ my ($substrings, $index, $action, $config, $args) = @_;
if ($action->[0] eq "ignore") {
$config->{"ignore_words"}->{$action->[2]} = "";
if ($action->[1] eq "permanent") {
@@ -915,7 +954,10 @@ sub handle_unknown_word_action {
} else {
return 2;
}
+ } elsif ($action->[0] eq "replace") {
+ splice @$substrings, $index, 1, @{$action}[1..$#$action];
}
+ return 0;
}
# Split $substrings into single words based on the "split" option
@@ -1131,6 +1173,14 @@ sub replace_group {
@$substrings = @substrings_new;
}
+sub replace_strip_diacritics {
+ my ($config, $word) = @_;
+ foreach my $diacritic (@{$config->{"diacritics"}}) {
+ $word =~ s/$diacritic//g;
+ }
+ return replace_line($config, $word);
+}
+
# Perform all replacements on $line based on $config
# $substrings: array of arrays - each one has three elements:
# first 0 or 1, indicating if the substring following it has already
@@ -1166,11 +1216,14 @@ sub replace_line {
# $args - the command line args
# $cur_lineno - display string to show the user the current line number
# Returns:
-# 1 - the current line must be replaced again with the new config
-# 0 - everything's fine
+# -1 - all done
+# anything else - the substrings must be replaced again starting at the returned index
sub get_unknown_words {
my ($substrings, $config, $args, $cur_lineno) = @_;
- foreach my $i (0..$#$substrings) {
+ my $i = 0;
+ # this is done so $substrings can be modified during the loop
+ # (instead of just foreach)
+ while ($i <= $#$substrings) {
my $word = $substrings->[$i];
if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
my $contextl = "";
@@ -1187,26 +1240,27 @@ sub get_unknown_words {
}
my $action = prompt_unknown_word($contextl, $contextl_orig,
$word->[1], $contextr, $contextr_orig,
- $config->{"display_tables"}, "$cur_lineno"
+ $config, "$cur_lineno"
);
# if $ret == 2, config could not be loaded
# if $ret == 1, line must be redone with new config
- my $ret = handle_unknown_word_action($action, $config, $args);
+ my $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args);
# keep retrying until the user chooses an action which
# didn't throw an error
while ($ret == 2) {
$action = prompt_unknown_word($contextl, $contextl_orig,
$word->[1], $contextr, $contextr_orig,
- $config->{"display_tables"}, "$cur_lineno", 1);
- $ret = handle_unknown_word_action($action, $config, $args);
+ $config, "$cur_lineno", 1);
+ $ret = handle_unknown_word_action($substrings, $i, $action, $config, $args);
}
# re-transliterate the line with the new config
if ($ret == 1) {
- return 1;
+ return $i;
}
}
+ $i++;
}
- return 0;
+ return -1;
}
# Main replacement function
@@ -1228,8 +1282,18 @@ sub replace {
if (!$args->{"nounknowns"}) {
# re-transliterate the string if the config was reloaded
- while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
- $substrings = replace_line($config, $nfd_line);
+ my $start_index = 0;
+ while ($start_index != -1) {
+ $start_index = get_unknown_words($substrings, $config, $args, "$./$total_lines", $start_index);
+ if ($start_index != -1) {
+ my $str = "";
+ foreach ($start_index..$#$substrings) {
+ $str .= $substrings->[$_]->[2];
+ }
+ my $new_substrings = replace_line($config, $str);
+ splice @$substrings, $start_index;
+ push @$substrings, @$new_substrings;
+ }
}
} elsif ($args->{"debug"}) {
foreach my $s (@$substrings) {