transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit b174c1557b4acae8b3f30c0469d7b8f26c27bb8b
parent c78c54b0fbc4a84c11673206c0a1fb6526cd6136
Author: lumidify <nobody@lumidify.org>
Date:   Fri, 17 Apr 2020 09:50:23 +0200

Add debugging option for match and group statements

Diffstat:
Mtests/runtest.sh | 2+-
Mtransliterate.pl | 116++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------
2 files changed, 87 insertions(+), 31 deletions(-)

diff --git a/tests/runtest.sh b/tests/runtest.sh @@ -1,6 +1,6 @@ #!/bin/sh -../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debug "$1/input.txt" > runtest_err.txt 2>&1 +../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debugspecial "$1/input.txt" > runtest_err.txt 2>&1 diff "$1/expected.txt" runtest.txt > /dev/null err1=$? diff "$1/err.txt" runtest_err.txt > /dev/null diff --git a/transliterate.pl b/transliterate.pl @@ -89,7 +89,10 @@ sub add_to_trie { # in the form ["<action name>", <optional args>]. # See `handle_unknown_word_action` for currently accepted values sub prompt_unknown_word { - my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig, $config, $cur_lineno, $config_error) = @_; + # yes, this function really should take fewer arguments... + # it would be better to just pass the substrings and an index + my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig, + $config, $cur_lineno, $args, $config_error) = @_; my $action; my $stop = 0; @@ -208,7 +211,7 @@ sub prompt_unknown_word { $button->signal_connect( clicked => sub { my @chars = @{$without}[1..$#$without]; - my $stripped = replace_strip_chars($config, \@chars, $word); + my $stripped = replace_strip_chars($config, $args, \@chars, $word); # recombine substrings my $repl_text = ""; $repl_text .= $_->[1] foreach @$stripped; @@ -827,9 +830,12 @@ sub interpret_config { warn "ERROR: New group started without ending last one in config\n"; return; } - push @{$config{"replacements"}}, {"type" => "group", "words" => {}}; + push @{$config{"replacements"}}, { + "type" => "group", "tables" => [], + "words" => {}, "options" => {}}; + # add all options such as "endword" to the options hash for (1..$#$cmd) { - $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; + $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1; } $in_group = 1; } elsif ($cmd_name eq "endgroup") { @@ -845,20 +851,24 @@ sub interpret_config { } push @{$config{"replacements"}}, { "type" => "match", + "options" => {}, "search" => NFD($cmd->[1]->{"value"}), "replace" => $cmd->[2]->{"value"}}; for (3..$#$cmd) { - # add optional arguments as keys in replacement config - $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; + # add optional arguments as keys in options hash + $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1; } } elsif ($cmd_name eq "matchignore") { if ($in_group) { warn "ERROR: matchignore command is invalid inside group\n"; return; } - push @{$config{"replacements"}}, {"type" => "match", "search" => NFD($cmd->[1]->{"value"})}; + push @{$config{"replacements"}}, { + "type" => "match", + "options" => {}, + "search" => NFD($cmd->[1]->{"value"})}; for (2..$#$cmd) { - $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; + $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1; } } elsif ($cmd_name eq "replace") { if (!$in_group) { @@ -879,6 +889,9 @@ sub interpret_config { $config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path}; push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table]; + # store list of tables for --debug + push @{$config{"replacements"}->[-1]->{"tables"}}, $table; + # Note: we don't need to check if $table{"choicesep"} was defined # here since we can't ever get this far without first having # loaded a table anyways @@ -1080,9 +1093,9 @@ sub push_unknown { # specifies if the match is only valid when $config->{"beforeword"} # or $config->{"afterword"} occur before or after it, respectively sub replace_match { - my ($config, $replace_config, $substrings) = @_; - my $beginword = exists $replace_config->{"beginword"}; - my $endword = exists $replace_config->{"endword"}; + my ($config, $args, $replace_config, $substrings, $debug_msg) = @_; + my $beginword = exists $replace_config->{"options"}->{"beginword"}; + my $endword = exists $replace_config->{"options"}->{"endword"}; my $fullword = $beginword && $endword; my $beforeword = $config->{"beforeword"}; my $afterword = $config->{"afterword"}; @@ -1118,6 +1131,9 @@ sub replace_match { my $i1 = 0; while ($substrings->[$i]->[1] =~ m/$word/g) { if (!$found_word) { + if ($args->{"debug"}) { + print $debug_msg; + } $found_word = 1; if ($i != 0) { push(@substrings_new, @{$substrings}[0..$i-1]); @@ -1146,9 +1162,15 @@ sub replace_match { } my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0); my $replace_str = $replace_word // $orig_str; - if ($replace_config->{"nofinal"}) { + if ($replace_config->{"options"}->{"nofinal"}) { + if ($args->{"debug"}) { + warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n"; + } push_unknown \@substrings_new, $orig_str, $replace_str; } else { + if ($args->{"debug"}) { + warn "Replaced \"$orig_str\" with \"$replace_str\"\n"; + } push(@substrings_new, [1, $replace_str, $orig_str]); } $last_idx = $i1; @@ -1168,8 +1190,9 @@ sub replace_match { # $replace_config->{"beginword"}, $replace_config->{"endword"} - # same as in `replace_match` sub replace_group { - my ($config, $replace_config, $substrings) = @_; + my ($config, $args, $replace_config, $substrings, $debug_msg) = @_; my @substrings_new; + my $anything_replaced = 0; # Recurse backwords towards the root node of the trie to find the first # node with a key "final" which satisfies the ending condition (if "endword" is set) # Returns the id *after* the last match and the node that was found @@ -1178,7 +1201,7 @@ sub replace_group { my ($i, $tmp_cur_node, $s) = @_; do { my $after_ch = substr($s->[1], $i, 1); - if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"endword"}) || + if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) || $after_ch eq "" || $after_ch =~ $config->{"afterword"})) { return ($i, $tmp_cur_node); } @@ -1207,7 +1230,7 @@ sub replace_group { if (exists $cur_node->{$ch}) { if ($cur_node == $replace_config->{"words"}) { my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : ""; - if (exists($replace_config->{"beginword"}) && + if (exists($replace_config->{"options"}->{"beginword"}) && $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) { push_unknown \@substrings_new, $ch; $i++; @@ -1223,7 +1246,16 @@ sub replace_group { push_unknown \@substrings_new, substr($s->[1], $i + 1, 1); $i += 2; } else { - push(@substrings_new, [1, $tmp_cur_node->{"final"}, substr($s->[1], $start_i, $i-$start_i)]); + my $orig = substr($s->[1], $start_i, $i-$start_i); + my $final = $tmp_cur_node->{"final"}; + if ($args->{"debug"}) { + if (!$anything_replaced) { + warn $debug_msg; + $anything_replaced = 1; + } + warn "Replaced \"$orig\" with \"$final\"\n"; + } + push(@substrings_new, [1, $final, $orig]); } $cur_node = $replace_config->{"words"}; next; @@ -1239,11 +1271,11 @@ sub replace_group { # Perform all replacements on $word, first removing all # characters specified in $chars sub replace_strip_chars { - my ($config, $chars, $word) = @_; + my ($config, $args, $chars, $word) = @_; foreach my $char (@$chars) { $word =~ s/\Q$char\E//g; } - return replace_line($config, $word); + return replace_line($config, $args, $word); } # Perform all replacements on $line based on $config @@ -1253,13 +1285,29 @@ sub replace_strip_chars { # transliterated string, and lastly the original string. # If the first element is 0, the second two elements are obviously same sub replace_line { - my ($config, $line) = @_; + my ($config, $args, $line) = @_; my $substrings = [[0, $line, $line]]; foreach my $replacement (@{$config->{"replacements"}}) { if ($replacement->{"type"} eq "match") { - replace_match($config, $replacement, $substrings); + my $debug_msg; + if ($args->{"debug"}) { + my $options = join " ", keys(%{$replacement->{"options"}}); + $debug_msg = "Match ($options): \"$replacement->{search}\""; + if ($replacement->{"replace"}) { + $debug_msg .= " \"$replacement->{replace}\"\n"; + } else { + $debug_msg .= " (ignore)\n"; + } + } + replace_match($config, $args, $replacement, $substrings, $debug_msg); } elsif ($replacement->{"type"} eq "group") { - replace_group($config, $replacement, $substrings); + my $debug_msg; + if ($args->{"debug"}) { + my $options = join " ", keys(%{$replacement->{"options"}}); + my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"'; + $debug_msg = "Group ($options): $tables\n"; + } + replace_group($config, $args, $replacement, $substrings, $debug_msg); } } # splits all words at the end so that the splitting characters @@ -1294,8 +1342,7 @@ sub call_unknown_word_window { } my $action = prompt_unknown_word($contextl, $contextl_orig, $word->[1], $word->[2], $contextr, $contextr_orig, - $config, "$cur_lineno" - ); + $config, "$cur_lineno", $args); # if $ret == 3, rest of line should be skipped # if $ret == 2, config could not be loaded # if $ret == 1, line must be redone with new config @@ -1305,7 +1352,7 @@ sub call_unknown_word_window { while ($ret == 2) { $action = prompt_unknown_word($contextl, $contextl_orig, $word->[1], $word->[2], $contextr, $contextr_orig, - $config, "$cur_lineno", 1); + $config, "$cur_lineno", $args, 1); $ret = handle_unknown_word_action($action, $config, $args); } return $ret; @@ -1359,14 +1406,14 @@ sub replace { $comment = $1; } my $nfd_line = NFD($line); - my $substrings = replace_line($config, $nfd_line); + my $substrings = replace_line($config, $args, $nfd_line); if (!$args->{"nounknowns"}) { # re-transliterate the string if the config was reloaded while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) { - $substrings = replace_line($config, $nfd_line); + $substrings = replace_line($config, $args, $nfd_line); } - } elsif ($args->{"debug"}) { + } elsif ($args->{"debugspecial"}) { foreach my $s (@$substrings) { if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) { warn "Unknown word: \"$s->[1]\"\n"; @@ -1378,10 +1425,10 @@ sub replace { # "Open in unknown word window" while (my $ret = prompt_choose_word($substrings, $config, $args, "$./$total_lines")) { if ($ret == 1) { - $substrings = replace_line($config, $nfd_line); + $substrings = replace_line($config, $args, $nfd_line); } } - } elsif ($args->{"debug"}) { + } elsif ($args->{"debugspecial"}) { foreach my $s (@$substrings) { if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) { my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1]; @@ -1399,7 +1446,7 @@ sub replace { my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => ""); GetOptions( - \%args, "debug", + \%args, "debug", "debugspecial", "nochoices", "nounknowns", "force", "start=i", "output=s", "config=s", @@ -1542,6 +1589,15 @@ prompts. =item B<--debug> +Prints information helpful for debugging problems with the B<match> and B<group> +statements. + +For each B<match> or B<group> statement which replaces anything, the original +statement is printed (the format is a bit different than in the config) and +each actual word that's replaced is printed. + +=item B<--debugspecial> + This option is only useful for automatic testing of the transliteration engine. If B<--nochoices> is enabled, each word in the input with multiple choices will