commit b174c1557b4acae8b3f30c0469d7b8f26c27bb8b
parent c78c54b0fbc4a84c11673206c0a1fb6526cd6136
Author: lumidify <nobody@lumidify.org>
Date: Fri, 17 Apr 2020 09:50:23 +0200
Add debugging option for match and group statements
Diffstat:
2 files changed, 87 insertions(+), 31 deletions(-)
diff --git a/tests/runtest.sh b/tests/runtest.sh
@@ -1,6 +1,6 @@
#!/bin/sh
-../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debug "$1/input.txt" > runtest_err.txt 2>&1
+../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debugspecial "$1/input.txt" > runtest_err.txt 2>&1
diff "$1/expected.txt" runtest.txt > /dev/null
err1=$?
diff "$1/err.txt" runtest_err.txt > /dev/null
diff --git a/transliterate.pl b/transliterate.pl
@@ -89,7 +89,10 @@ sub add_to_trie {
# in the form ["<action name>", <optional args>].
# See `handle_unknown_word_action` for currently accepted values
sub prompt_unknown_word {
- my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig, $config, $cur_lineno, $config_error) = @_;
+ # yes, this function really should take fewer arguments...
+ # it would be better to just pass the substrings and an index
+ my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig,
+ $config, $cur_lineno, $args, $config_error) = @_;
my $action;
my $stop = 0;
@@ -208,7 +211,7 @@ sub prompt_unknown_word {
$button->signal_connect(
clicked => sub {
my @chars = @{$without}[1..$#$without];
- my $stripped = replace_strip_chars($config, \@chars, $word);
+ my $stripped = replace_strip_chars($config, $args, \@chars, $word);
# recombine substrings
my $repl_text = "";
$repl_text .= $_->[1] foreach @$stripped;
@@ -827,9 +830,12 @@ sub interpret_config {
warn "ERROR: New group started without ending last one in config\n";
return;
}
- push @{$config{"replacements"}}, {"type" => "group", "words" => {}};
+ push @{$config{"replacements"}}, {
+ "type" => "group", "tables" => [],
+ "words" => {}, "options" => {}};
+ # add all options such as "endword" to the options hash
for (1..$#$cmd) {
- $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+ $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
}
$in_group = 1;
} elsif ($cmd_name eq "endgroup") {
@@ -845,20 +851,24 @@ sub interpret_config {
}
push @{$config{"replacements"}}, {
"type" => "match",
+ "options" => {},
"search" => NFD($cmd->[1]->{"value"}),
"replace" => $cmd->[2]->{"value"}};
for (3..$#$cmd) {
- # add optional arguments as keys in replacement config
- $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+ # add optional arguments as keys in options hash
+ $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
}
} elsif ($cmd_name eq "matchignore") {
if ($in_group) {
warn "ERROR: matchignore command is invalid inside group\n";
return;
}
- push @{$config{"replacements"}}, {"type" => "match", "search" => NFD($cmd->[1]->{"value"})};
+ push @{$config{"replacements"}}, {
+ "type" => "match",
+ "options" => {},
+ "search" => NFD($cmd->[1]->{"value"})};
for (2..$#$cmd) {
- $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+ $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
}
} elsif ($cmd_name eq "replace") {
if (!$in_group) {
@@ -879,6 +889,9 @@ sub interpret_config {
$config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path};
push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table];
+ # store list of tables for --debug
+ push @{$config{"replacements"}->[-1]->{"tables"}}, $table;
+
# Note: we don't need to check if $table{"choicesep"} was defined
# here since we can't ever get this far without first having
# loaded a table anyways
@@ -1080,9 +1093,9 @@ sub push_unknown {
# specifies if the match is only valid when $config->{"beforeword"}
# or $config->{"afterword"} occur before or after it, respectively
sub replace_match {
- my ($config, $replace_config, $substrings) = @_;
- my $beginword = exists $replace_config->{"beginword"};
- my $endword = exists $replace_config->{"endword"};
+ my ($config, $args, $replace_config, $substrings, $debug_msg) = @_;
+ my $beginword = exists $replace_config->{"options"}->{"beginword"};
+ my $endword = exists $replace_config->{"options"}->{"endword"};
my $fullword = $beginword && $endword;
my $beforeword = $config->{"beforeword"};
my $afterword = $config->{"afterword"};
@@ -1118,6 +1131,9 @@ sub replace_match {
my $i1 = 0;
while ($substrings->[$i]->[1] =~ m/$word/g) {
if (!$found_word) {
+ if ($args->{"debug"}) {
+ print $debug_msg;
+ }
$found_word = 1;
if ($i != 0) {
push(@substrings_new, @{$substrings}[0..$i-1]);
@@ -1146,9 +1162,15 @@ sub replace_match {
}
my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
my $replace_str = $replace_word // $orig_str;
- if ($replace_config->{"nofinal"}) {
+ if ($replace_config->{"options"}->{"nofinal"}) {
+ if ($args->{"debug"}) {
+ warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n";
+ }
push_unknown \@substrings_new, $orig_str, $replace_str;
} else {
+ if ($args->{"debug"}) {
+ warn "Replaced \"$orig_str\" with \"$replace_str\"\n";
+ }
push(@substrings_new, [1, $replace_str, $orig_str]);
}
$last_idx = $i1;
@@ -1168,8 +1190,9 @@ sub replace_match {
# $replace_config->{"beginword"}, $replace_config->{"endword"} -
# same as in `replace_match`
sub replace_group {
- my ($config, $replace_config, $substrings) = @_;
+ my ($config, $args, $replace_config, $substrings, $debug_msg) = @_;
my @substrings_new;
+ my $anything_replaced = 0;
# Recurse backwords towards the root node of the trie to find the first
# node with a key "final" which satisfies the ending condition (if "endword" is set)
# Returns the id *after* the last match and the node that was found
@@ -1178,7 +1201,7 @@ sub replace_group {
my ($i, $tmp_cur_node, $s) = @_;
do {
my $after_ch = substr($s->[1], $i, 1);
- if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"endword"}) ||
+ if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) ||
$after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
return ($i, $tmp_cur_node);
}
@@ -1207,7 +1230,7 @@ sub replace_group {
if (exists $cur_node->{$ch}) {
if ($cur_node == $replace_config->{"words"}) {
my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
- if (exists($replace_config->{"beginword"}) &&
+ if (exists($replace_config->{"options"}->{"beginword"}) &&
$before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
push_unknown \@substrings_new, $ch;
$i++;
@@ -1223,7 +1246,16 @@ sub replace_group {
push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
$i += 2;
} else {
- push(@substrings_new, [1, $tmp_cur_node->{"final"}, substr($s->[1], $start_i, $i-$start_i)]);
+ my $orig = substr($s->[1], $start_i, $i-$start_i);
+ my $final = $tmp_cur_node->{"final"};
+ if ($args->{"debug"}) {
+ if (!$anything_replaced) {
+ warn $debug_msg;
+ $anything_replaced = 1;
+ }
+ warn "Replaced \"$orig\" with \"$final\"\n";
+ }
+ push(@substrings_new, [1, $final, $orig]);
}
$cur_node = $replace_config->{"words"};
next;
@@ -1239,11 +1271,11 @@ sub replace_group {
# Perform all replacements on $word, first removing all
# characters specified in $chars
sub replace_strip_chars {
- my ($config, $chars, $word) = @_;
+ my ($config, $args, $chars, $word) = @_;
foreach my $char (@$chars) {
$word =~ s/\Q$char\E//g;
}
- return replace_line($config, $word);
+ return replace_line($config, $args, $word);
}
# Perform all replacements on $line based on $config
@@ -1253,13 +1285,29 @@ sub replace_strip_chars {
# transliterated string, and lastly the original string.
# If the first element is 0, the second two elements are obviously same
sub replace_line {
- my ($config, $line) = @_;
+ my ($config, $args, $line) = @_;
my $substrings = [[0, $line, $line]];
foreach my $replacement (@{$config->{"replacements"}}) {
if ($replacement->{"type"} eq "match") {
- replace_match($config, $replacement, $substrings);
+ my $debug_msg;
+ if ($args->{"debug"}) {
+ my $options = join " ", keys(%{$replacement->{"options"}});
+ $debug_msg = "Match ($options): \"$replacement->{search}\"";
+ if ($replacement->{"replace"}) {
+ $debug_msg .= " \"$replacement->{replace}\"\n";
+ } else {
+ $debug_msg .= " (ignore)\n";
+ }
+ }
+ replace_match($config, $args, $replacement, $substrings, $debug_msg);
} elsif ($replacement->{"type"} eq "group") {
- replace_group($config, $replacement, $substrings);
+ my $debug_msg;
+ if ($args->{"debug"}) {
+ my $options = join " ", keys(%{$replacement->{"options"}});
+ my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"';
+ $debug_msg = "Group ($options): $tables\n";
+ }
+ replace_group($config, $args, $replacement, $substrings, $debug_msg);
}
}
# splits all words at the end so that the splitting characters
@@ -1294,8 +1342,7 @@ sub call_unknown_word_window {
}
my $action = prompt_unknown_word($contextl, $contextl_orig,
$word->[1], $word->[2], $contextr, $contextr_orig,
- $config, "$cur_lineno"
- );
+ $config, "$cur_lineno", $args);
# if $ret == 3, rest of line should be skipped
# if $ret == 2, config could not be loaded
# if $ret == 1, line must be redone with new config
@@ -1305,7 +1352,7 @@ sub call_unknown_word_window {
while ($ret == 2) {
$action = prompt_unknown_word($contextl, $contextl_orig,
$word->[1], $word->[2], $contextr, $contextr_orig,
- $config, "$cur_lineno", 1);
+ $config, "$cur_lineno", $args, 1);
$ret = handle_unknown_word_action($action, $config, $args);
}
return $ret;
@@ -1359,14 +1406,14 @@ sub replace {
$comment = $1;
}
my $nfd_line = NFD($line);
- my $substrings = replace_line($config, $nfd_line);
+ my $substrings = replace_line($config, $args, $nfd_line);
if (!$args->{"nounknowns"}) {
# re-transliterate the string if the config was reloaded
while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
- $substrings = replace_line($config, $nfd_line);
+ $substrings = replace_line($config, $args, $nfd_line);
}
- } elsif ($args->{"debug"}) {
+ } elsif ($args->{"debugspecial"}) {
foreach my $s (@$substrings) {
if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
warn "Unknown word: \"$s->[1]\"\n";
@@ -1378,10 +1425,10 @@ sub replace {
# "Open in unknown word window"
while (my $ret = prompt_choose_word($substrings, $config, $args, "$./$total_lines")) {
if ($ret == 1) {
- $substrings = replace_line($config, $nfd_line);
+ $substrings = replace_line($config, $args, $nfd_line);
}
}
- } elsif ($args->{"debug"}) {
+ } elsif ($args->{"debugspecial"}) {
foreach my $s (@$substrings) {
if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
@@ -1399,7 +1446,7 @@ sub replace {
my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
GetOptions(
- \%args, "debug",
+ \%args, "debug", "debugspecial",
"nochoices", "nounknowns",
"force", "start=i",
"output=s", "config=s",
@@ -1542,6 +1589,15 @@ prompts.
=item B<--debug>
+Prints information helpful for debugging problems with the B<match> and B<group>
+statements.
+
+For each B<match> or B<group> statement which replaces anything, the original
+statement is printed (the format is a bit different than in the config) and
+each actual word that's replaced is printed.
+
+=item B<--debugspecial>
+
This option is only useful for automatic testing of the transliteration engine.
If B<--nochoices> is enabled, each word in the input with multiple choices will