Fix 'Add replacement' with words requiring expansions in the GUI; tests currently broken - transliterate

commit 186074f5c746c6f3452da864722a9fd7da2dd964
parent 1adf70462c0ec838b8611ce505c139862a768e6d
Author: lumidify <nobody@lumidify.org>
Date:   Mon, 30 Mar 2020 13:04:56 +0200

Fix 'Add replacement' with words requiring expansions in the GUI; tests currently broken

Diffstat:
A tests/data/endings2.txt  | 2 ++
M tests/data/words.txt  | 1 +
A tests/test4/config  | 18 ++++++++++++++++++
A tests/test4/descr.txt  | 1 +
A tests/test4/err.txt  | 3 +++
A tests/test4/expected.txt  | 3 +++
A tests/test4/input.txt  | 1 +
M transliterate.pl  | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------

8 files changed, 163 insertions(+), 62 deletions(-)
diff --git a/tests/data/endings2.txt b/tests/data/endings2.txt
@@ -0,0 +1,2 @@
+end4	end4_replaced
+end5	end5_replaced
diff --git a/tests/data/words.txt b/tests/data/words.txt
@@ -8,3 +8,4 @@ word6	word6_replaced
 word7	word7_replaced
 word8	word8_replaced
 word9	word9_replaced
+word10	word10_replaced
diff --git a/tests/test4/config b/tests/test4/config
@@ -0,0 +1,18 @@
+split "\s+"
+beforeword "\s"
+afterword "\s"
+
+ignore "../data/ignore.txt"
+table words "../data/words.txt"
+table words1 "../data/words.txt"
+table endings "../data/endings.txt"
+table endings2 "../data/endings2.txt"
+
+expand words endings noroot
+expand words endings2 noroot
+expand words1 endings2 noroot
+
+group beginword endword
+replace words
+replace words1
+endgroup
diff --git a/tests/test4/descr.txt b/tests/test4/descr.txt
@@ -0,0 +1 @@
+matchignore only endword; expand noroot
diff --git a/tests/test4/err.txt b/tests/test4/err.txt
@@ -0,0 +1,3 @@
+Unknown word: "word0"
+Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices.
+Unknown word: "-dword9end2"
diff --git a/tests/test4/expected.txt b/tests/test4/expected.txt
@@ -0,0 +1,3 @@
+ignore
+word0   word1_replacedend1r1|word1_replacedend1r2
+-dword9end2 word9_replacedend2r-d
diff --git a/tests/test4/input.txt b/tests/test4/input.txt
@@ -0,0 +1 @@
+word10end1end5 word10end5
diff --git a/transliterate.pl b/transliterate.pl
@@ -561,28 +561,7 @@ sub load_ignore_table {
 # 0 - an error occurred
 # 1 - everything's fine
 sub expand_table {
-	my ($cmd, $tables, $config) = @_;
-	my $table_name = $cmd->[1]->{"value"};
-	my $new_table_name = $cmd->[2]->{"value"};
-	my $forms_name = $cmd->[3]->{"value"};
-	if ($new_table_name eq "same") {
-		$new_table_name = $table_name;
-	}
-	if (!exists($tables->{$table_name})) {
-		warn "expand_table: table \"$table_name\" does not exist.\n";
-		return 0;
-	}
-	if (!exists($tables->{$forms_name})) {
-		warn "expand_table: table \"$forms_name\" does not exist.\n";
-		return 0;
-	}
-	my $table = $tables->{$table_name};
-	my $forms = $tables->{$forms_name};
-
-	my $noroot = 0;
-	if ($#$cmd >= 4 && $cmd->[4]->{"value"} eq "noroot") {
-		$noroot = 1;
-	}
+	my ($table, $forms, $noroot, $config) = @_;
 
 	my %new_table;
 	foreach my $word (keys %$table) {
@@ -603,8 +582,7 @@ sub expand_table {
 			$new_table{$word} = $table->{$word};
 		}
 	}
-	$tables->{$new_table_name} = \%new_table;
-	return 1;
+	return \%new_table;
 }
 
 # Check if the number and types of arguments given to a config command are right
@@ -648,9 +626,19 @@ sub check_args {
 # $config_list - the list returned by parse_config
 sub interpret_config {
 	my ($config_list, $args) = @_;
+	my %tables;
 	my %config;
+	# table_paths stores a list of all table and replacement ids that are
+	# impacted by the path so the replacement can be added on the fly when
+	# a new replacement is added from the GUI
+	# the "replacement id" is just the number of the replacement group,
+	# starting at 0 with the first group in the config
 	$config{"table_paths"} = {};
-	$config{"tables"} = {};
+	# a mapping between the table ids and tables for all tables used as
+	# ending tables in expand statements - so expansions can be done
+	# on the fly when adding a replacement word from the GUI
+	$config{"ending_tables"} = {};
+	# ignore is the path to the ignore file, ignore_words the actual table
 	$config{"ignore"} = "";
 	$config{"ignore_words"} = {};
 	$config{"split"} = "\\s";
@@ -658,12 +646,16 @@ sub interpret_config {
 	$config{"afterword"} = "\\s";
 	$config{"tablesep"} = "\t";
 	$config{"choicesep"} = "\$";
+	# a list of "replacement configs", which specify the type and any
+	# other arguments (this is given to replace_match, etc.
 	$config{"replacements"} = [];
-	my %tmp_table_paths;
+	# these are temporary mappings used while loading the config
+	my %path_to_table;
+	my %table_id_to_path;
 	my %mandatory_args = (
 		"ignore" => [$STRING],
 		"table" => [$ID],
-		"expand" => [$ID, $ID, $ID],
+		"expand" => [$ID, $ID],
 		"match" => [$STRING, $STRING],
 		"matchignore" => [$STRING],
 		"replace" => [$ID],
@@ -688,17 +680,42 @@ sub interpret_config {
 				return undef;
 			}
 			if ($cmd->[0]->{"value"} eq "table") {
-				my $table = load_table $cmd->[2]->{"value"}, $args, \%config;
-				if (!$table) {
-					return undef;
+				my $table_path = $cmd->[2]->{"value"};
+				my $table;
+				# add to temporary path-to-table mapping so tables aren't
+				# loaded unnecessarily
+				if (exists $path_to_table{$table_path}) {
+					$table = $path_to_table{$table_path};
+				} else {
+					$table = load_table $table_path, $args, \%config;
+					$path_to_table{$table_path} = $table;
+					return if !$table;
 				}
-				$config{tables}->{$cmd->[1]->{"value"}} = $table;
-				$tmp_table_paths{$cmd->[1]->{"value"}} = $cmd->[2]->{"value"};
+				my $table_id = $cmd->[1]->{"value"};
+				$tables{$table_id} = $table;
+				$table_id_to_path{$table_id} = $table_path;
 			} elsif ($cmd->[0]->{"value"} eq "expand") {
-				# FIXME: need to handle table paths when a new table name is used for the expansion
-				if (!expand_table($cmd, $config{tables}, \%config)) {
-					return undef;
+				my $orig_table_id = $cmd->[1]->{"value"};
+				my $ending_table_id = $cmd->[2]->{"value"};
+				my $noroot = 0;
+				if ($#$cmd >= 3 && $cmd->[3]->{"value"} eq "noroot") {
+					$noroot = 1;
+				}
+				if (!exists $tables{$orig_table_id}) {
+					warn "expand: table \"$orig_table_id\" doesn't exist\n";
+					return;
+				} elsif (!exists $tables{$ending_table_id}) {
+					warn "expand: table \"$ending_table_id\" doesn't exist\n";
+					return
 				}
+
+				$config{"ending_tables"}->{$ending_table_id} = $tables{$ending_table_id};
+				$config{"expands"}->{$orig_table_id} = [] if !exists $config{"expands"}->{$orig_table_id};
+				push @{$config{"expands"}->{$orig_table_id}}, [$ending_table_id, $noroot];
+
+				my $new_table = expand_table($tables{$orig_table_id}, $tables{$ending_table_id}, $noroot, \%config);
+				return if !defined $new_table;
+				$tables{$orig_table_id} = $new_table;
 			} elsif ($cmd->[0]->{"value"} eq "group") {
 				if ($in_group) {
 					warn "ERROR: New group started without ending last one in config\n";
@@ -725,6 +742,7 @@ sub interpret_config {
 					"search" => NFD($cmd->[1]->{"value"}),
 					"replace" => $cmd->[2]->{"value"}};
 				for (3..$#$cmd) {
+					# add optional arguments as keys in replacement config
 					$config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
 				}
 			} elsif ($cmd->[0]->{"value"} eq "matchignore") {
@@ -742,17 +760,24 @@ sub interpret_config {
 					return undef;
 				}
 				my $table = $cmd->[1]->{"value"};
-				if (!exists($config{tables}->{$table})) {
+				if (!exists($tables{$table})) {
 					warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
 					return undef;
 				}
-				if (exists($tmp_table_paths{$table})) {
-					$config{"table_paths"}->{$tmp_table_paths{$table}} = [$#{$config{"replacements"}}, $table];
-				}
+
+				# make a list of all replacements that are affected by this
+				# file so that they can be updated when a word is added
+				# through the gui
+				my $table_path = $table_id_to_path{$table};
+				my $replacement_id = $#{$config{"replacements"}};
+				$config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path};
+				push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table];
+
 				# Note: we don't need to check if $table{"choicesep"} was defined
 				# here since we can't ever get this far without first having
 				# loaded a table anyways
-				add_to_trie($table, $config{"replacements"}->[-1]->{"words"}, $config{tables}->{$table}, $args, \%config);
+				my $trie_root = $config{"replacements"}->[-1]->{"words"};
+				add_to_trie($table, $trie_root, $tables{$table}, $args, \%config);
 			} elsif ($cmd->[0]->{"value"} eq "split") {
 				$config{"split"} = $cmd->[1]->{"value"};
 			} elsif ($cmd->[0]->{"value"} eq "beforeword") {
@@ -857,9 +882,21 @@ sub handle_unknown_word_action {
 		}
 		print($fh $word . $config->{tablesep} . $replace_word . "\n");
 		close($fh);
-		my $replacement_id = $config->{"table_paths"}->{$table_path}->[0];
-		my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
-		add_to_trie($config->{"table_paths"}->{$table_path}->[1], $trie, {$word => $replace_word}, $args, $config);
+		# loop over all table ids that are impacted by this file
+		foreach my $replacement (@{$config->{"table_paths"}->{$table_path}}) {
+			my $replacement_id = $replacement->[0];
+			my $table_id = $replacement->[1];
+			my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
+			my $final_table = {$word => $replace_word};
+			# handle expansions
+			foreach my $expand (@{$config->{"expands"}->{$table_id}}) {
+				my $ending_table_id = $expand->[0];
+				my $noroot = $expand->[1];
+				my $endings_table = $config->{"ending_tables"}->{$ending_table_id};
+				$final_table = expand_table $final_table, $endings_table, $noroot, $config;
+			}
+			add_to_trie($table_id, $trie, $final_table, $args, $config);
+		}
 		return 1;
 	} elsif ($action->[0] eq "reload") {
 		my $new_config = load_config $args;
@@ -1451,18 +1488,15 @@ pressed.
 
 The filtering for which table files are actually shown here is
 currently a bit rudimentary. First, all paths that are used in the
-B<table> statements in the config are put into a list. Then, the
-paths corresponding to any tables used for word endings in the
-B<expand> statements are removed from the list, and that is what
-is shown in this window. The reason for removing those paths from
-the list is that it gets somewhat confusing when all the tables
-that are only used for word endings are also in the list, and it
-is very unlikely that an unknown word would need to be written to
-one of those files. If necessary, the word can always be added
-manually and the config reloaded. Note also that only actual
-table paths are shown here, not the tables themselves - this is
-not necessarily a one-to-one mapping since new tables can be
-generated with the B<expand> statements.
+B<table> statements in the config are put into a list. Then, only
+the paths corresponding to table IDs actually used in B<replace>
+statements are added to the list that gets shown in the GUI. The
+reason for not showing all table paths in the list is that it gets
+somewhat confusing when all the tables that are only used for word
+endings are also in the list, and it is very unlikely that an
+unknown word would need to be written to one of those files. If
+necessary, the word can always be added manually and the config
+reloaded.
 
 =item Reload config
 
@@ -1570,6 +1604,17 @@ cannot be used as part of the replacement string in B<match> statements.
 Lookaheads and lookbehinds are fine, though, and could be useful in
 certain cases.
 
+All tables must be loaded before they are used, or there will be an
+error that the table does not exist.
+
+Warning: If a B<replace> statement is located before an B<expand>
+statement that would have impacted the table used, there will be no
+error but the expand statement won't have any impact.
+
+Basic rule of thumb: Always put the B<table> statements before the
+B<expand> statements and the B<expand> statements before the B<replace>
+statements.
+
 =over 8
 
 =item B<split> <regex string>
@@ -1643,12 +1688,21 @@ original script first and the replacement word second. The replacement word
 can optionally have several parts separated by B<choicesep>, which will cause the
 user to be prompted to choose one of the options.
 
-=item B<expand> <table identifier> <new table identifier> <word ending table> [noroot]
+If, for whatever reason, the same table is needed twice, but with different endings,
+the table can simply be loaded twice with different IDs. If the same path is loaded,
+the table that has already been loaded will be reused.
+
+=item B<expand> <table identifier> <word ending table> [noroot]
 
 Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
 the word endings in C<< <word ending table> >>, saving the result as a table with the
-identifier C<< <new table identifier> >>. If C<same> is specified as C<< <new table
-identifier> >>, the original C<< <table identifier> >> is used instead.
+identifier C<< <new table identifier> >>.
+
+Note: There used to be a C<< <new table identifier> >> argument to create a new
+table in case one table had to be expanded with different endings. This has been
+removed because it was a bit ugly, especially since there wasn't a proper mapping
+from table IDs to filenames anymore. If this functionality is needed, the same table
+file can simply be loaded multiple times. See the B<table> section above.
 
 If B<noroot> is set, the root forms of the words are not kept.
 
@@ -1660,10 +1714,6 @@ Note that each of the root words is also split into its choices (if necessary)
 during the expanding, so it is possible to use B<choicesep> in both the endings
 and root words.
 
-Note that the paths of all tables used for C<< <word ending table> >> are removed from the
-list of paths that is later shown in the unknown word window. See L</"UNKNOWN WORD
-WINDOW"> for details.
-
 =item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]
 
 Perform a RegEx match using the given C<< <regex string> >>, replacing it with
@@ -1701,6 +1751,28 @@ End a replacement group.
 
 =back
 
+=head1 BUGS
+
+Although it may not seem like it, one of the ugliest parts of the program is the
+GUI functionality that allows the user to add a replacement word. The problem is
+that all information about the B<expand> and B<replace> statements has to be kept
+in order to properly handle adding a word to one of the files and simultaneously
+adding it to the currently loaded tables I<without reloading the entire config>.
+The way it currently works, the replacement word is directly written to the file,
+then all B<expand> statements that would have impacted the words from this file
+are redone (just for the newly added word) and the resulting words are added to
+the appropriate tables (or, technically, the appropriate 'trie'). Since a file
+can be mapped to multiple table IDs and a table ID can occur in multiple replace
+statements, this is more complicated than it sounds, and thus it is very likely
+that there are bugs lurking here somewhere. Do note that "Reload config" will
+B<always> reload the entire configuration, so that's safe to do even if the
+on-the-fly replacing doesn't work.
+
+In general, I have tested the GUI code much less than the rest since you can't
+really test it automatically very well.
+
+Tell me if you find any bugs.
+
 =head1 SEE ALSO
 
 perlre, perlretut

	transliterate Transliteration engine
	git clone git://lumidify.org/transliterate.git
	Log \| Files \| Refs \| README \| LICENSE

A	tests/data/endings2.txt	\|	2	++
M	tests/data/words.txt	\|	1	+
A	tests/test4/config	\|	18	++++++++++++++++++
A	tests/test4/descr.txt	\|	1	+
A	tests/test4/err.txt	\|	3	+++
A	tests/test4/expected.txt	\|	3	+++
A	tests/test4/input.txt	\|	1	+
M	transliterate.pl	\|	196	++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------