Revert changes done to split_words - transliterate

commit de3bbf6acdd3d5f181717ef8b57d752d07040cb8
parent 6d2c4a738414c64f2e36968a96b2419eeded2441
Author: lumidify <nobody@lumidify.org>
Date:   Wed,  1 Apr 2020 15:39:01 +0200

Revert changes done to split_words

Diffstat:
M tests/test1/err.txt  | 3 ++-
M transliterate.pl  | 76 ++++++++++++++++++++++++++++++++--------------------------------------------

2 files changed, 34 insertions(+), 45 deletions(-)
diff --git a/tests/test1/err.txt b/tests/test1/err.txt
@@ -1,4 +1,5 @@
-Unknown word: "word20 word01231"
+Unknown word: "word20"
+Unknown word: "word01231"
 Word "word0_replaced$word0_replaced2" with 2 word choices.
 Unknown word: "aword1"
 Unknown word: "end3"
diff --git a/transliterate.pl b/transliterate.pl
@@ -943,35 +943,29 @@ sub handle_unknown_word_action {
 }
 
 # Split $substrings based on the "split" regex in $config.
-# This only marks "lone" split characters or split characters at a
-# border between transliterated and untransliterated blocks as
-# transliterated in order to keep compound words together for 
-# `prompt_unknown_word`.
 # $substrings can already be split at this point; only the
 # ones that haven't been transliterated yet are modified
 sub split_words {
 	my ($config, $substrings) = @_;
-	my $split_pre = qr/\A($config->{"split"})/;
-	my $split_post = qr/($config->{"split"})\z/;
+	my $split_re = qr/($config->{"split"})/;
 	my @substrings_new;
-	#FIXME: cleanup
 	foreach my $cur_substr (@$substrings) {
 		if ($cur_substr->[0] == 1) {
 			push(@substrings_new, $cur_substr);
 			next;
 		}
-		my $str = $cur_substr->[1];
-		if ($str =~ /$split_pre/) {
-			push @substrings_new, [1, $1, $1];
-			$str = substr $str, length($1);
-		}
-		next if $str eq "";
-		if ($str =~ /$split_post/) {
-			$str = substr $str, 0, -length($1);
-			push @substrings_new, [0, $str, $str];
-			push @substrings_new, [1, $1, $1];
-		} else {
-			push @substrings_new, [0, $str, $str];
+
+		my @words = split(/$split_re/, $cur_substr->[1]);
+		for my $i (0..$#words) {
+			# Word is not delimiter
+			# Split produces an empty field at the beginning if the string
+			# starts with the delimiter
+			if ($i % 2 == 0) {
+				push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
+			} else {
+				# Delimiters can count as already replaced
+				push(@substrings_new, [1, $words[$i], $words[$i]]);
+			}
 		}
 	}
 	@$substrings = @substrings_new;
@@ -1634,22 +1628,21 @@ statement on the text "c word1", there will still only be one chunk,
 properly.
 
 Once all the replacement statements have been processed, each chunk
-of text that is not marked as transliterated yet is "trimmed" based on
-the B<split> pattern specified in the config. This means that all
-"lone" split characters are marked as transliterated and any other
-untransliterated chunks have leading or trailing split characters
-marked as transliterated. At this point, only chunks of actual text that
-have not been transliterated are still marked as untransliterated.
-These are now processed by the L<unknown word window|/"UNKNOWN WORD WINDOW">.
-If one of these remaining unknown chunks is present in the file
-specified by the B<ignore> statement in the config, it is simply ignored
-and later printed out as is. After all untransliterated words have either
-had a replacement added or been ignored, any words with multiple replacement
-choices are processed by the word choice window. Once this is all done,
-the final output is written to the output file and the process is
-repeated with the next line. Note that the entire process is started
-again each time a word is added to a table or the config is reloaded
-from the L<unknown word window|/"UNKNOWN WORD WINDOW">.
+of text that is not marked as transliterated yet is split based on
+the B<split> pattern specified in the config and all actual characters
+matched by the B<split> pattern are marked as transliterated (this
+usually means all the spaces, newlines, quotation marks, etc.). Any
+remaining words/text chunks that are still marked as untransliterated are
+now processed by the unknown word window. If one of these remaining
+unknown chunks is present in the file specified by the B<ignore>
+statement in the config, it is simply ignored and later printed out
+as is. After all untransliterated words have either had a replacement
+added or been ignored, any words with multiple replacement choices are
+processed by the word choice window. Once this is all done, the final
+output is written to the output file and the process is repeated with
+the next line. Note that the entire process is started again each time
+a word is added to a table or the config is reloaded from the
+L<unknown word window|/"UNKNOWN WORD WINDOW">.
 
 =head1 CONFIGURATION
 
@@ -1702,15 +1695,10 @@ otherwise all of the newlines will be marked as unknown words. Usually,
 this will be included anyways through C<\s>.
 
 Note also that B<split> should probably include the C<+> RegEx-quantifier
-since that allows the splitting function in the end to also mark several
-splitting characters in a row as transliterated.
-
-This is named a bit confusingly since it was originally used to split
-the string completely based on the given pattern in the end. This was
-changed later, so a better name now would be "trim", but it's already
-called this way, so I don't feel like changing it. See the last
-paragraph of L</"INTERNALS/EXAMPLES"> for a short description of how
-the trimming works.
+since that allows the splitting function in the end to skip a run of
+consecutive splitting characters (e.g. several spaces) in one go instead
+of splitting the string again for every single one of them. This
+shouldn't actually make any difference functionality-wise, though.
 
 B<Default:> C<\s+> (all whitespace)

	transliterate Transliteration engine
	git clone git://lumidify.org/transliterate.git
	Log | Files | Refs | README | LICENSE

M	tests/test1/err.txt	|	3	++-
M	transliterate.pl	|	76	++++++++++++++++++++++++++++++++--------------------------------------------