commit de3bbf6acdd3d5f181717ef8b57d752d07040cb8
parent 6d2c4a738414c64f2e36968a96b2419eeded2441
Author: lumidify <nobody@lumidify.org>
Date: Wed, 1 Apr 2020 15:39:01 +0200
Revert changes done to split_words
Diffstat:
2 files changed, 34 insertions(+), 45 deletions(-)
diff --git a/tests/test1/err.txt b/tests/test1/err.txt
@@ -1,4 +1,5 @@
-Unknown word: "word20 word01231"
+Unknown word: "word20"
+Unknown word: "word01231"
Word "word0_replaced$word0_replaced2" with 2 word choices.
Unknown word: "aword1"
Unknown word: "end3"
diff --git a/transliterate.pl b/transliterate.pl
@@ -943,35 +943,29 @@ sub handle_unknown_word_action {
}
# Split $substrings based on the "split" regex in $config.
-# This only marks "lone" split characters or split characters at a
-# border between transliterated and untransliterated blocks as
-# transliterated in order to keep compound words together for
-# `prompt_unknown_word`.
# $substrings can already be split at this point; only the
# ones that haven't been transliterated yet are modified
sub split_words {
my ($config, $substrings) = @_;
- my $split_pre = qr/\A($config->{"split"})/;
- my $split_post = qr/($config->{"split"})\z/;
+ my $split_re = qr/($config->{"split"})/;
my @substrings_new;
- #FIXME: cleanup
foreach my $cur_substr (@$substrings) {
if ($cur_substr->[0] == 1) {
push(@substrings_new, $cur_substr);
next;
}
- my $str = $cur_substr->[1];
- if ($str =~ /$split_pre/) {
- push @substrings_new, [1, $1, $1];
- $str = substr $str, length($1);
- }
- next if $str eq "";
- if ($str =~ /$split_post/) {
- $str = substr $str, 0, -length($1);
- push @substrings_new, [0, $str, $str];
- push @substrings_new, [1, $1, $1];
- } else {
- push @substrings_new, [0, $str, $str];
+
+ my @words = split(/$split_re/, $cur_substr->[1]);
+ for my $i (0..$#words) {
+ # Even indices are words, odd indices are the captured delimiters
+ # (assuming the "split" pattern has no capture groups of its own).
+ # Split yields an empty word if the string starts with a delimiter.
+ if ($i % 2 == 0) {
+ push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
+ } else {
+ # Delimiters can count as already replaced
+ push(@substrings_new, [1, $words[$i], $words[$i]]);
+ }
}
}
@$substrings = @substrings_new;
@@ -1634,22 +1628,21 @@ statement on the text "c word1", there will still only be one chunk,
properly.
Once all the replacement statements have been processed, each chunk
-of text that is not marked as transliterated yet is "trimmed" based on
-the B<split> pattern specified in the config. This means that all
-"lone" split characters are marked as transliterated and any other
-untransliterated chunks have leading or trailing split characters
-marked as transliterated. At this point, only chunks of actual text that
-have not been transliterated are still marked as untransliterated.
-These are now processed by the L<unknown word window|/"UNKNOWN WORD WINDOW">.
-If one of these remaining unknown chunks is present in the file
-specified by the B<ignore> statement in the config, it is simply ignored
-and later printed out as is. After all untransliterated words have either
-had a replacement added or been ignored, any words with multiple replacement
-choices are processed by the word choice window. Once this is all done,
-the final output is written to the output file and the process is
-repeated with the next line. Note that the entire process is started
-again each time a word is added to a table or the config is reloaded
-from the L<unknown word window|/"UNKNOWN WORD WINDOW">.
+of text that is not marked as transliterated yet is split based on
+the B<split> pattern specified in the config and all actual characters
+matched by the B<split> pattern are marked as transliterated (this
+usually means all the spaces, newlines, quotation marks, etc.). Any
+remaining words/text chunks that are still marked as untransliterated are
+now processed by the unknown word window. If one of these remaining
+unknown chunks is present in the file specified by the B<ignore>
+statement in the config, it is simply ignored and later printed out
+as is. After all untransliterated words have either had a replacement
+added or been ignored, any words with multiple replacement choices are
+processed by the word choice window. Once this is all done, the final
+output is written to the output file and the process is repeated with
+the next line. Note that the entire process is started again each time
+a word is added to a table or the config is reloaded from the
+L<unknown word window|/"UNKNOWN WORD WINDOW">.
=head1 CONFIGURATION
@@ -1702,15 +1695,10 @@ otherwise all of the newlines will be marked as unknown words. Usually,
this will be included anyways through C<\s>.
Note also that B<split> should probably include the C<+> RegEx-quantifier
-since that allows the splitting function in the end to also mark several
-splitting characters in a row as transliterated.
-
-This is named a bit confusingly since it was originally used to split
-the string completely based on the given pattern in the end. This was
-changed later, so a better name now would be "trim", but it's already
-called this way, so I don't feel like changing it. See the last
-paragraph of L</"INTERNALS/EXAMPLES"> for a short description of how
-the trimming works.
+since that allows the splitting function in the end to mark several
+splitting characters right after each other (e.g. several spaces) as
+transliterated in one go instead of splitting the string again for every
+single one of them. This shouldn't change the final output, though.
B<Default:> C<\s+> (all whitespace)