commit d08734f2e255b3f84a1e3f62df38e404800650c9
parent ec431a30af78b0cc2f936c8064cb57acecff5414
Author: lumidify <nobody@lumidify.org>
Date: Tue, 31 Mar 2020 16:20:19 +0200
Don't split compound words before asking for unknown words
Diffstat:
2 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/tests/test5/input.txt b/tests/test5/input.txt
@@ -1,3 +1,3 @@
ignore
-wörd0 word1end1
+wörd0 wörd0 word1end1
-dword9end2 word9end2-d
diff --git a/transliterate.pl b/transliterate.pl
@@ -960,14 +960,42 @@ sub handle_unknown_word_action {
return 0;
}
+# FIXME: This only splits off "lone" split characters or those at the border to a
+# transliterated block, in order to keep compound words together for replace. The
+# cruft needs to be removed at some point.
# Split $substrings into single words based on the "split" option
# in $config.
# $substrings can already be split at this point; only the
# ones that haven't been transliterated yet are modified
sub split_words {
my ($config, $substrings) = @_;
- my @substrings_new;
+ # FIXME: is it more efficient to pre-compile with \A and \z individually?
my $split_re = qr/($config->{"split"})/;
+ my @substrings_new;
+ #FIXME: cleanup
+ foreach my $cur_substr (@$substrings) {
+ if ($cur_substr->[0] == 1) {
+ push(@substrings_new, $cur_substr);
+ next;
+ }
+ my $str = $cur_substr->[1];
+ if ($str =~ /\A$split_re/) {
+ push @substrings_new, [1, $1, $1];
+ $str = substr $str, length($1);
+ }
+ next if $str eq "";
+ if ($str =~ /$split_re\z/) {
+ $str = substr $str, 0, -length($1);
+ push @substrings_new, [0, $str, $str];
+ push @substrings_new, [1, $1, $1];
+ } else {
+ push @substrings_new, [0, $str, $str];
+ }
+ }
+ @$substrings = @substrings_new;
+=pod
+ # FIXME: this is *probably* not needed anymore
+ my @substrings_new;
foreach my $cur_substr (@$substrings) {
if ($cur_substr->[0] == 1) {
push(@substrings_new, $cur_substr);
@@ -988,6 +1016,7 @@ sub split_words {
}
}
@$substrings = @substrings_new;
+=cut
}
# small helper function to add a untransliterated string to the last substring