transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit d08734f2e255b3f84a1e3f62df38e404800650c9
parent ec431a30af78b0cc2f936c8064cb57acecff5414
Author: lumidify <nobody@lumidify.org>
Date:   Tue, 31 Mar 2020 16:20:19 +0200

Don't split compound words before asking for unknown words

Diffstat:
M tests/test5/input.txt | 2 +-
M transliterate.pl | 31 ++++++++++++++++++++++++++++++-
2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/tests/test5/input.txt b/tests/test5/input.txt
@@ -1,3 +1,3 @@
 ignore
-wörd0 word1end1
+wörd0 wörd0 word1end1
 -dword9end2 word9end2-d
diff --git a/transliterate.pl b/transliterate.pl
@@ -960,14 +960,42 @@ sub handle_unknown_word_action {
 	return 0;
 }
 
+# FIXME: This only splits off "lone" split characters or those at the border to a
+# transliterated block, in oder to keep compound words together for replace. The
+# cruft needs to be removed at some point.
 # Split $substrings into single words based on the "split" option
 # in $config.
 # $substrings can already be split at this point; only the
 # ones that haven't been transliterated yet are modified
 sub split_words {
 	my ($config, $substrings) = @_;
-	my @substrings_new;
+	# FIXME: is it more efficient to pre-compile with \A and \z individually?
 	my $split_re = qr/($config->{"split"})/;
+	my @substrings_new;
+	#FIXME: cleanup
+	foreach my $cur_substr (@$substrings) {
+		if ($cur_substr->[0] == 1) {
+			push(@substrings_new, $cur_substr);
+			next;
+		}
+		my $str = $cur_substr->[1];
+		if ($str =~ /\A$split_re/) {
+			push @substrings_new, [1, $1, $1];
+			$str = substr $str, length($1);
+		}
+		next if $str eq "";
+		if ($str =~ /$split_re\z/) {
+			$str = substr $str, 0, -length($1);
+			push @substrings_new, [0, $str, $str];
+			push @substrings_new, [1, $1, $1];
+		} else {
+			push @substrings_new, [0, $str, $str];
+		}
+	}
+	@$substrings = @substrings_new;
+=pod
+	# FIXME: this is *probably* not needed anymore
+	my @substrings_new;
 	foreach my $cur_substr (@$substrings) {
 		if ($cur_substr->[0] == 1) {
 			push(@substrings_new, $cur_substr);
@@ -988,6 +1016,7 @@ sub split_words {
 		}
 	}
 	@$substrings = @substrings_new;
+=cut
 }
 
 # small helper function to add a untransliterated string to the last substring