transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit 49357ed325d344e830564abdaacb61976e1fca00
parent d08734f2e255b3f84a1e3f62df38e404800650c9
Author: lumidify <nobody@lumidify.org>
Date:   Tue, 31 Mar 2020 16:43:21 +0200

Clean up a bit

Diffstat:
M transliterate.pl | 47 ++++++++++++++---------------------------------
1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/transliterate.pl b/transliterate.pl
@@ -960,17 +960,17 @@ sub handle_unknown_word_action {
 	return 0;
 }
 
-# FIXME: This only splits off "lone" split characters or those at the border to a
-# transliterated block, in oder to keep compound words together for replace. The
-# cruft needs to be removed at some point.
-# Split $substrings into single words based on the "split" option
-# in $config.
+# Split $substrings based on the "split" regex in $config.
+# This only marks "lone" split characters or split characters at a
+# border between transliterated and untransliterated blocks as
+# transliterated in order to keep compound words together for
+# `prompt_unknown_word`.
 # $substrings can already be split at this point; only the
 # ones that haven't been transliterated yet are modified
 sub split_words {
 	my ($config, $substrings) = @_;
-	# FIXME: is it more efficient to pre-compile with \A and \z individually?
-	my $split_re = qr/($config->{"split"})/;
+	my $split_pre = qr/\A($config->{"split"})/;
+	my $split_post = qr/($config->{"split"})\z/;
 	my @substrings_new;
 	#FIXME: cleanup
 	foreach my $cur_substr (@$substrings) {
@@ -979,12 +979,12 @@ sub split_words {
 			next;
 		}
 		my $str = $cur_substr->[1];
-		if ($str =~ /\A$split_re/) {
+		if ($str =~ /$split_pre/) {
 			push @substrings_new, [1, $1, $1];
 			$str = substr $str, length($1);
 		}
 		next if $str eq "";
-		if ($str =~ /$split_re\z/) {
+		if ($str =~ /$split_post/) {
 			$str = substr $str, 0, -length($1);
 			push @substrings_new, [0, $str, $str];
 			push @substrings_new, [1, $1, $1];
@@ -993,30 +993,6 @@ sub split_words {
 		}
 	}
 	@$substrings = @substrings_new;
-=pod
-	# FIXME: this is *probably* not needed anymore
-	my @substrings_new;
-	foreach my $cur_substr (@$substrings) {
-		if ($cur_substr->[0] == 1) {
-			push(@substrings_new, $cur_substr);
-			next;
-		}
-
-		my @words = split(/$split_re/, $cur_substr->[1]);
-		for my $i (0..$#words) {
-			# Word is not delimiter
-			# Split produces an empty field at the beginning if the string
-			# starts with the delimiter
-			if ($i % 2 == 0) {
-				push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
-			} else {
-				# Delimiters can count as already replaced
-				push(@substrings_new, [1, $words[$i], $words[$i]]);
-			}
-		}
-	}
-	@$substrings = @substrings_new;
-=cut
 }
 
 # small helper function to add a untransliterated string to the last substring
@@ -1239,6 +1215,11 @@ sub replace_line {
 # added and just that word is selected to ignore, you never get a chance to add a
 # replacement for the other word that it is attached to
 
+# NOTE: This is very ugly code. The GUI code is the worst, but this whole part
+# of the program is nasty. This is partially due to the fact that features kept
+# being added when their use was discovered. This problem might be fixed in the
+# future when I have time to rewrite all of this.
+
 # Handle unknown words
 # $substrings - the current substrings with unknown words
 # $config - the program config