Clean up a bit - transliterate - Transliteration engine

commit 49357ed325d344e830564abdaacb61976e1fca00
parent d08734f2e255b3f84a1e3f62df38e404800650c9
Author: lumidify <nobody@lumidify.org>
Date:   Tue, 31 Mar 2020 16:43:21 +0200

Clean up a bit

Diffstat:
M transliterate.pl  | 47 ++++++++++++++---------------------------------

1 file changed, 14 insertions(+), 33 deletions(-)
diff --git a/transliterate.pl b/transliterate.pl
@@ -960,17 +960,17 @@ sub handle_unknown_word_action {
 	return 0;
 }
 
-# FIXME: This only splits off "lone" split characters or those at the border to a
-# transliterated block, in oder to keep compound words together for replace. The
-# cruft needs to be removed at some point.
-# Split $substrings into single words based on the "split" option
-# in $config.
+# Split $substrings based on the "split" regex in $config.
+# This only marks "lone" split characters or split characters at a
+# border between transliterated and untransliterated blocks as
+# transliterated in order to keep compound words together for 
+# `prompt_unknown_word`.
 # $substrings can already be split at this point; only the
 # ones that haven't been transliterated yet are modified
 sub split_words {
 	my ($config, $substrings) = @_;
-	# FIXME: is it more efficient to pre-compile with \A and \z individually?
-	my $split_re = qr/($config->{"split"})/;
+	my $split_pre = qr/\A($config->{"split"})/;
+	my $split_post = qr/($config->{"split"})\z/;
 	my @substrings_new;
 	#FIXME: cleanup
 	foreach my $cur_substr (@$substrings) {
@@ -979,12 +979,12 @@ sub split_words {
 			next;
 		}
 		my $str = $cur_substr->[1];
-		if ($str =~ /\A$split_re/) {
+		if ($str =~ /$split_pre/) {
 			push @substrings_new, [1, $1, $1];
 			$str = substr $str, length($1);
 		}
 		next if $str eq "";
-		if ($str =~ /$split_re\z/) {
+		if ($str =~ /$split_post/) {
 			$str = substr $str, 0, -length($1);
 			push @substrings_new, [0, $str, $str];
 			push @substrings_new, [1, $1, $1];
@@ -993,30 +993,6 @@ sub split_words {
 		}
 	}
 	@$substrings = @substrings_new;
-=pod
-	# FIXME: this is *probably* not needed anymore
-	my @substrings_new;
-	foreach my $cur_substr (@$substrings) {
-		if ($cur_substr->[0] == 1) {
-			push(@substrings_new, $cur_substr);
-			next;
-		}
-
-		my @words = split(/$split_re/, $cur_substr->[1]);
-		for my $i (0..$#words) {
-			# Word is not delimiter
-			# Split produces an empty field at the beginning if the string
-			# starts with the delimiter
-			if ($i % 2 == 0) {
-				push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
-			} else {
-				# Delimiters can count as already replaced
-				push(@substrings_new, [1, $words[$i], $words[$i]]);
-			}
-		}
-	}
-	@$substrings = @substrings_new;
-=cut
 }
 
 # small helper function to add a untransliterated string to the last substring
@@ -1239,6 +1215,11 @@ sub replace_line {
 # added and just that word is selected to ignore, you never get a chance to add a
 # replacement for the other word that it is attached to
 
+# NOTE: This is very ugly code. The GUI code is the worst, but this whole part
+# of the program is nasty. This is partially due to the fact that features kept
+# being added when their use was discovered. This problem might be fixed in the
+# future when I have time to rewrite all of this.
+
 # Handle unknown words
 # $substrings - the current substrings with unknown words
 # $config - the program config

	transliterate Transliteration engine
	git clone git://lumidify.org/transliterate.git
	Log | Files | Refs | README | LICENSE