transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git (fast, but not encrypted)
git clone https://lumidify.org/transliterate.git (encrypted, but very slow)
git clone git://4kcetb7mo7hj6grozzybxtotsub5bempzo4lirzc3437amof2c2impyd.onion/transliterate.git (over tor)
Log | Files | Refs | README | LICENSE

transliterate.pl (83637B)


      1 #!/usr/bin/env perl
      2 
      3 # Proudly written using vi (OpenBSD nvi)
      4 
      5 # NOTE: If you're wondering why the error codes used by the functions are so
      6 # inconsistent, go ask my former self
      7 
      8 # NOTE 2: This codebase has grown as new features were needed, so it's quite
      9 # ugly now, but I haven't had time to clean it up.
     10 
     11 use strict;
     12 use warnings;
     13 use utf8;
     14 use feature 'unicode_strings';
     15 use open qw< :encoding(UTF-8) >;
     16 binmode STDIN, ":encoding(UTF-8)";
     17 binmode STDOUT, ":encoding(UTF-8)";
     18 binmode STDERR, ":encoding(UTF-8)";
     19 use Unicode::Normalize;
     20 use Glib qw/TRUE FALSE/;
     21 use Gtk3 '-init';
     22 use Getopt::Long;
     23 use Pod::Usage;
     24 use Scalar::Util qw(weaken);
     25 use File::Basename qw(dirname);
     26 use File::Spec::Functions qw(rel2abs file_name_is_absolute);
     27 
     28 # takes a string of words separated by '$config->{choicesep}' and returns a new string in the
     29 # same format with duplicates removed
     30 sub get_unique_words {
     31 	my ($word, $config) = @_;
     32 	my %tmp;
     33 	my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word;
     34 	return join $config->{choicesep}, @words_uniq;
     35 }
     36 
     37 # Adds all words in $words to $trie
     38 # Automatically combines duplicate words with "$config->{choicesep}" inbetween
     39 sub add_to_trie {
     40 	my ($table_name, $trie, $words, $args, $config, $override) = @_;
     41 	foreach my $word (keys %$words) {
     42 		my $cur_node = $trie;
     43 		foreach my $char (split //, $word) {
     44 			if (!exists($cur_node->{$char})) {
     45 				$cur_node->{$char}->{"parent"} = $cur_node;
     46 				# This is required to avoid circular references
     47 				# (otherwise, the garbage collector doesn't ever
     48 				# destroy these nodes, leading to the memory
     49 				# consumption growing without restraint if
     50 				# "Reload config" is used)
     51 				weaken($cur_node->{$char}->{"parent"});
     52 			}
     53 			$cur_node = $cur_node->{$char};
     54 		}
     55 		if (exists($cur_node->{"final"})) {
     56 			if ($override) {
     57 				$cur_node->{"final"} = $words->{$word};
     58 				next;
     59 			}
     60 			if ($args->{"checkduplicates"}) {
     61 				warn "WARNING: Duplicate word \"$word\". Last occurrence as " .
     62 					"\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " .
     63 					"current occurrence as \"$words->{$word}\" in " .
     64 					"table \"$table_name\.\n";
     65 			}
     66 			$cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config);
     67 		} else {
     68 			$cur_node->{"final"} = $words->{$word};
     69 			if ($args->{"checkduplicates"}) {
     70 				$cur_node->{"table_name"} = $table_name;
     71 			}
     72 		}
     73 	}
     74 }
     75 
     76 # Prompt user when no replacement has been found for a word
     77 # $word is the word that was not found and $context* the context,
     78 # $word_repl is the replacement word - this is only used when the window is
     79 #            called from the word choice window, since the original and
     80 #            replacement aren't the same then
     81 # with context*_orig being the original, non-transliterated context.
     82 # $table_paths is a mapping of table paths (here, only the keys, i.e.
     83 # the actual paths, are used) to allow the user to choose a table to
     84 # save a new replacement to.
     85 # $cur_lineno is a display string to show the current line number
     86 # $config_error is an optional flag to specify whether an error
     87 # message should be displayed, informing the user that the config
     88 # could not be loaded (used when "Reload config" is clicked)
     89 # Returns: an array reference containing an action to be taken,
     90 # in the form ["<action name>", <optional args>].
     91 # See `handle_unknown_word_action` for currently accepted values
     92 sub prompt_unknown_word {
     93 	# yes, this function really should take fewer arguments...
     94 	# it would be better to just pass the substrings and an index
     95 	my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig,
     96 	    $config, $cur_lineno, $args, $config_error) = @_;
     97 	my $action;
     98 	my $stop = 0;
     99 
    100 	my $window = Gtk3::Window->new('toplevel');
    101 	$window->signal_connect(delete_event => sub {return FALSE});
    102 	$window->signal_connect(destroy => sub { Gtk3->main_quit; });
    103 	$window->set_border_width(10);
    104 
    105 	my $vbox = Gtk3::VBox->new(FALSE, 10);
    106 
    107 	my $linelabel = Gtk3::Label->new("Current line: $cur_lineno");
    108 	$vbox->pack_start($linelabel, FALSE, FALSE, 0);
    109 	$linelabel->show;
    110 
    111 	my $wordlabel = Gtk3::Label->new("Word not found: $word");
    112 	$wordlabel->set_alignment(0.0, 0.0);
    113 	$vbox->pack_start($wordlabel, FALSE, FALSE, 0);
    114 	$wordlabel->show;
    115 
    116 	# Make a text box with the given left and right context and label
    117 	# Also creates a button allowing the user to set the currently
    118 	# selected text as the word to be replaced - useful when only part
    119 	# of the entire word that was not found has to be replaced
    120 	my $make_context_box = sub {
    121 		# look, don't blame me for these miserably named variables...
    122 		my ($ctxtl, $wrd, $ctxtr, $lbl, $btn_lbl) = @_;
    123 		my $hbox = Gtk3::HBox->new(FALSE, 5);
    124 		my $label = Gtk3::Label->new_with_mnemonic($lbl);
    125 		my $text = Gtk3::TextView->new;
    126 		$label->set_mnemonic_widget($text);
    127 		$text->set_wrap_mode("word");
    128 		my $buffer = $text->get_buffer();
    129 		$buffer->set_text($ctxtr);
    130 		my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
    131 		my $start_iter = $buffer->get_start_iter();
    132 		$buffer->insert_with_tags($start_iter, $wrd, $highlight);
    133 		$start_iter = $buffer->get_start_iter();
    134 		$buffer->insert($start_iter, $ctxtl);
    135 		# set cursor position to beginning of word
    136 		my $cur_iter = $buffer->get_iter_at_offset(length($ctxtl));
    137 		$buffer->place_cursor($cur_iter);
    138 		my $button = Gtk3::Button->new($btn_lbl);
    139 		$button->signal_connect(
    140 			clicked => sub {
    141 				if (my ($start, $end) = $buffer->get_selection_bounds()) {
    142 					$word = $buffer->get_text($start, $end, FALSE);
    143 					$wordlabel->set_text("Selected: $word");
    144 				}
    145 			}, $window);
    146 		$hbox->pack_start($label, FALSE, FALSE, 0);
    147 		$hbox->pack_start($text, TRUE, TRUE, 0);
    148 		$vbox->pack_start($hbox, FALSE, FALSE, 0);
    149 		$hbox = Gtk3::HBox->new(FALSE, 5);
    150 		$hbox->pack_start($button, FALSE, FALSE, 0);
    151 		my $complete_text = $ctxtl . $wrd . $ctxtr;
    152 		$button = Gtk3::Button->new("Reset text");
    153 		$button->signal_connect(
    154 			clicked => sub {
    155 				$buffer->set_text($complete_text);
    156 			}, $window);
    157 		$hbox->pack_start($button, FALSE, FALSE, 0);
    158 		$vbox->pack_start($hbox, FALSE, FALSE, 0);
    159 	};
    160 	$make_context_box->(
    161 	    $contextl_orig, $word, $contextr_orig,
    162 	    "Ori_ginal context: ",
    163 	    "Use _original selection as word"
    164 	);
    165 	$make_context_box->(
    166 	    $contextl, $word_repl, $contextr,
    167 	    "Transliterated _context: ",
    168 	    "Use _transliterated selection as word"
    169 	);
    170 
    171 	my $hbox = Gtk3::HBox->new(FALSE, 5);
    172 	my $label = Gtk3::Label->new("Ignore: ");
    173 	$hbox->pack_start($label, FALSE, FALSE, 0);
    174 	my $button = Gtk3::Button->new("Th_is run");
    175 	$button->signal_connect(
    176 		clicked => sub {
    177 			$action = ["ignore", "run", $word];
    178 			$window->destroy;
    179 		}, $window);
    180 	$hbox->pack_start($button, FALSE, FALSE, 0);
    181 	$button = Gtk3::Button->new("_Permanently");
    182 	$button->signal_connect(
    183 		clicked => sub {
    184 			$action = ["ignore", "permanent", $word];
    185 			$window->destroy;
    186 		}, $window);
    187 	$hbox->pack_start($button, FALSE, FALSE, 0);
    188 	$button = Gtk3::Button->new("_Whole line");
    189 	$button->signal_connect(
    190 		clicked => sub {
    191 			$action = ["ignore", "wholeline"];
    192 			$window->destroy;
    193 		}, $window);
    194 	$hbox->pack_start($button, FALSE, FALSE, 0);
    195 	$vbox->pack_start($hbox, FALSE, FALSE, 0);
    196 
    197 	# AHHHH! IT BURNS!!! THE CODE IS SO HORRIBLE!
    198 	# Take note, kids - this is what happens when you keep adding
    199 	# features without rethinking your basic design.
    200 
    201 	$hbox = Gtk3::HBox->new(FALSE, 5);
    202 	$label = Gtk3::Label->new_with_mnemonic("Add to _list: ");
    203 	$hbox->pack_start($label, FALSE, FALSE, 0);
    204 	my $path_list = Gtk3::ComboBoxText->new;
    205 	$label->set_mnemonic_widget($path_list);
    206 	foreach my $path (sort keys %{$config->{"display_tables"}}) {
    207 		$path_list->append_text($path);
    208 	}
    209 	$hbox->pack_start($path_list, FALSE, FALSE, 0);
    210 	$vbox->pack_start($hbox, FALSE, FALSE, 0);
    211 
    212 	$hbox = Gtk3::HBox->new(FALSE, 5);
    213 	$label = Gtk3::Label->new_with_mnemonic("_Replacement: ");
    214 	$hbox->pack_start($label, FALSE, FALSE, 0);
    215 	my $replace_entry = Gtk3::Entry->new;
    216 	$label->set_mnemonic_widget($replace_entry);
    217 	$hbox->pack_start($replace_entry, TRUE, TRUE, 0);
    218 	$vbox->pack_start($hbox, FALSE, FALSE, 0);
    219 
    220 	if (exists $config->{"retrywithout"}) {
    221 		$hbox = Gtk3::HBox->new(FALSE, 5);
    222 		# Pressing Alt+e just activates the first of the retrywithout buttons
    223 		$label = Gtk3::Label->new_with_mnemonic("Retry without: ");
    224 		$hbox->pack_start($label, FALSE, FALSE, 0);
    225 		foreach my $without (@{$config->{"retrywithout"}}) {
    226 			$button = Gtk3::Button->new("$without->[0]");
    227 			$button->signal_connect(
    228 				clicked => sub {
    229 					my @chars = @{$without}[1..$#$without];
    230 					my $stripped = replace_strip_chars($config, $args, \@chars, $word);
    231 					# recombine substrings
    232 					my $repl_text = "";
    233 					$repl_text .= $_->[1] foreach @$stripped;
    234 					$replace_entry->set_text($repl_text);
    235 				}, $window);
    236 			$hbox->pack_start($button, FALSE, FALSE, 0);
    237 		}
    238 		$vbox->pack_start($hbox, FALSE, FALSE, 0);
    239 	}
    240 
    241 	$hbox = Gtk3::HBox->new(FALSE, 0);
    242 	$button = Gtk3::Button->new("_Add replacement");
    243 	$button->signal_connect(
    244 		clicked => sub {
    245 			if ($path_list->get_active != -1) {
    246 				$action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text];
    247 				$window->destroy;
    248 			}
    249 		}, $window);
    250 	$hbox->pack_start($button, FALSE, FALSE, 0);
    251 	$vbox->pack_start($hbox, FALSE, FALSE, 0);
    252 
    253 	$hbox = Gtk3::HBox->new(FALSE, 5);
    254 	$button = Gtk3::Button->new("_Stop processing");
    255 	$button->signal_connect(
    256 		clicked => sub {
    257 			$stop = 1;
    258 			$window->destroy;
    259 		}, $window);
    260 	$hbox->pack_start($button, FALSE, FALSE, 0);
    261 
    262 	$button = Gtk3::Button->new("Reload config");
    263 	$button->signal_connect(
    264 		clicked => sub {
    265 			$action = ["reload"];
    266 			$window->destroy;
    267 		}, $window);
    268 	$hbox->pack_start($button, FALSE, FALSE, 0);
    269 
    270 	if ($config_error) {
    271 		$label = Gtk3::Label->new("Error loading config; see terminal output for details");
    272 		$hbox->pack_start($label, FALSE, FALSE, 0);
    273 	}
    274 	$vbox->pack_start($hbox, FALSE, FALSE, 0);
    275 
    276 	$window->add($vbox);
    277 	$window->show_all;
    278 	Gtk3->main;
    279 
    280 	die "Processing stopped at line $cur_lineno\n" if $stop;
    281 
    282 	if (!$action) {
    283 		# This action isn't explicitly handled, but at least nothing
    284 		# breaks when the window is closed without selecting an action
    285 		$action = ["dummy"];
    286 	}
    287 	return $action;
    288 }
    289 
    290 # Prompt the user when a word has multiple replacement options (separated by $config->{choicesep})
    291 # $cur_lineno - display string to show the current line number
    292 # Returns:
    293 # 3, if this window needs to be called again but nothing needs
    294 #    to be re-transliterated
    295 # 1, if the line needs to be re-transliterated
    296 # 0, if nothing needs to be done
    297 sub prompt_choose_word {
    298 	my ($substrings, $config, $args, $cur_lineno) = @_;
    299 
    300 	# make a list of all substrings that contain multiple word options
    301 	my @replacements;
    302 	foreach (0..$#$substrings) {
    303 		if ($substrings->[$_]->[0] && $substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
    304 			# This ugly bit of code is here as a special case for transliterating
    305 			# Hindi to Urdu text - if there are *exactly* two choices and one
    306 			# contains diacritics but the other one doesn't, the one with diacritics
    307 			# is automatically used instead of prompting the user.
    308 			if (exists $config->{"targetdiacritics"}) {
    309 				my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1];
    310 				my @diacritics = @{$config->{"targetdiacritics"}};
    311 				if (@choices == 2) {
    312 					@choices = map {NFD($_)} @choices;
    313 					my $first_matches = grep {$choices[0] =~ /$_/} @diacritics;
    314 					my $second_matches = grep {$choices[1] =~ /$_/} @diacritics;
    315 					if ($first_matches && !$second_matches) {
    316 						$substrings->[$_]->[1] = $choices[0];
    317 						next;
    318 					} elsif (!$first_matches && $second_matches) {
    319 						$substrings->[$_]->[1] = $choices[1];
    320 						next;
    321 					}
    322 				}
    323 			}
    324 			# Format of the elements in @replacements:
    325 			# [<id of substrings in $substrings>, <replacement word>, <original string>]
    326 			push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]];
    327 		}
    328 	}
    329 	# no substrings have multiple options
    330 	return if (!@replacements);
    331 
    332 	my $stop = 0;
    333 	my $open_unknown = 0;
    334 	my $cur_replacement = 0;
    335 
    336 	my $window = Gtk3::Window->new('toplevel');
    337 	$window->signal_connect(delete_event => sub {return FALSE});
    338 	$window->signal_connect(destroy => sub { Gtk3->main_quit; });
    339 	$window->set_border_width(10);
    340 
    341 	my $vbox = Gtk3::VBox->new(FALSE, 0);
    342 
    343 	my $linelabel = Gtk3::Label->new("Current line: $cur_lineno");
    344 	$vbox->pack_start($linelabel, FALSE, FALSE, 0);
    345 
    346 	my $wordlabel = Gtk3::Label->new("");
    347 	$wordlabel->set_alignment(0.0, 0.0);
    348 	$vbox->pack_start($wordlabel, FALSE, FALSE, 0);
    349 
    350 	my $undo = Gtk3::Button->new("_Undo");
    351 	$vbox->pack_start($undo, FALSE, FALSE, 0);
    352 	$undo->set_sensitive(FALSE);
    353 
    354 	my $button_vbox = Gtk3::VBox->new(FALSE, 0);
    355 	$vbox->pack_start($button_vbox, FALSE, FALSE, 0);
    356 
    357 	my $accept = Gtk3::Button->new("_Accept changes?");
    358 	$vbox->pack_start($accept, FALSE, FALSE, 0);
    359 
    360 	my $hbox = Gtk3::HBox->new(FALSE, 0);
    361 	my $label = Gtk3::Label->new("Context: ");
    362 	my $text = Gtk3::TextView->new;
    363 	$text->set_wrap_mode("word");
    364 	my $buffer = $text->get_buffer();
    365 	my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
    366 	$text->set_editable(FALSE);
    367 	$hbox->pack_start($label, FALSE, FALSE, 0);
    368 	$hbox->pack_start($text, TRUE, TRUE, 10);
    369 	$vbox->pack_start($hbox, FALSE, FALSE, 10);
    370 
    371 	$hbox = Gtk3::HBox->new(FALSE, 5);
    372 	my $skip_button = Gtk3::Button->new("Sk_ip word");
    373 	$hbox->pack_start($skip_button, FALSE, FALSE, 0);
    374 	my $unknown_button = Gtk3::Button->new("_Open in unknown word window");
    375 	$hbox->pack_start($unknown_button, FALSE, FALSE, 0);
    376 	my $stop_button = Gtk3::Button->new("_Stop processing");
    377 	$hbox->pack_start($stop_button, FALSE, FALSE, 0);
    378 	$vbox->pack_start($hbox, FALSE, FALSE, 0);
    379 
    380 	# generate the context to the left and to the right of the current word being replaced
    381 	my $get_context = sub {
    382 		my ($contextl, $contextr) = ("", "");
    383 		my $tmp_replacement = 0;
    384 		foreach (0..$#$substrings) {
    385 			my $word = $substrings->[$_]->[1];
    386 			if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) {
    387 				$word = $replacements[$tmp_replacement]->[1];
    388 				$tmp_replacement++;
    389 			}
    390 			# When nothing is left to replace, the entire string is in $contextl
    391 			if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) {
    392 				$contextl .= $word;
    393 			} elsif ($_ > $replacements[$cur_replacement]->[0]) {
    394 				$contextr .= $word;
    395 			}
    396 		}
    397 		return ($contextl, $contextr);
    398 	};
    399 
    400 	# fill the text buffer with the context and current word, highlighting the word
    401 	# if $cur_replacement is after the end of @replacements, don't highlight anything
    402 	# (this happens when all words have been replaced and the user only needs to accept the changes)
    403 	my $fill_text_buffer = sub {
    404 		my $start = $buffer->get_start_iter();
    405 		my $end = $buffer->get_end_iter();
    406 		$buffer->delete($start, $end);
    407 		my ($contextl, $contextr) = $get_context->();
    408 		$buffer->set_text($contextr);
    409 		if ($cur_replacement <= $#replacements) {
    410 			$start = $buffer->get_start_iter();
    411 			$buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight);
    412 		}
    413 		$start = $buffer->get_start_iter();
    414 		$buffer->insert($start, $contextl);
    415 	};
    416 
    417 	my $show_accept = sub {
    418 		$button_vbox->foreach(sub {my $child = shift; $child->destroy();});
    419 		$accept->show;
    420 		$accept->grab_focus;
    421 		$wordlabel->set_text("");
    422 		$skip_button->set_sensitive(FALSE);
    423 		$unknown_button->set_sensitive(FALSE);
    424 	};
    425 
    426 	my $fill_button_vbox; # forward-declaration so it can be used here already
    427 	my $next_word = sub {
    428 		$undo->set_sensitive(TRUE);
    429 		$cur_replacement++;
    430 		$fill_text_buffer->();
    431 		if ($cur_replacement > $#replacements) {
    432 			$show_accept->();
    433 			return;
    434 		}
    435 		$fill_button_vbox->();
    436 	};
    437 
    438 	# fill $button_vbox with the word options for the current word
    439 	$fill_button_vbox = sub {
    440 		$button_vbox->foreach(sub {my $child = shift; $child->destroy();});
    441 		my $word = $replacements[$cur_replacement]->[1];
    442 		$wordlabel->set_text("Word \"$word\" has multiple replacement options:");
    443 		my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1];
    444 		if (exists $config->{"targetdiacritics"}) {
    445 			# This nasty bit of code finds the number of diacritics in every
    446 			# choice and sorts the choice in descending order based on that
    447 			my %choice_nums;
    448 			foreach my $choice (@choices) {
    449 				$choice_nums{$choice} = 0;
    450 				foreach my $diacritic (@{$config->{"targetdiacritics"}}) {
    451 					my @matches = NFD($choice) =~ /$diacritic/;
    452 					$choice_nums{$choice} += scalar @matches if @matches;
    453 				}
    454 			}
    455 			@choices = sort {$choice_nums{$b} <=> $choice_nums{$a}} @choices;
    456 		}
    457 		my $cur_num = 1;
    458 		foreach my $word_choice (@choices) {
    459 			# the mnemonics don't make sense when the number has more than one digit
    460 			my $choice_label = $cur_num <= 9 ? "_$cur_num: $word_choice" : $word_choice;
    461 			my $button = Gtk3::Button->new($choice_label);
    462 			$button->signal_connect(
    463 				clicked => sub {
    464 					$replacements[$cur_replacement]->[1] = $word_choice;
    465 					$next_word->();
    466 				}, $window);
    467 			$button_vbox->pack_start($button, FALSE, FALSE, 0);
    468 			$button->show;
    469 			$cur_num++;
    470 		}
    471 	};
    472 
    473 	$undo->signal_connect(
    474 		clicked => sub {
    475 			if ($cur_replacement > 0) {
    476 				$cur_replacement--;
    477 				if ($cur_replacement == 0) {
    478 					$undo->set_sensitive(FALSE);
    479 				}
    480 				$replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2];
    481 				$fill_button_vbox->();
    482 				$fill_text_buffer->();
    483 				$accept->hide;
    484 				$skip_button->set_sensitive(TRUE);
    485 				$unknown_button->set_sensitive(TRUE);
    486 				my $word = $replacements[$cur_replacement]->[1];
    487 				$wordlabel->set_text("Word \"$word\" has multiple replacement options:");
    488 			}
    489 		}, $window);
    490 
    491 	$accept->signal_connect(
    492 		clicked => sub {
    493 			# write the changes to the original $substrings
    494 			foreach (@replacements) {
    495 				$substrings->[$_->[0]]->[1] = $_->[1];
    496 			}
    497 			$window->destroy;
    498 		}, $window);
    499 
    500 	$skip_button->signal_connect(clicked => $next_word, $window);
    501 
    502 	$unknown_button->signal_connect(
    503 		clicked => sub {
    504 			$open_unknown = 1;
    505 			$window->destroy;
    506 		}, $window);
    507 
    508 	$stop_button->signal_connect(
    509 		clicked => sub {
    510 			$stop = 1;
    511 			$window->destroy;
    512 		}, $window);
    513 
    514 	$fill_button_vbox->();
    515 	$fill_text_buffer->();
    516 
    517 	$window->add($vbox);
    518 	$window->show_all;
    519 	$accept->hide;
    520 	Gtk3->main;
    521 	die "Processing stopped at line $cur_lineno\n" if $stop;
    522 	if ($open_unknown) {
    523 		my $ret = call_unknown_word_window(
    524 			$substrings, $replacements[$cur_replacement]->[0],
    525 			$config, $args, $cur_lineno);
    526 		# the word choice window still needs to be called again
    527 		# when 0 is returned
    528 		return 3 if $ret == 0;
    529 		return $ret;
    530 	}
    531 	return 0;
    532 }
    533 
    534 my $ID = 0;
    535 my $STRING = 1;
    536 
    537 # Parse the configuration file into data type (currently only ID and STRING)
    538 sub parse_config {
    539 	my $f = shift;
    540 	my $fh;
    541 	if (!open($fh, "<", $f)) {
    542 		warn "Can't open config file \"$f\"!\n";
    543 		return;
    544 	}
    545 	my @commands;
    546 	my $state = 0;
    547 	my $IN_ID = 1;
    548 	my $IN_STR = 2;
    549 	my $cur_val = "";
    550 	while (my $line = <$fh>) {
    551 		chomp($line);
    552 		$state = 0;
    553 		push(@commands, []);
    554 		foreach my $char (split(//, $line)) {
    555 			if ($char eq "#" && !($state & $IN_STR)) {
    556 				last;
    557 			} elsif ($char eq '"') {
    558 				if ($state & $IN_STR) {
    559 					push(@{$commands[-1]}, {type => $STRING, value => $cur_val});
    560 					$cur_val = "";
    561 					$state &= ~$IN_STR;
    562 				} else {
    563 					$cur_val = "";
    564 					$state |= $IN_STR;
    565 				}
    566 			} elsif ($char eq " ") {
    567 				if ($state & $IN_ID) {
    568 					push(@{$commands[-1]}, {type => $ID, value => $cur_val});
    569 					$state &= ~$IN_ID;
    570 					$cur_val = "";
    571 				} elsif ($state) {
    572 					$cur_val .= $char;
    573 				}
    574 			} else {
    575 				if (!$state) {
    576 					$state |= $IN_ID;
    577 				}
    578 				$cur_val .= $char;
    579 			}
    580 		}
    581 		if ($state & $IN_STR) {
    582 			warn "ERROR: Unterminated string in config:\n$line";
    583 			return;
    584 		} elsif ($cur_val) {
    585 			push(@{$commands[-1]}, {type => $ID, value => $cur_val});
    586 			$cur_val = "";
    587 		}
    588 		if ($#{$commands[-1]} == -1) {
    589 			pop(@commands);
    590 		}
    591 	}
    592 	close($fh);
    593 
    594 	return \@commands;
    595 }
    596 
    597 # if the path is relative, find its absolute location based
    598 # on the location of the config file
    599 sub open_file_rel_abs {
    600 	my ($filename, $config_file, $mode) = @_;
    601 	$mode //= "<";
    602 	if (!file_name_is_absolute $filename) {
    603 		my $config_dir = dirname $config_file;
    604 		$filename = rel2abs($filename, $config_dir);
    605 	}
    606 	my $fh;
    607 	if (!open $fh, $mode, $filename) {
    608 		warn "Can't open file \"$filename\"!\n";
    609 		return;
    610 	}
    611 	return $fh;
    612 }
    613 
    614 # Load a file of replacement words into a hash table
    615 sub load_table {
    616 	my ($filename, $args, $config, $revert) = @_;
    617 	my $fh = open_file_rel_abs $filename, $args->{"config"};
    618 	return if !$fh;
    619 	my %table;
    620 	while (my $line = <$fh>) {
    621 		chomp $line;
    622 		next if (!$line);
    623 		my @words = split(/\Q$config->{tablesep}\E/, $line);
    624 		if (@words != 2) {
    625 			warn "ERROR: Malformed line in file \"$filename\":\n$line\n";
    626 			close $fh;
    627 			return;
    628 		}
    629 		my $word;
    630 		my $replacement;
    631 		if ($revert) {
    632 			$word = NFD $words[1];
    633 			$replacement = $words[0];
    634 		} else {
    635 			$word = NFD $words[0];
    636 			$replacement = $words[1];
    637 		}
    638 		my @word_choices = split /\Q$config->{choicesep}\E/, $word;
    639 		foreach my $word_choice (@word_choices) {
    640 			if (exists $table{$word_choice}) {
    641 				if ($args->{"checkduplicates"}) {
    642 					warn "WARNING: Duplicate word in file \"$filename\": " .
    643 						"\"$word_choice\", with replacement \"$replacement\", " .
    644 						"already exists with replacement \"$table{$word_choice}\".\n";
    645 				}
    646 				$table{$word_choice} = get_unique_words(
    647 					$table{$word_choice} .
    648 					$config->{choicesep} .
    649 					$replacement, $config);
    650 			} else {
    651 				$table{$word_choice} = $replacement;
    652 			}
    653 		}
    654 	}
    655 	close $fh;
    656 	return \%table;
    657 }
    658 
    659 # Load table for words to ignore - only the keys matter, since there is no replacement
    660 sub load_ignore_table {
    661 	my ($filename, $args) = @_;
    662 	my $fh = open_file_rel_abs $filename, $args->{"config"};
    663 	return if !$fh;
    664 	my %table;
    665 	while (my $line = <$fh>) {
    666 		chomp $line;
    667 		$table{NFD($line)} = "" if $line;
    668 	}
    669 	close $fh;
    670 	return \%table;
    671 }
    672 
    673 # Generate all forms of a word by combining it with endings
    674 # Returns:
    675 # 0 - an error occurred
    676 # 1 - everything's fine
    677 sub expand_table {
    678 	my ($table, $forms, $noroot, $config) = @_;
    679 	my %new_table;
    680 	foreach my $word (keys %$table) {
    681 		foreach my $ending (keys %$forms) {
    682 			# Some words and/or endings have multiple options, separated by $config->{choicesep}
    683 			# These must be temporarily separated in order to properly generate the forms
    684 			my @word_options;
    685 			my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word});
    686 			my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending});
    687 			foreach my $stem_option (@stem_options) {
    688 				foreach my $ending_option (@ending_options) {
    689 					push(@word_options, $stem_option . $ending_option);
    690 				}
    691 			}
    692 			$new_table{$word . $ending} = join($config->{choicesep}, @word_options);
    693 		}
    694 		$new_table{$word} = $table->{$word} if !$noroot;
    695 	}
    696 	return \%new_table;
    697 }
    698 
    699 # Check if the number and types of arguments given to a config command are right
    700 # Returns:
    701 # undef - the arguments don't match
    702 # 1 - the arguments match
    703 sub check_args {
    704 	my ($args, $cmd) = @_;
    705 	my $cmd_name = $cmd->[0]->{"value"};
    706 	if ($#$cmd - 1 < $#$args) {
    707 		my $err = "ERROR: not enough arguments for command \"$cmd_name\":";
    708 		foreach my $arg (@{$cmd}[1..$#$cmd]) {
    709 			$err .= " " . $arg->{"value"}
    710 		}
    711 		warn "$err\n";
    712 		return;
    713 	}
    714 	my $arg_num = 0;
    715 	while ($arg_num <= $#$args) {
    716 		if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) {
    717 			my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n";
    718 			$err .= "Expected:";
    719 			foreach my $arg_type (@$args) {
    720 				$err .= " ID" if ($arg_type == $ID);
    721 				$err .= " STRING" if ($arg_type == $STRING);
    722 			}
    723 			$err .= "\nReceived:";
    724 			foreach my $arg (@{$cmd}[1..$#$cmd]) {
    725 				$err .= " ID" if ($arg->{"type"} == $ID);
    726 				$err .= " STRING" if ($arg->{"type"} == $STRING);
    727 			}
    728 			warn "$err\n";
    729 			return;
    730 		}
    731 		$arg_num++;
    732 	}
    733 	return 1;
    734 }
    735 
    736 # Interpret the config file - load and expand tables, etc.
    737 # $config_list - the list returned by parse_config
    738 sub interpret_config {
    739 	my ($config_list, $args) = @_;
    740 	my %tables;
    741 	my %config;
    742 	# table_paths stores a list of all table and replacement ids that are
    743 	# impacted by the path so the replacement can be added on the fly when
    744 	# a new replacement is added from the GUI
    745 	# the "replacement id" is just the number of the replacement group,
    746 	# starting at 0 with the first group in the config
    747 	$config{"table_paths"} = {};
    748 	# reverted_tables stores a hash of the paths of all tables that are
    749 	# reverted so that replacements added from the GUI are added in the
    750 	# right order
    751 	$config{"reverted_tables"} = {};
    752 	# these are the paths of the tables that are displayed in the GUI
    753 	$config{"display_tables"} = {};
    754 	# a mapping between the table ids and tables for all tables used as
    755 	# ending tables in expand statements - so expansions can be done
    756 	# on the fly when adding a replacement word from the GUI
    757 	$config{"ending_tables"} = {};
    758 	# ignore is the path to the ignore file, ignore_words the actual table
    759 	$config{"ignore"} = "";
    760 	$config{"ignore_words"} = {};
    761 	$config{"split"} = "\\s+";
    762 	$config{"beforeword"} = "\\s";
    763 	$config{"afterword"} = "\\s";
    764 	$config{"tablesep"} = "\t";
    765 	$config{"choicesep"} = "\$";
    766 	# a list of "replacement configs", which specify the type and any
    767 	# other arguments (this is given to replace_match, etc.
    768 	$config{"replacements"} = [];
    769 	# these are temporary mappings used while loading the config
    770 	my %path_to_table;
    771 	my %table_id_to_path;
    772 	my %mandatory_args = (
    773 		"ignore" => [$STRING],
    774 		"table" => [$ID],
    775 		"expand" => [$ID, $ID],
    776 		"match" => [$STRING, $STRING],
    777 		"matchignore" => [$STRING],
    778 		"replace" => [$ID],
    779 		"split" => [$STRING],
    780 		"beforeword" => [$STRING],
    781 		"afterword" => [$STRING],
    782 		"tablesep" => [$STRING],
    783 		"choicesep" => [$STRING],
    784 		"comment" => [$STRING],
    785 		"group" => [],
    786 		"endgroup" => [],
    787 		"retrywithout" => [$STRING],
    788 		"targetdiacritics" => [$STRING]
    789 	);
    790 	my $in_group = 0;
    791 	foreach my $cmd (@$config_list) {
    792 		# All load statements must be before expand statements
    793 		# All expand, beforeword, and afterword statements must be before replace statements
    794 		my $cmd_name = $cmd->[0]->{"value"};
    795 		if ($cmd->[0]->{"type"} == $ID) {
    796 			if (!exists($mandatory_args{$cmd->[0]->{"value"}})) {
    797 				warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n";
    798 				return;
    799 			}
    800 			return if !check_args($mandatory_args{$cmd_name}, $cmd);
    801 			if ($cmd_name eq "table") {
    802 				my $table_path = $cmd->[2]->{"value"};
    803 				my %table_args;
    804 				foreach (3..$#$cmd) {
    805 					$table_args{$cmd->[$_]->{"value"}} = 1;
    806 				}
    807 				my $table;
    808 				# add to temporary path-to-table mapping so tables aren't
    809 				# loaded unnecessarily
    810 				if (exists $path_to_table{$table_path}) {
    811 					$table = $path_to_table{$table_path};
    812 				} else {
    813 					$table = load_table $table_path, $args, \%config, $table_args{"revert"};
    814 					return if !defined $table;
    815 					$path_to_table{$table_path} = $table;
    816 				}
    817 				if ($table_args{"revert"}) {
    818 					$config{"reverted_tables"}->{$table_path} = 1;
    819 				}
    820 				my $table_id = $cmd->[1]->{"value"};
    821 				$tables{$table_id} = $table;
    822 				$table_id_to_path{$table_id} = $table_path;
    823 				# this is a hash to avoid duplicates if the same file
    824 				# is loaded multiple times
    825 				$config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"};
    826 			} elsif ($cmd_name eq "expand") {
    827 				my $orig_table_id = $cmd->[1]->{"value"};
    828 				my $ending_table_id = $cmd->[2]->{"value"};
    829 				my $noroot = 0;
    830 				if ($#$cmd >= 3 && $cmd->[3]->{"value"} eq "noroot") {
    831 					$noroot = 1;
    832 				}
    833 				if (!exists $tables{$orig_table_id}) {
    834 					warn "expand: table \"$orig_table_id\" doesn't exist\n";
    835 					return;
    836 				} elsif (!exists $tables{$ending_table_id}) {
    837 					warn "expand: table \"$ending_table_id\" doesn't exist\n";
    838 					return;
    839 				}
    840 
    841 				$config{"ending_tables"}->{$ending_table_id} = $tables{$ending_table_id};
    842 				$config{"expands"}->{$orig_table_id} = [] if !exists $config{"expands"}->{$orig_table_id};
    843 				push @{$config{"expands"}->{$orig_table_id}}, [$ending_table_id, $noroot];
    844 
    845 				my $new_table = expand_table($tables{$orig_table_id}, $tables{$ending_table_id}, $noroot, \%config);
    846 				return if !$new_table;
    847 				$tables{$orig_table_id} = $new_table;
    848 			} elsif ($cmd_name eq "group") {
    849 				if ($in_group) {
    850 					warn "ERROR: New group started without ending last one in config\n";
    851 					return;
    852 				}
    853 				push @{$config{"replacements"}}, {
    854 					"type" => "group", "tables" => [],
    855 					"words" => {}, "options" => {}};
    856 				# add all options such as "endword" to the options hash
    857 				for (1..$#$cmd) {
    858 					$config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
    859 				}
    860 				$in_group = 1;
    861 			} elsif ($cmd_name eq "endgroup") {
    862 				if (!$in_group) {
    863 					warn "ERROR: endgroup command called while not in group\n";
    864 					return;
    865 				}
    866 				$in_group = 0;
    867 			} elsif ($cmd_name eq "match") {
    868 				if ($in_group) {
    869 					warn "ERROR: match command is invalid inside group\n";
    870 					return;
    871 				}
    872 				push @{$config{"replacements"}}, {
    873 					"type" => "match",
    874 					"options" => {},
    875 					"search" => NFD($cmd->[1]->{"value"}),
    876 					"replace" => $cmd->[2]->{"value"}};
    877 				for (3..$#$cmd) {
    878 					# add optional arguments as keys in options hash
    879 					$config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
    880 				}
    881 			} elsif ($cmd_name eq "matchignore") {
    882 				if ($in_group) {
    883 					warn "ERROR: matchignore command is invalid inside group\n";
    884 					return;
    885 				}
    886 				push @{$config{"replacements"}}, {
    887 					"type" => "match",
    888 					"options" => {},
    889 					"search" => NFD($cmd->[1]->{"value"})};
    890 				for (2..$#$cmd) {
    891 					$config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1;
    892 				}
    893 			} elsif ($cmd_name eq "replace") {
    894 				if (!$in_group) {
    895 					warn "ERROR: replace command called while not in group\n";
    896 					return;
    897 				}
    898 				my $table = $cmd->[1]->{"value"};
    899 				if (!exists($tables{$table})) {
    900 					warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
    901 					return;
    902 				}
    903 
    904 				# make a list of all replacements that are affected by this
    905 				# file so that they can be updated when a word is added
    906 				# through the gui
    907 				my $table_path = $table_id_to_path{$table};
    908 				my $replacement_id = $#{$config{"replacements"}};
    909 				$config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path};
    910 				push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table];
    911 
    912 				# store list of tables for --debug
    913 				push @{$config{"replacements"}->[-1]->{"tables"}}, $table;
    914 
    915 				# Note: we don't need to check if $table{"choicesep"} was defined
    916 				# here since we can't ever get this far without first having
    917 				# loaded a table anyways
    918 				my $trie_root = $config{"replacements"}->[-1]->{"words"};
    919 				my $override = $#$cmd >= 2 && $cmd->[2]->{"value"} eq "override";
    920 				add_to_trie($table, $trie_root, $tables{$table}, $args, \%config, $override);
    921 			} elsif ($cmd_name eq "retrywithout") {
    922 				if (!exists $config{"retrywithout"}) {
    923 					$config{"retrywithout"} = [];
    924 				}
    925 				# first value is the display name
    926 				my @values = map {$_->{"value"}} @{$cmd}[1..$#$cmd];
    927 				push @{$config{"retrywithout"}}, \@values;
    928 			} elsif ($cmd_name eq "targetdiacritics") {
    929 				if (!exists $config{$cmd_name}) {
    930 					$config{$cmd_name} = [];
    931 				}
    932 				foreach (1..$#$cmd) {
    933 					push @{$config{$cmd_name}}, $cmd->[$_]->{"value"};
    934 				}
    935 			} elsif ($cmd_name eq "split" || $cmd_name eq "beforeword" ||
    936 				$cmd_name eq "afterword" || $cmd_name eq "tablesep" ||
    937 				$cmd_name eq "choicesep" || $cmd_name eq "comment") {
    938 				$config{$cmd_name} = $cmd->[1]->{"value"};
    939 			} elsif ($cmd_name eq "ignore") {
    940 				$config{"ignore"} = $cmd->[1]->{"value"};
    941 				my $table = load_ignore_table $cmd->[1]->{"value"}, $args;
    942 				return if !defined $table;
    943 				$config{"ignore_words"} = $table;
    944 			} else {
    945 				warn "ERROR: unknown command \"" . $cmd_name . "\" in config.\n";
    946 				return;
    947 			}
    948 		} else {
    949 			my $err =  "ERROR: line does not start with command:\n";
    950 			foreach my $cmd_part (@$cmd) {
    951 				$err .= $cmd_part->{"value"};
    952 			}
    953 			warn "$err\n";
    954 			return;
    955 		}
    956 	}
    957 	if ($in_group) {
    958 		warn "ERROR: unclosed group in config\n";
    959 		return;
    960 	}
    961 	if (!$config{"ignore"}) {
    962 		warn "ERROR: no file of words to ignore specified.\n";
    963 		return;
    964 	}
    965 	if ($args->{"dumptables"}) {
    966 		foreach my $table_id (keys %tables) {
    967 			my $table_path = $table_id_to_path{$table_id};
    968 			if ($config{"display_tables"}->{$table_path}) {
    969 				for my $word (keys %{$tables{$table_id}}) {
    970 					print NFC($word) . "\n";
    971 				}
    972 			}
    973 		}
    974 	}
    975 	return \%config;
    976 }
    977 
    978 # load the config file
    979 # Returns:
    980 # the config hash or undef if an error occurred
    981 sub load_config {
    982 	my $args = shift;
    983 	my $config_list = parse_config($args->{"config"});
    984 	if (!$config_list) {
    985 		return;
    986 	}
    987 	return interpret_config $config_list, $args;
    988 }
    989 
    990 # Handle the action returned by `prompt_unknown_word`
    991 # $config - the current program config
    992 # $args - the command line arguments
    993 # Currently accepted values for $action:
    994 # ["ignore", "run", $word] - only ignore $word for the rest of this run
    995 # ["ignore", "permanent", $word] - write $word to the permanent ignore file
    996 # ["add", $word, $replace_word, $table_path] - add $word to the table 
    997 # 	corresponding to $table_path with $replace_word as its replacement. Note that
    998 # 	only tables directly corresponding to paths work here - tables that only
    999 # 	were created through "expand" in the config aren't ever shown separately
   1000 # 	in `prompt_unknown_word`
   1001 # ["reload"] - reload the configuration file
   1002 # Returns:
   1003 # 0 - nothing needs to be done
   1004 # 1 - the current line needs to be re-transliterated with the new config
   1005 # 2 - an error occurred while reloading the config
   1006 # 3 - stop asking for unknown words on this line
   1007 sub handle_unknown_word_action {
   1008 	my ($action, $config, $args) = @_;
   1009 	if ($action->[0] eq "ignore") {
   1010 		# yeah, this is a bit messy and inconsistent
   1011 		return 3 if $action->[1] eq "wholeline";
   1012 		$config->{"ignore_words"}->{$action->[2]} = "";
   1013 		if ($action->[1] eq "permanent") {
   1014 			my $fh = open_file_rel_abs $config->{"ignore"}, $args->{"config"}, ">>";
   1015 			return 1 if !$fh;
   1016 			print($fh $action->[2] . "\n");
   1017 			close($fh);
   1018 		} elsif ($action->[1] eq "run") {
   1019 			# Print to error file if ignore isn't permanent
   1020 			return 0 if ($args->{"errors"} eq "");
   1021 			my $fh;
   1022 			if (!open($fh, ">>", $args->{"errors"})) {
   1023 				warn "ERROR: Can't open error file \"$args->{errors}\".\n";
   1024 				return 0;
   1025 			}
   1026 			print($fh $action->[2] . "\n");
   1027 			close($fh);
   1028 		}
   1029 		return 0;
   1030 	} elsif ($action->[0] eq "add") {
   1031 		my $table_path = $action->[3];
   1032 		my $word = $action->[1];
   1033 		my $replace_word = $action->[2];
   1034 		# make sure to write the words in the correct order if the
   1035 		# tables were reverted while loading
   1036 		my $reverted = exists $config->{"reverted_tables"}->{$table_path};
   1037 		my $word_abs = $reverted ? $action->[2] : $action->[1];
   1038 		my $replace_word_abs = $reverted ? $action->[1] : $action->[2];
   1039 		my $fh = open_file_rel_abs $table_path, $args->{"config"}, ">>";
   1040 		return 1 if !$fh;
   1041 		print($fh $word_abs . $config->{tablesep} . $replace_word_abs . "\n");
   1042 		close($fh);
   1043 		# loop over all table ids that are impacted by this file
   1044 		foreach my $replacement (@{$config->{"table_paths"}->{$table_path}}) {
   1045 			my $replacement_id = $replacement->[0];
   1046 			my $table_id = $replacement->[1];
   1047 			my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
   1048 			my $final_table = {$word => $replace_word};
   1049 			# handle expansions
   1050 			foreach my $expand (@{$config->{"expands"}->{$table_id}}) {
   1051 				my $ending_table_id = $expand->[0];
   1052 				my $noroot = $expand->[1];
   1053 				my $endings_table = $config->{"ending_tables"}->{$ending_table_id};
   1054 				$final_table = expand_table $final_table, $endings_table, $noroot, $config;
   1055 			}
   1056 			add_to_trie($table_id, $trie, $final_table, $args, $config);
   1057 		}
   1058 		return 1;
   1059 	} elsif ($action->[0] eq "reload") {
   1060 		my $new_config = load_config $args;
   1061 		if ($new_config) {
   1062 			%$config = %$new_config;
   1063 			return 1;
   1064 		} else {
   1065 			return 2;
   1066 		}
   1067 	}
   1068 }
   1069 
   1070 # Split $substrings based on the "split" regex in $config.
   1071 # $substrings can already be split at this point; only the
   1072 # ones that haven't been transliterated yet are modified
   1073 sub split_words {
   1074 	my ($config, $substrings) = @_;
   1075 	my $split_re = qr/($config->{"split"})/;
   1076 	my @substrings_new;
   1077 	foreach my $cur_substr (@$substrings) {
   1078 		if ($cur_substr->[0] == 1) {
   1079 			push(@substrings_new, $cur_substr);
   1080 			next;
   1081 		}
   1082 
   1083 		my @words = split(/$split_re/, $cur_substr->[1]);
   1084 		for my $i (0..$#words) {
   1085 			# Word is not delimiter
   1086 			# Split produces an empty field at the beginning if the string
   1087 			# starts with the delimiter
   1088 			if ($i % 2 == 0) {
   1089 				push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
   1090 			} else {
   1091 				# Delimiters can count as already replaced
   1092 				push(@substrings_new, [1, $words[$i], $words[$i]]);
   1093 			}
   1094 		}
   1095 	}
   1096 	@$substrings = @substrings_new;
   1097 }
   1098 
   1099 # small helper function to add a untransliterated string to the last substring
   1100 # if that is not transliterated yet, or push it onto @$substrings otherwise
   1101 # -> used to keep all untransliterated text in one piece
   1102 # since this is also used for the "nofinal" attribute on "match", it takes
   1103 # an original and replaced string (since, when using "match" and "nofinal",
   1104 # the original string was replaced, but is still marked as unknown)
   1105 sub push_unknown {
   1106 	my ($substrings, $orig, $replaced) = @_;
   1107 	$replaced //= $orig;
   1108 	if (@$substrings && !$substrings->[-1]->[0]) {
   1109 		$substrings->[-1]->[1] .= $replaced;
   1110 		$substrings->[-1]->[2] .= $orig;
   1111 	} else {
   1112 		push(@$substrings, [0, $replaced, $orig]);
   1113 	}
   1114 }
   1115 
   1116 # Replace a word in $substrings based on $replace_config using regex
   1117 # $replace_config->{"search"} is the word to replace
   1118 # $replace_config->{"replace"} is the replacement word
   1119 #	if $replace_config->{"replace"} is undefined, just splits
   1120 #	$substrings at the the match and marks that the match has
   1121 #	been transliterated - currently used for "matchignore"
   1122 # $replace_config->{"beginword"}, $replace_config->{"afterword"} -
   1123 #	specifies if the match is only valid when $config->{"beforeword"}
   1124 #	or $config->{"afterword"} occur before or after it, respectively
   1125 sub replace_match {
   1126 	my ($config, $replace_config, $substrings, $debug_msg) = @_;
   1127 	my $beginword = exists $replace_config->{"options"}->{"beginword"};
   1128 	my $endword = exists $replace_config->{"options"}->{"endword"};
   1129 	my $fullword = $beginword && $endword;
   1130 	my $beforeword = $config->{"beforeword"};
   1131 	my $afterword = $config->{"afterword"};
   1132 	my $word = $replace_config->{"search"};
   1133 	my $replace_word = $replace_config->{"replace"};
   1134 	if ($fullword) {
   1135 		$word = qr/(\A|$beforeword)$word(\z|$afterword)/;
   1136 	} elsif ($beginword) {
   1137 		$word = qr/(\A|$beforeword)$word/;
   1138 	} elsif ($endword) {
   1139 		$word = qr/$word(\z|$afterword)/;
   1140 	} else {
   1141 		$word = qr/$word/;
   1142 	}
   1143 
   1144 	my @substrings_new;
   1145 	# only modify $substrings at all if the word was found
   1146 	my $found_word = 0;
   1147 	my $last_idx;
   1148 	# @substrings_new is only used if needed to improve efficiency
   1149 	foreach my $i (0..$#$substrings) {
   1150 		if ($substrings->[$i]->[0]) {
   1151 			# FIXME: is there a way to make it more efficient by keeping the old array?
   1152 			# This is a major bottleneck
   1153 			# Note: the above statement *may* be a bit exaggerated
   1154 			if ($found_word) {
   1155 				push(@substrings_new, $substrings->[$i]);
   1156 			}
   1157 			next;
   1158 		}
   1159 		$last_idx = 0;
   1160 		my $i0 = 0;
   1161 		my $i1 = 0;
   1162 		while ($substrings->[$i]->[1] =~ m/$word/g) {
   1163 			if (!$found_word) {
   1164 				print $debug_msg if $debug_msg;
   1165 				$found_word = 1;
   1166 				if ($i != 0) {
   1167 					push(@substrings_new, @{$substrings}[0..$i-1]);
   1168 				}
   1169 			}
   1170 			# This mess is needed to reliably match $beforeword and $afterword and put the captured
   1171 			# "splitting" characters back into the text. This would be much easier just using
   1172 			# a lookbehind and lookahead, but I couldn't find a way to also match beginning and
   1173 			# end of string that way.
   1174 			$i0 = $-[0];
   1175 			$i1 = $+[0];
   1176 			if ($fullword) {
   1177 				$i0 += length($1);
   1178 				$i1 -= length($2);
   1179 				# pos need to be decreased so that matches still work right next to each other
   1180 				pos($substrings->[$i]->[1]) -= length($2);
   1181 			} elsif ($beginword) {
   1182 				$i0 += length($1);
   1183 			} elsif ($endword) {
   1184 				$i1 -= length($1);
   1185 				pos($substrings->[$i]->[1]) -= length($1);
   1186 			}
   1187 			if ($last_idx != $i0) {
   1188 				my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx);
   1189 				push_unknown \@substrings_new, $unknown;
   1190 			}
   1191 			my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
   1192 			my $replace_str = $replace_word // $orig_str;
   1193 			if ($replace_config->{"options"}->{"nofinal"}) {
   1194 				warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
   1195 				push_unknown \@substrings_new, $orig_str, $replace_str;
   1196 			} else {
   1197 				warn "Replaced \"$orig_str\" with \"$replace_str\"\n" if $debug_msg;
   1198 				push(@substrings_new, [1, $replace_str, $orig_str]);
   1199 			}
   1200 			$last_idx = $i1;
   1201 		}
   1202 		if ($last_idx < length($substrings->[$i]->[1]) && $found_word) {
   1203 			my $unknown = substr($substrings->[$i]->[1], $last_idx);
   1204 			push_unknown \@substrings_new, $unknown;
   1205 		}
   1206 	}
   1207 	if ($found_word) {
   1208 		@$substrings = @substrings_new;
   1209 	}
   1210 }
   1211 
   1212 # Replace a group, i.e. replace all the words in a trie
   1213 # $replace_config->{"words"} - the root node of the trie
   1214 # $replace_config->{"beginword"}, $replace_config->{"endword"} -
   1215 #	same as in `replace_match`
   1216 sub replace_group {
   1217 	my ($config, $replace_config, $substrings, $debug_msg) = @_;
   1218 	my @substrings_new;
   1219 	my $word_found = 0;
   1220 	# Recurse backwords towards the root node of the trie to find the first
   1221 	# node with a key "final" which satisfies the ending condition (if "endword" is set)
   1222 	# Returns the id *after* the last match and the node that was found
   1223 	# with the key "final" (or undef, if nothing matched)
   1224 	my $find_final = sub {
   1225 		my ($i, $tmp_cur_node, $s) = @_;
   1226 		do {
   1227 			my $after_ch = substr($s->[1], $i, 1);
   1228 			if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) ||
   1229 				$after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
   1230 				return ($i, $tmp_cur_node);
   1231 			}
   1232 			$i--;
   1233 		} while ($tmp_cur_node = $tmp_cur_node->{"parent"});
   1234 		# none of the points were appropriate for breaking the word, so
   1235 		# $tmp_cur_node now points to the nonexistent parent node of the
   1236 		# root node
   1237 		return ($i, undef);
   1238 	};
   1239 	foreach my $s (@$substrings) {
   1240 		if ($s->[0]) {
   1241 			push(@substrings_new, $s);
   1242 			next;
   1243 		}
   1244 		my $cur_node = $replace_config->{"words"};
   1245 		my $start_i = 0;
   1246 		my $i = 0;
   1247 		# This deliberately goes off the end of the string! $cur_node is always "one behind" $i
   1248 		# since the node is only advanced in the iteration *after* $i has increased, meaning that
   1249 		# $i has to already be after the end of the string for the first if statement to definitely
   1250 		# fail, causing the elsif statement to handle that case
   1251 		while ($i <= length($s->[1])) {
   1252 			# This works even when $i is one index after the end of the string - it just returns "" then
   1253 			my $ch = substr($s->[1], $i, 1);
   1254 			if (exists $cur_node->{$ch}) {
   1255 				if ($cur_node == $replace_config->{"words"}) {
   1256 					my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
   1257 					if (exists($replace_config->{"options"}->{"beginword"}) &&
   1258 						$before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
   1259 						push_unknown \@substrings_new, $ch;
   1260 						$i++;
   1261 						next;
   1262 					}
   1263 					$start_i = $i;
   1264 				}
   1265 				$cur_node = $cur_node->{$ch};
   1266 			} elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) {
   1267 				my $tmp_cur_node = $cur_node;
   1268 				($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s);
   1269 				if (!defined($tmp_cur_node)) {
   1270 					push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
   1271 					$i += 2;
   1272 				} else {
   1273 					my $orig = substr($s->[1], $start_i, $i-$start_i);
   1274 					my $final = $tmp_cur_node->{"final"};
   1275 					if ($debug_msg) {
   1276 						if (!$word_found) {
   1277 							warn $debug_msg;
   1278 							$word_found = 1;
   1279 						}
   1280 						warn "Replaced \"$orig\" with \"$final\"\n";
   1281 					}
   1282 					push(@substrings_new, [1, $final, $orig]);
   1283 				}
   1284 				$cur_node = $replace_config->{"words"};
   1285 				next;
   1286 			} else {
   1287 				push_unknown \@substrings_new, $ch;
   1288 			}
   1289 			$i++;
   1290 		}
   1291 	}
   1292 	@$substrings = @substrings_new;
   1293 }
   1294 
   1295 # Perform all replacements on $word, first removing all
   1296 # characters specified in $chars
   1297 sub replace_strip_chars {
   1298 	my ($config, $args, $chars, $word) = @_;
   1299 	foreach my $char (@$chars) {
   1300 		$word =~ s/\Q$char\E//g;
   1301 	}
   1302 	return replace_line($config, $args, $word);
   1303 }
   1304 
   1305 # Perform all replacements on $line based on $config
   1306 # $substrings: array of arrays - each one has three elements:
   1307 # first 0 or 1, indicating if the substring following it has already
   1308 # been replaced or not (1 means it has been replaced), then the
   1309 # transliterated string, and lastly the original string.
   1310 # If the first element is 0, the second two elements are obviously same
   1311 sub replace_line {
   1312 	my ($config, $args, $line) = @_;
   1313 	my $substrings = [[0, $line, $line]];
   1314 	foreach my $replacement (@{$config->{"replacements"}}) {
   1315 		if ($replacement->{"type"} eq "match") {
   1316 			my $debug_msg;
   1317 			if ($args->{"debug"}) {
   1318 				my $options = join " ", keys(%{$replacement->{"options"}});
   1319 				$debug_msg =  "Match ($options): \"$replacement->{search}\"";
   1320 				if ($replacement->{"replace"}) {
   1321 					$debug_msg .= " \"$replacement->{replace}\"\n";
   1322 				} else {
   1323 					$debug_msg .= " (ignore)\n";
   1324 				}
   1325 			}
   1326 			replace_match($config, $replacement, $substrings, $debug_msg);
   1327 		} elsif ($replacement->{"type"} eq "group") {
   1328 			my $debug_msg;
   1329 			if ($args->{"debug"}) {
   1330 				my $options = join " ", keys(%{$replacement->{"options"}});
   1331 				my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"';
   1332 				$debug_msg = "Group ($options): $tables\n";
   1333 			}
   1334 			replace_group($config, $replacement, $substrings, $debug_msg);
   1335 		}
   1336 	}
   1337 	# splits all words at the end so that the splitting characters
   1338 	# aren't taken as unknown words and the unknown words are (hopefully)
   1339 	# in better chunks for prompting the user about them
   1340 	split_words($config, $substrings);
   1341 
   1342 	return $substrings;
   1343 }
   1344 
   1345 # Call the unknown word window with the given substrings and index
   1346 # See `get_unknown_words` for explanation of other parameters
   1347 # (should be obvious)
   1348 # Returns:
   1349 # 3, if the rest of the line should be skipped
   1350 # 1, if the line needs to be re-transliterated
   1351 # 0, if nothing needs to be done
   1352 sub call_unknown_word_window {
   1353 	my ($substrings, $i, $config, $args, $cur_lineno) = @_;
   1354 	my $word = $substrings->[$i];
   1355 	my $contextl = "";
   1356 	my $contextl_orig = "";
   1357 	foreach my $j (0..$i-1) {
   1358 		$contextl .= $substrings->[$j]->[1];
   1359 		$contextl_orig .= $substrings->[$j]->[2];
   1360 	}
   1361 	my $contextr = "";
   1362 	my $contextr_orig = "";
   1363 	foreach my $j ($i+1..$#$substrings) {
   1364 		$contextr .= $substrings->[$j]->[1];
   1365 		$contextr_orig .= $substrings->[$j]->[2];
   1366 	}
   1367 	my $action = prompt_unknown_word($contextl, $contextl_orig,
   1368 		$word->[1], $word->[2], $contextr, $contextr_orig,
   1369 		$config, "$cur_lineno", $args);
   1370 	# if $ret == 3, rest of line should be skipped
   1371 	# if $ret == 2, config could not be loaded
   1372 	# if $ret == 1, line must be redone with new config
   1373 	my $ret = handle_unknown_word_action($action, $config, $args);
   1374 	# keep retrying until the user chooses an action which
   1375 	# didn't throw an error
   1376 	while ($ret == 2) {
   1377 		$action = prompt_unknown_word($contextl, $contextl_orig,
   1378 			$word->[1], $word->[2], $contextr, $contextr_orig,
   1379 			$config, "$cur_lineno", $args, 1);
   1380 		$ret = handle_unknown_word_action($action, $config, $args);
   1381 	}
   1382 	return $ret;
   1383 }
   1384 
   1385 # NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST!
   1386 # If an ignore word is added which is attached to a word that should have a replacement
   1387 # added and just that word is selected to ignore, you never get a chance to add a
   1388 # replacement for the other word that it is attached to
   1389 
   1390 # NOTE: This is very ugly code. The GUI code is the worst, but this whole part
   1391 # of the program is nasty. This is partially due to the fact that features kept
   1392 # being added when their use was discovered. This problem might be fixed in the
   1393 # future when I have time to rewrite all of this.
   1394 
   1395 # Handle unknown words
   1396 # $substrings - the current substrings with unknown words
   1397 # $config - the program config
   1398 # $args - the command line args
   1399 # $cur_lineno - display string to show the user the current line number
   1400 # Returns:
   1401 # 1 - the line needs to be re-transliterated
   1402 # 0 - all done
   1403 sub get_unknown_words {
   1404 	my ($substrings, $config, $args, $cur_lineno) = @_;
   1405 	foreach my $i (0 .. $#$substrings) {
   1406 		my $word = $substrings->[$i];
   1407 		if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
   1408 			my $ret = call_unknown_word_window $substrings, $i, $config, $args, $cur_lineno;
   1409 			# 3 means we ignore the line
   1410 			if ($ret == 3) {
   1411 				foreach my $s (@$substrings) {
   1412 					# revert all changes done on the line
   1413 					$s->[1] = $s->[2];
   1414 				}
   1415 				return 0;
   1416 			}
   1417 			# 1 means the line needs to be re-transliterated
   1418 			return 1 if $ret == 1;
   1419 		}
   1420 		$i++;
   1421 	}
   1422 	return 0;
   1423 }
   1424 
   1425 # Main replacement function
   1426 # Opens the input file ($args->{"input"}) and writes the transliterated text
   1427 # to the file handle $outputfh, prompting the user for unknown words or
   1428 # word choices (if those aren't disabled on the command line)
   1429 sub replace {
   1430 	my ($config, $args, $total_lines, $inputfh, $outputfh) = @_;
   1431 	while (my $line = <$inputfh>) {
   1432 		next if $. < $args->{"start"};
   1433 		my $comment;
   1434 		if (exists $config->{"comment"}) {
   1435 			$line =~ s/\Q$config->{comment}\E(.*)\z//s;
   1436 			$comment = $1;
   1437 		}
   1438 		my $nfd_line = NFD($line);
   1439 		my $substrings = replace_line($config, $args, $nfd_line);
   1440 
   1441 		if (!$args->{"nounknowns"}) {
   1442 			# re-transliterate the string if the config was reloaded
   1443 			while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
   1444 				$substrings = replace_line($config, $args, $nfd_line);
   1445 			}
   1446 		} elsif ($args->{"debugspecial"}) {
   1447 			foreach my $s (@$substrings) {
   1448 				if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
   1449 					warn "Unknown word: \"$s->[1]\"\n";
   1450 				}
   1451 			}
   1452 		}
   1453 		if (!$args->{"nochoices"}) {
   1454 			# this only loops more than once if the user presses the button
   1455 			# "Open in unknown word window"
   1456 			while (my $ret = prompt_choose_word($substrings, $config, $args, "$./$total_lines")) {
   1457 				if ($ret == 1) {
   1458 					$substrings = replace_line($config, $args, $nfd_line);
   1459 				}
   1460 			}
   1461 		} elsif ($args->{"debugspecial"}) {
   1462 			foreach my $s (@$substrings) {
   1463 				if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
   1464 					my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
   1465 					warn "Word \"$s->[1]\" with $num_choices word choices.\n";
   1466 				}
   1467 			}
   1468 		}
   1469 
   1470 		foreach (@$substrings) {
   1471 			print $outputfh $_->[1];
   1472 		}
   1473 		print $outputfh $comment if $comment;
   1474 	}
   1475 }
   1476 
   1477 my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
   1478 GetOptions(
   1479 	\%args, "debug", "debugspecial",
   1480 	"nochoices", "nounknowns",
   1481 	"force", "start=i",
   1482 	"output=s", "config=s",
   1483 	"errors=s", "help",
   1484 	"checkduplicates", "dumptables") or pod2usage(1);
   1485 
   1486 pod2usage(-exitval => 0, -verbose => 2) if $args{"help"};
   1487 pod2usage(-exitval => 1, -verbose => 1) if @ARGV > 1;
   1488 
   1489 if (!-f $args{"config"}) {
   1490 	die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n";
   1491 }
   1492 my $config = load_config \%args;
   1493 if (!$config) {
   1494 	die "ERROR: Invalid config\n";
   1495 }
   1496 exit 0 if ($args{"checkduplicates"} || $args{"dumptables"});
   1497 
   1498 my $inputfh;
   1499 my $total_lines = "UNKNOWN";
   1500 if (@ARGV < 1) {
   1501 	warn "WARNING: no input file supplied; taking input from STDIN\n";
   1502 	$inputfh = \*STDIN;
   1503 } else {
   1504 	open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
   1505 	# Is there *really* no more efficient way to get the total number of lines?
   1506 	$total_lines = 0;
   1507 	while (<$inputfh>) {$total_lines++};
   1508 	close $inputfh;
   1509 	open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n";
   1510 }
   1511 
   1512 if (-f $args{"errors"} && !$args{"force"}) {
   1513 	my $choice = "";
   1514 	while ($choice !~ /^[yn]$/) {
   1515 		print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? ";
   1516 		$choice = <STDIN>;
   1517 		chomp $choice;
   1518 	}
   1519 	die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y";
   1520 }
   1521 
   1522 my $outputfh;
   1523 if ($args{"output"} eq "") {
   1524 	warn "WARNING: no output file supplied; printing to STDOUT\n";
   1525 	$outputfh = \*STDOUT;
   1526 } elsif (-f $args{"output"} && !$args{"force"}) {
   1527 	my $choice = "";
   1528 	while ($choice !~ /^[aoe]$/) {
   1529 		print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? ";
   1530 		$choice = <STDIN>;
   1531 		chomp $choice;
   1532 	}
   1533 	if ($choice eq "a") {
   1534 		open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
   1535 	} elsif ($choice eq "e") {
   1536 		die "ERROR: \"$args{output}\" already exists.\n";
   1537 	} else {
   1538 		open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
   1539 	}
   1540 } else {
   1541 	open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
   1542 }
   1543 
   1544 replace($config, \%args, $total_lines, $inputfh, $outputfh);
   1545 close $inputfh;
   1546 close $outputfh;
   1547 
   1548 __END__
   1549 
   1550 =head1 NAME
   1551 
   1552 transliterate.pl - Transliterate text files
   1553 
   1554 =head1 SYNOPSIS
   1555 
   1556 transliterate.pl [options][input file]
   1557 
   1558 Start the transliteration engine with the given file as input.
   1559 The input file defaults to STDIN if no filename is given.
   1560 
   1561 =head1 OPTIONS
   1562 
   1563 =over 8
   1564 
   1565 =item B<--output> <filename>
   1566 
   1567 Sets the output file to print to.
   1568 
   1569 If the file exists already and B<--force> is not set, the user is asked
   1570 if the file should be overwritten or appended to.
   1571 
   1572 B<Default:> C<STDOUT> (print to terminal)
   1573 
   1574 =item B<--config> <filename>
   1575 
   1576 Sets the configuration file to use.
   1577 
   1578 B<Default:> C<config>
   1579 
   1580 =item B<--checkduplicates>
   1581 
   1582 Prints all duplicate words within single table files and across tables
   1583 that are replaced within the same group, then exits the program.
   1584 
   1585 Note that this simply prints B<all> duplicates, even ones that are
   1586 legitimate. When duplicates are found during normal operation of
   1587 the program, they are simply combined in exactly the same way as the
   1588 regular word choices.
   1589 
   1590 Also note that the words are still added as possible choices, which
   1591 may be slightly confusing. If, for instance, a word "word" is stored
   1592 in the tables "tablea", "tableb", and "tablec" with the replacements
   1593 "a", "b", and "c", the first duplicate message will say that the
   1594 first occurrence was in table "tablea" with the replacement "a", and
   1595 the second duplicate message will say that the first occurrence was
   1596 in table "tablea" with the replacement "a$b" (assuming $ is the
   1597 value set as B<choicesep> in the config). This is just something to
   1598 be aware of.
   1599 
   1600 On that note, before duplicates are checked between tables in the
   1601 same replacement group, duplicates inside the same file are already
   1602 replaced, so that might be a bit confusing as well.
   1603 
   1604 =item B<--dumptables>
   1605 
   1606 Prints the words of all tables that don't have B<nodisplay> set.
   1607 
   1608 This is mainly meant to be used for generating word lists in order to
   1609 use them in a spell checker. Note that the words printed here are in
   1610 UTF-8 NFC (Unicode Canonical Composition Form), so it may not be ideal
   1611 when the spellchecked text is not in the same form.
   1612 
   1613 =item B<--nochoices>
   1614 
   1615 Disables prompting for the right word when multiple replacement words exist.
   1616 
   1617 This can be used to "weed out" all the unknown words before
   1618 commencing the laborious task of choosing the right word every time
   1619 multiple options exist.
   1620 
   1621 =item B<--nounknowns>
   1622 
   1623 Disables prompting for the right word when a word is not found in the database.
   1624 
   1625 This can be used together with B<--nochoices> to perform a quick test of how
   1626 well the actual engine is working without having to click through all the
   1627 prompts.
   1628 
   1629 =item B<--debug>
   1630 
   1631 Prints information helpful for debugging problems with the B<match> and B<group>
   1632 statements.
   1633 
   1634 For each B<match> or B<group> statement which replaces anything, the original
   1635 statement is printed (the format is a bit different than in the config) and
   1636 each actual word that's replaced is printed.
   1637 
   1638 =item B<--debugspecial>
   1639 
   1640 This option is only useful for automatic testing of the transliteration engine.
   1641 
   1642 If B<--nochoices> is enabled, each word in the input with multiple choices will
   1643 be output, along with the number of choices (can be used to test the proper
   1644 functioning of B<choicesep> in the config file).
   1645 
   1646 If B<--nounknowns> is enabled, each unknown word in the input is printed
   1647 (can be used to test that the B<ignore> options are working correctly).
   1648 
   1649 =item B<--force>
   1650 
   1651 Always overwrites the output and error file without asking.
   1652 
   1653 =item B<--start> <line number>
   1654 
   1655 Starts at the given line number instead of the beginning of the file.
   1656 
   1657 Note: when "Stop processing" is pressed, the current line number is
   1658 printed out. This is the current line that was being processed, so it
   1659 has not been printed to the output file yet and thus the program must
   1660 be resumed at that line, not the one afterwards.
   1661 
   1662 =item B<--errors> <filename>
   1663 
   1664 Specifies a file to write errors in. Note that this does not refer to
   1665 actual errors, but to any words that were temporarily ignored
   1666 (i.e. words for which "Ignore: This run" was clicked).
   1667 
   1668 If no file is specified, nothing is written. If a file is specified
   1669 that already exists and B<--force> is not set, the user is prompted
   1670 for action.
   1671 
   1672 =item B<--help>
   1673 
   1674 Displays the full documentation.
   1675 
   1676 =back
   1677 
   1678 =head1 DESCRIPTION
   1679 
   1680 B<transliterate.pl> will read the given input file and transliterate it
   1681 based on the given configuration file, prompting the user for action if
   1682 a word has multiple replacement options or is not found in the database.
   1683 
   1684 See L</"CONFIGURATION"> for details on what is possible.
   1685 
   1686 Note that this is B<not> some sort of advanced transliteration engine
   1687 which understands the grammar of the language and tries to guess words
   1688 based on that. This is only a glorified find-and-replace program
   1689 with some extra features to make it useful for transliterating text
   1690 using large wordlists.
   1691 
   1692 WARNING: All input data is assumed to be UTF-8!
   1693 
   1694 =head1 WORD CHOICE WINDOW
   1695 
   1696 The word choice window is opened any time one word has multiple replacement
   1697 options and prompts the user to choose one.
   1698 
   1699 For each word with multiple options, the user must choose the right option
   1700 and then press "Accept changes" to finalize the transliteration of the
   1701 current line. The button to accept changes is selected by default, so it
   1702 is possible to just press enter instead of manually clicking it. Before the
   1703 line is finalized, the user may press "Undo" to undo any changes on the
   1704 current line.
   1705 
   1706 "Skip word" just leaves it as is. This shouldn't be needed in most cases
   1707 since B<choicesep> should always be set to a character that doesn't occur
   1708 normally in the text anyways.
   1709 
   1710 "Open in unknown word window" will open the
   1711 L<unknown word window|/"UNKNOWN WORD WINDOW"> with the current word
   1712 selected. This is meant as a helper if you notice that another word choice
   1713 needs to be added.
   1714 
   1715 Warning: This is very inconsistent and buggy! Since the unknown word window
   1716 is just opened directly, it isn't modified to make more sense for this
   1717 situation. Whenever "Add replacement" is pressed, the whole line is
   1718 re-transliterated as usual, but the word choice window is opened again
   1719 right afterwards. If you just want to go back to the word choice window,
   1720 press the ignore button for "whole line" since that shouldn't break
   1721 anything. There are weird inconsistencies, though - for instance, if you
   1722 delete all words in the tables, then press "Reload config", the line will
   1723 be re-transliterated and none of the words will actually be found, but it
   1724 will still go on because control passes back to the word choice window no
   1725 matter what. Also, none of the word choices that were already done on this
   1726 line are saved since the line is restarted from the beginning. As I said,
   1727 it's only there as a helper and is very buggy/inconsistent. Maybe I'll make
   1728 everything work better in a future release.
   1729 
   1730 "Stop processing" will exit the program and print the line number that was
   1731 currently being processed.
   1732 
   1733 =head1 UNKNOWN WORD WINDOW
   1734 
   1735 The unknown word window is opened any time a word could not be replaced.
   1736 
   1737 Both the context from the original script and the context from the
   1738 transliterated version (so far) is shown. If a part of the text is
   1739 selected in one of the text boxes and "Use selection as word" is
   1740 pressed for the appropriate box, the selected text is used for the
   1741 action that is taken subsequently. "Reset text" resets the text in
   1742 the text box to its original state (except for the highlight because
   1743 I'm too lazy to do that).
   1744 
   1745 The possible actions are:
   1746 
   1747 =over 8
   1748 
   1749 =item Ignore
   1750 
   1751 "This run" only ignores the word until the program exits, while
   1752 "Permanently" saves the word in the ignore file specified in the
   1753 configuration. "Whole line" stops asking for unknown words on
   1754 this line and prints the line out as it originally was in the
   1755 file. Note that any words in the original line that contain
   1756 B<choicesep> will still cause the L<word choice window|/"WORD CHOICE WINDOW">
   1757 to appear due to the way it is implemented. Just press "Skip word"
   1758 if that happens.
   1759 
   1760 =item Retry without <display name>
   1761 
   1762 Removes all characters specified in the corresponding B<retrywithout>
   1763 statement in the L<config|/"CONFIGURATION">
   1764 from the currently selected word and re-transliterates just that
   1765 word. The result is then pasted into the text box beside
   1766 "Add replacement" so it can be added to a table. This is only a
   1767 sort of helper for languages like Urdu in which words often can
   1768 be written with or without diacritics. If the "base form" without
   1769 diacritics is already in the tables, this button can be used to
   1770 quickly find the transliteration instead of having to type it
   1771 out again. Any part of the word that couldn't be transliterated
   1772 is just pasted verbatim into the text box (but after the
   1773 characters have been removed).
   1774 
   1775 Note that the selection can still be modified after this, before
   1776 pressing "Add to list". This could potentially be useful if a word
   1777 is in a table that is expanded using "noroot" because for instance
   1778 "Retry without diacritics" would only work with the full word (with
   1779 the ending), but only the stem should be added to the list. If that
   1780 is the case, "Retry without diacritics" could be pressed with the
   1781 whole word selected, but the ending could be removed before actually
   1782 pressing "Add to list".
   1783 
   1784 A separate button is shown for every B<retrywithout> statement
   1785 in the config.
   1786 
   1787 =item Add to list
   1788 
   1789 Adds the word typed in the text box beside "Add replacement" to the
   1790 selected table file as the replacement for the word currently selected
   1791 and re-runs the replacement on the current line. All table files that
   1792 do not have B<nodisplay> set are shown as options, see L</"CONFIGURATION">.
   1793 
   1794 Warning: This simply appends the word and its replacement to the end
   1795 of the file, so it will cause an error if there was no newline
   1796 ("\n") at the end of the file before.
   1797 
   1798 Note that this always re-transliterates the entire line afterwards.
   1799 This is to allow more flexibility. Consider, for instance, a compound
   1800 word of which the first part is also a valid single word. If the
   1801 entire line was not re-transliterated, it would be impossible to
   1802 add a replacement for that entire compound word and have it take
   1803 effect during the same run since the first part of the word would
   1804 not even be available for transliteration anymore.
   1805 
   1806 One problem is that the word is just written directly to the file
   1807 and there is no undo. This is the way it currently is and will
   1808 probably not change very soon. If a mistake is made, the word can
   1809 always be removed again manually from the list and "Reload config"
   1810 pressed.
   1811 
   1812 =item Reload config
   1813 
   1814 Reload the configuration file along with all tables an re-runs the
   1815 replacement on the current line. Note that this can take a short
   1816 while since the entire word database has to be reloaded.
   1817 
   1818 =item Stop processing
   1819 
   1820 Prints the current line number to the terminal and exits the program.
   1821 
   1822 The program can always be started again at this line number using
   1823 the B<--start> option if needed.
   1824 
   1825 =back
   1826 
   1827 =head1 INTERNALS/EXAMPLES
   1828 
   1829 This section was added to explain to the user how the transliteration
   1830 process works internally since that may be necessary to understand
   1831 why certain words are replaced the way they are.
   1832 
   1833 First off, the process works line-by-line, i.e. no B<match> or B<replace>
   1834 statement will ever match anything that crosses the end of a line.
   1835 
   1836 Each line is initially stored as one chunk which is marked as
   1837 untransliterated. Then, all B<match>, B<matchignore>, and B<replace>
   1838 (or, rather, B<group>) statements are executed in the order they
   1839 appear in the config file. Whenever a word/match is replaced, it
   1840 is split off into a separate chunk which is marked as transliterated.
   1841 A chunk marked as transliterated I<is entirely ignored by any
   1842 replacement statements that come afterwards>. Note that B<beginword>
   1843 and B<endword> can always match at the boundary between an
   1844 untransliterated and transliterated chunk. This is to facilitate
   1845 automated replacement of certain grammatical constructions. For instance:
   1846 
   1847 If the string "a-" could be attached as a prefix to any word and needed
   1848 to be replaced as "b-" everywhere, it would be quite trivial to add
   1849 a match statement C<'match "a-" "b-" beginword'>. If run on the text
   1850 "a-word", where "word" is some word that should be transliterated
   1851 as "word_replaced", and the group replace statement for the word comes
   1852 after the match statement given above, the following would happen:
   1853 First, the match statement would replace "a-" and split the text into
   1854 the two chunks "b-" and "word", where "b-" is already marked as
   1855 transliterated. Since "word" is now separate, it will be matched
   1856 by the group replace statement later, even if it has B<beginword> set
   1857 and would normally not match if "a-" came before it. Thus, the final
   1858 output will be "b-word_replaced", allowing for the uniform replacement
   1859 of the prefix instead of having to add each word twice, once with and
   1860 once without the prefix.
   1861 
   1862 In certain cases, this behavior may not be desired. Consider, for
   1863 instance, a prefix "c-" which cannot be replaced uniformly as in the
   1864 example above due to differences in the source and destination script.
   1865 Since it cannot be replaced uniformly, two words "word1" and "word2"
   1866 would both need to be specified separately with replacements for
   1867 "c-word1" and "c-word2". If, however, the prefix "c-" has an
   1868 alternate spelling "c " (without the hyphen), it would be very useful
   1869 to be able to automatically recognize that as well. This is where the
   1870 B<nofinal> attribute for the B<match> statements comes in. If there is
   1871 a match statement C<'match "c " "c-" beginword nofinal'>, the replaced
   1872 chunk is B<not> marked as transliterated, so after executing this
   1873 statement on the text "c word1", there will still only be one chunk,
   1874 "c-word1", allowing for the regular word replacements to function
   1875 properly.
   1876 
   1877 Once all the replacement statements have been processed, each chunk
   1878 of text that is not marked as transliterated yet is split based on
   1879 the B<split> pattern specified in the config and all actual characters
   1880 matched by the B<split> pattern are marked as transliterated (this
   1881 usually means all the spaces, newlines, quotation marks, etc.). Any
   1882 remaining words/text chunks that are still marked as untransliterated are
   1883 now processed by the unknown word window. If one of these remaining
   1884 unknown chunks is present in the file specified by the B<ignore>
   1885 statement in the config, it is simply ignored and later printed out
   1886 as is. After all untransliterated words have either had a replacement
   1887 added or been ignored, any words with multiple replacement choices are
   1888 processed by the word choice window. Once this is all done, the final
   1889 output is written to the output file and the process is repeated with
   1890 the next line. Note that the entire process is started again each time
   1891 a word is added to a table or the config is reloaded from the
   1892 L<unknown word window|/"UNKNOWN WORD WINDOW">.
   1893 
   1894 =head1 CONFIGURATION
   1895 
   1896 These are the commands accepted in the configuration file.
   1897 Any parameters in square brackets are optional.
   1898 Comments are started with C<#>. Strings (filenames, regex strings, etc.)
   1899 are enclosed in double quotes ("").
   1900 
   1901 The B<match>, B<matchignore>, and B<replace> commands are executed in
   1902 the order they are specified, except that all B<replace> commands within
   1903 the same group are replaced together.
   1904 
   1905 The B<match> and B<matchignore> statements accept any RegEx strings and
   1906 are thus very powerful. The B<group> statements only work with the
   1907 non-RegEx words from the tables, but are very efficient for large numbers
   1908 of words and should thus be used for the main bulk of the words.
   1909 
   1910 Any duplicate words found will cause the user to be prompted to choose
   1911 one option every time the word is replaced in the input text.
   1912 
   1913 Note that any regex strings specified in the config should B<not>
   1914 contain capture groups, as that would break the B<endword> functionality
   1915 since this is also implemented internally using capture groups. Capture
   1916 groups are also entirely pointless in the config since they currently
   1917 cannot be used as part of the replacement string in B<match> statements.
   1918 Lookaheads and lookbehinds are fine, though, and could be useful in
   1919 certain cases.
   1920 
   1921 All tables must be loaded before they are used, or there will be an
   1922 error that the table does not exist.
   1923 
   1924 Warning: If a B<replace> statement is located before an B<expand>
   1925 statement that would have impacted the table used, there will be no
   1926 error but the expand statement won't have any impact.
   1927 
   1928 Basic rule of thumb: Always put the B<table> statements before the
   1929 B<expand> statements and the B<expand> statements before the B<replace>
   1930 statements.
   1931 
   1932 =over 8
   1933 
   1934 =item B<split> <regex string>
   1935 
   1936 Sets the RegEx string to be used for splitting words. This is only used
   1937 for splitting the words which couldn't be replaced after all replacement
   1938 has been done, before prompting the user for unknown words.
   1939 
   1940 Note that B<split> should probably always contain at least C<\n>, since
   1941 otherwise all of the newlines will be marked as unknown words. Usually,
   1942 this will be included anyways through C<\s>.
   1943 
   1944 Note also that B<split> should probably include the C<+> RegEx-quantifier
   1945 since that allows the splitting function in the end to ignore several
   1946 splitting characters right after each other (e.g. several spaces) in one
   1947 go instead of splitting the string again for every single one of them.
   1948 This shouldn't actually make any difference functionality-wise, though.
   1949 
   1950 B<Default:> C<\s+> (all whitespace)
   1951 
   1952 =item B<beforeword> <regex string>
   1953 
   1954 Sets the RegEx string to be matched before a word if B<beginword> is set.
   1955 
   1956 B<Default:> C<\s>
   1957 
   1958 =item B<afterword> <regex string>
   1959 
   1960 Sets the RegEx string to be matched after a word if B<endword> is set.
   1961 
   1962 Note that B<afterword> should probably always contain at least C<\n>,
   1963 since otherwise words with B<endword> set will not be matched at the
   1964 end of a line.
   1965 
   1966 B<beforeword> and B<afterword> will often be exactly the same, but
   1967 they are left as separate options in case more fine-tuning is needed.
   1968 
   1969 B<Default:> C<\s>
   1970 
   1971 =item B<tablesep> <string>
   1972 
   1973 Sets the separator used to split the lines in the table files into the
   1974 original and replacement word.
   1975 
   1976 B<Default:> C<Tab>
   1977 
   1978 =item B<choicesep> <string>
   1979 
   1980 Sets the separator used to split replacement words into multiple choices for
   1981 prompting the user.
   1982 
   1983 B<Default:> C<$>
   1984 
   1985 =item B<comment> <string>
   1986 
   1987 If enabled, anything after C<< <string> >> will be ignored on all lines in
   1988 the input file. This will not be displayed in the
   1989 L<unknown word window|/"UNKNOWN WORD WINDOW"> or L<word choice window|/"WORD CHOICE WINDOW">
   1990 but will still be printed in the end, with the comment character removed
   1991 (that seems to be the most sensible thing to do).
   1992 
   1993 Note that this is really just a "dumb replacement", so there's no way to
   1994 prevent a line with the comment character from being ignored. Just try
   1995 to always set this to a character that does not occur anywhere in the text
   1996 (or don't use the option at all).
   1997 
   1998 =item B<ignore> <filename>
   1999 
   2000 Sets the file of words to ignore.
   2001 
   2002 This has to be set even if the file is just empty because the user can
   2003 add words to it from the unknown word window.
   2004 
   2005 =item B<table> <table identifier> <filename> [nodisplay] [revert]
   2006 
   2007 Load the table from C<< <filename> >>, making it available for later use in the
   2008 B<expand> and B<replace> commands using the identifier C<< <table identifier> >>.
   2009 
   2010 if B<nodisplay> is set, the filename for this table is not shown in the
   2011 L<unknown word window|/"UNKNOWN WORD WINDOW">. If, however, the same filename
   2012 is loaded again for another table that does not have B<nodisplay> set, it is
   2013 still displayed.
   2014 
   2015 If B<revert> is set, the original and replacement words are switched. This can
   2016 be useful for creating a config for transliterating in the opposite direction
   2017 with the same database. I don't know why I called it "revert" since it should
   2018 actually be called "reverse". I guess I was a bit confused.
   2019 
   2020 Note that if C<< <filename> >> is not an absolute path, it is taken to be relative
   2021 to the location of the configuration file.
   2022 
   2023 The table files simply consist of B<tablesep>-separated values, with the word in the
   2024 original script first and the replacement word second. Both the original and
   2025 replacement word can optionally have several parts separated by B<choicesep>. If the
   2026 original word has multiple parts, it is separated and each of the parts is added
   2027 to the table with the replacement. If the replacement has multiple parts, the user
   2028 will be prompted to choose one of the options during the transliteration process.
   2029 If the same word occurs multiple times in the same table with different replacements,
   2030 the replacements are automatically added as choices that will be handled by the
   2031 L<word choice window|/"WORD CHOICE WINDOW">.
   2032 
   2033 If, for whatever reason, the same table is needed twice, but with different endings,
   2034 the table can simply be loaded twice with different IDs. If the same path is loaded,
   2035 the table that has already been loaded will be reused. Note that this feature was
   2036 added before adding B<revert>, so the old table is used even if it had B<revert>
   2037 set and the new one doesn't. This is technically a problem, but I don't know of
   2038 any real-world case where it would be a problem, so I'm too lazy to change it.
   2039 Tell me if it actually becomes a problem for you.
   2040 
   2041 WARNING: Don't load the same table file both with and without B<revert> in the same
   2042 config! When a replacement word is added through the GUI, the program has to know
   2043 which way to write the words. Currently, whenever a table file is loaded with
   2044 B<revert> anywhere in the config (even if it is loaded without B<revert> in a
   2045 different place), words will automatically be written as if B<revert> was on. I
   2046 cannot currently think of any reason why someone would want to load a file both
   2047 with and without B<revert> in the same config, but I still wanted to add this
   2048 warning just in case.
   2049 
   2050 =item B<expand> <table identifier> <word ending table> [noroot]
   2051 
   2052 Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
   2053 the word endings in C<< <word ending table> >>, saving the result as a table with the
   2054 identifier C<< <new table identifier> >>.
   2055 
   2056 Note: There used to be a C<< <new table identifier> >> argument to create a new
   2057 table in case one table had to be expanded with different endings. This has been
   2058 removed because it was a bit ugly, especially since there wasn't a proper mapping
   2059 from table IDs to filenames anymore. If this functionality is needed, the same table
   2060 file can simply be loaded multiple times. See the B<table> section above.
   2061 
   2062 If B<noroot> is set, the root forms of the words are not kept.
   2063 
   2064 If the replacement for a word ending contains B<choicesep>, it is split and each part
   2065 is combined with the root form separately and the user is prompted to choose one of
   2066 the options later. it is thus possible to allow multiple choices for the ending if
   2067 there is a distinction in the replacement script but not in the source script.
   2068 Note that each of the root words is also split into its choices (if necessary)
   2069 during the expanding, so it is possible to use B<choicesep> in both the endings
   2070 and root words.
   2071 
   2072 =item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]
   2073 
   2074 Perform a RegEx match using the given C<< <regex string> >>, replacing it with
   2075 C<< <replacement string> >>. Note that the replacement cannot contain any RegEx
   2076 (e.g. groups) in it. B<beginword> and B<endword> specify whether the match must
   2077 be at the beginning or ending of a word, respectively, using the RegEx specified
   2078 in B<beforeword> and B<afterword>. If B<nofinal> is set, the string is not marked
   2079 as transliterated after the replacement, allowing it to be modified by subsequent
   2080 B<match> or B<replace> commands.
   2081 
   2082 =item B<matchignore> <regex string> [beginword] [endword]
   2083 
   2084 Performs a RegEx match in the same manner as B<match>, except that the original
   2085 match is used as the replacement instead of specifying a replacement string, i.e.
   2086 whatever is matched is just marked as transliterated without changing it.
   2087 
   2088 =item B<group> [beginword] [endword]
   2089 
   2090 Begins a replacement group. All B<replace> commands must occur between B<group>
   2091 and B<endgroup>, since they are then grouped together and replaced in one go.
   2092 B<beginword> and B<endword> act in the same way as specified for B<match> and
   2093 apply to all B<replace> statements in this group.
   2094 
   2095 =item B<replace> <table identifier> [override]
   2096 
   2097 Replace all words in the table with the identifier C<< <table identifier> >>,
   2098 using the B<beginword> and B<endword> settings specified by the current group.
   2099 
   2100 Unless B<override> is set on the latter table, if the same word occurs in two
   2101 tables with different replacements, both are automatically added as choices.
   2102 See L</"WORD CHOICE WINDOW">.
   2103 
   2104 B<override> can be useful if the same database is used for both directions and
   2105 one direction maps multiple words to one word, but in the other direction this
   2106 word should always default to one of the choices. In that case, a small table
   2107 with these special cases can be created and put at the end of the main B<group>
   2108 statement with B<override> set. This is technically redundant since you could
   2109 just add a special group with only the override table in it earlier in the
   2110 config, but it somehow seems cleaner this way.
   2111 
   2112 Note that a table must have been loaded before being used in a B<replace> statement.
   2113 
   2114 =item B<endgroup>
   2115 
   2116 End a replacement group.
   2117 
   2118 =item B<retrywithout> <display name> [character] [...]
   2119 
   2120 Adds a button to the L<unknown word window|/"UNKNOWN WORD WINDOW"> to retry the
   2121 replacements on the selected word, first removing the given characters.
   2122 The button is named "<display name>" and located after the "Retry without" label.
   2123 Whatever is found with the replacements is pasted into the regular text box for
   2124 the "Add replacement" functionality.
   2125 
   2126 This can be used as an aid when, for instance, words can be written with or without
   2127 certain diacritics. If the actual word without diacritics is already in the
   2128 database and there is a B<retrywithout> statement for all the diacritics, the
   2129 button can be used to quickly find the replacement for the word instead of having
   2130 to type it out manually. The same goes for compound words that can be written
   2131 with or without a space.
   2132 
   2133 It is also possible to specify B<retrywithout> without any characters, which just
   2134 adds a button that takes whatever word is selected and retries the replacements
   2135 on it. This can be useful if you want to manually edit words and quickly see if
   2136 they are found with the edits in place.
   2137 
   2138 Note that all input text is first normalized to the unicode canonical decomposition
   2139 form so that diacritics can be removed individually.
   2140 
   2141 Also note that all buttons are currently just dumped in the GUI without any
   2142 sort of wrapping, so they'll run off the screen if there are too many.
   2143 Tell me if this becomes a problem. I'm just too lazy to change it right now.
   2144 
   2145 Small warning: This only removes the given characters from the word selected in
   2146 the GUI, not from the tables. Thus, this only works if the version of the word
   2147 without any of the characters is already present in the tables. It would be useful
   2148 when handling diacritics if the program could simply make a comparison while
   2149 completely ignoring diacritics, but I haven't figured out a nice way to implement
   2150 that yet.
   2151 
   2152 Historical note: This was called B<diacritics> in a previous version and only
   2153 allowed removal of diacritics. This is exactly the same functionality, just
   2154 generalized to allow removal of any characters with different buttons.
   2155 
   2156 =item B<targetdiacritics> <diacritic> [...]
   2157 
   2158 This was only added to simplify transliteration from Hindi to Urdu with the
   2159 same database. When this is set, the choices in the
   2160 L<word choice window|/"WORD CHOICE WINDOW"> are sorted in descending order
   2161 based on the number of diacritics from this list that are matched in each
   2162 choice. This is so that when transliterating from Hindi to Urdu, the choice
   2163 with the most diacritics is always at the top.
   2164 
   2165 Additionally, if there are I<exactly> two choices for a word and one of
   2166 them contains diacritics but the other one doesn't, the one containing
   2167 diacritics is automatically taken without ever prompting the user. This
   2168 is, admittedly, a very language-specific feature, but I couldn't think of
   2169 a simple way of adding it without building it directly into the actual program.
   2170 
   2171 Note that due to the way this is implemented, it will not take any effect
   2172 if B<--nochoices> is enabled.
   2173 
   2174 The attentive reader will notice at this point that most of the features
   2175 in this program were added specifically for dealing with Urdu and Hindi,
   2176 which does appear to make sense, considering that this program was written
   2177 specifically for transliterating Urdu to Hindi and vice versa (although
   2178 not quite as much vice versa).
   2179 
   2180 =back
   2181 
   2182 =head1 BUGS
   2183 
   2184 Although it may not seem like it, one of the ugliest parts of the program is the
   2185 GUI functionality that allows the user to add a replacement word. The problem is
   2186 that all information about the B<expand> and B<replace> statements has to be kept
   2187 in order to properly handle adding a word to one of the files and simultaneously
   2188 adding it to the currently loaded tables I<without reloading the entire config>.
   2189 The way it currently works, the replacement word is directly written to the file,
   2190 then all B<expand> statements that would have impacted the words from this file
   2191 are redone (just for the newly added word) and the resulting words are added to
   2192 the appropriate tables (or, technically, the appropriate 'trie'). Since a file
   2193 can be mapped to multiple table IDs and a table ID can occur in multiple replace
   2194 statements, this is more complicated than it sounds, and thus it is very likely
   2195 that there are bugs lurking here somewhere. Do note that "Reload config" will
   2196 B<always> reload the entire configuration, so that's safe to do even if the
   2197 on-the-fly replacing doesn't work.
   2198 
   2199 In general, I have tested the GUI code much less than the rest since you can't
   2200 really test it automatically very well.
   2201 
   2202 The code is generally quite nasty, especially the parts belonging to the GUI.
   2203 Don't look at it.
   2204 
   2205 Tell me if you find any bugs.
   2206 
   2207 =head1 SEE ALSO
   2208 
   2209 perlre, perlretut
   2210 
   2211 =head1 LICENSE
   2212 
   2213 Copyright (c) 2019, 2020, 2021 lumidify <nobody[at]lumidify.org>
   2214 
   2215 Permission to use, copy, modify, and/or distribute this software for any
   2216 purpose with or without fee is hereby granted, provided that the above
   2217 copyright notice and this permission notice appear in all copies.
   2218 
   2219 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   2220 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   2221 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
   2222 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
   2223 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   2224 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   2225 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
   2226 
   2227 =cut