Copy data from old repository - transliterate

commit 410d20484d0ceda89cfcad455b727bac02d071d9
Author: lumidify <nobody@lumidify.org>
Date:   Thu, 26 Mar 2020 07:37:37 +0100

Copy data from old repository

Diffstat:
A .gitignore  | 1 +
A README  | 5 +++++
A tests/alltests.sh  | 4 ++++
A tests/runtest.sh  | 13 +++++++++++++
A tests/test1/config  | 15 +++++++++++++++
A tests/test1/descr.txt  | 1 +
A tests/test1/err.txt  | 6 ++++++
A tests/test1/expected.txt  | 6 ++++++
A tests/test1/input.txt  | 6 ++++++
A tests/test2/config  | 19 +++++++++++++++++++
A tests/test2/descr.txt  | 1 +
A tests/test2/err.txt  | 3 +++
A tests/test2/expected.txt  | 2 ++
A tests/test2/input.txt  | 2 ++
A transliterate.pl  | 1605 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

15 files changed, 1689 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/README b/README
@@ -0,0 +1,5 @@
+See the perldoc in transliterate.pl for documentation (run perldoc -F transliterate.pl).
+
+The git history on this repository is not complete because the original
+repository contained a lot of private information that I didn't want to
+make available here.
diff --git a/tests/alltests.sh b/tests/alltests.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+./runtest.sh test1
+./runtest.sh test2
diff --git a/tests/runtest.sh b/tests/runtest.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debug "$1/input.txt" > runtest_err.txt 2>&1
+diff "$1/expected.txt" runtest.txt > /dev/null
+err1=$?
+diff "$1/err.txt" runtest_err.txt > /dev/null
+if [ $? -eq 0 -a $err1 -eq 0 ]; then
+  echo "OK `cat $1/descr.txt`"
+else
+  echo "FAILED `cat $1/descr.txt`"
+fi
+rm runtest.txt
+rm runtest_err.txt
diff --git a/tests/test1/config b/tests/test1/config
@@ -0,0 +1,15 @@
+split "[ \n]"
+beforeword " "
+afterword "[ \n]"
+
+ignore "../data/ignore.txt"
+table words "../data/words.txt"
+table endings "../data/endings.txt"
+
+expand words same endings
+
+match "\d+" "num_replaced" beginword
+
+group beginword endword
+replace words
+endgroup
diff --git a/tests/test1/descr.txt b/tests/test1/descr.txt
@@ -0,0 +1 @@
+Basic test
diff --git a/tests/test1/err.txt b/tests/test1/err.txt
@@ -0,0 +1,6 @@
+Unknown word: "word20"
+Unknown word: "word01231"
+Word "word0_replaced$word0_replaced2" with 2 word choices.
+Unknown word: "aword1"
+Unknown word: "end3"
+Word "word0_replacedend3_replaced$word0_replaced2end3_replaced" with 2 word choices.
diff --git a/tests/test1/expected.txt b/tests/test1/expected.txt
@@ -0,0 +1,6 @@
+word1_replaced word2_replaced
+num_replacedword1_replaced       word9_replaced   num_replaced   word4_replaced
+word20 word01231    word0_replaced$word0_replaced2
+aword1
+word1_replacedend1_replaced  word0_replacedend3_replaced$word0_replaced2end3_replaced   end3
+num_replacedword2_replacedend1_replaced
diff --git a/tests/test1/input.txt b/tests/test1/input.txt
@@ -0,0 +1,6 @@
+word1 word2
+123word1       word9   123   word4
+word20 word01231    word0
+aword1
+word1end1  word0end3   end3
+432word2end1
diff --git a/tests/test2/config b/tests/test2/config
@@ -0,0 +1,19 @@
+split "[ \n]+"
+beforeword " "
+afterword "[ \n]"
+tablesep ","
+choicesep "|"
+
+ignore "../data/ignore.txt"
+table words "../data/words1.txt"
+table endings "../data/endings_choices.txt"
+
+expand words other_name endings
+
+match "-\d" "-r" endword
+match "w ord" "word" beginword nofinal
+matchignore "\dhi\d"
+
+group beginword endword
+replace other_name
+endgroup
diff --git a/tests/test2/descr.txt b/tests/test2/descr.txt
@@ -0,0 +1 @@
+Different tablesep and linesep; match nofinal; ending choices
diff --git a/tests/test2/err.txt b/tests/test2/err.txt
@@ -0,0 +1,3 @@
+Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices.
+Unknown word: "4"
+Word "word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2" with 4 word choices.
diff --git a/tests/test2/expected.txt b/tests/test2/expected.txt
@@ -0,0 +1,2 @@
+5hi3 word9_replaced-r   word1_replacedend1r1|word1_replacedend1r2-r
+45hi1 word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2
diff --git a/tests/test2/input.txt b/tests/test2/input.txt
@@ -0,0 +1,2 @@
+5hi3 w ord9-0   w ord1end1-1
+45hi1 word0end1
diff --git a/transliterate.pl b/transliterate.pl
@@ -0,0 +1,1605 @@
+#!/usr/bin/env perl
+
+# Proudly written using vi (OpenBSD nvi)
+
+# NOTE: If you're wondering why the error codes used by the functions are so
+# inconsistent, go ask my former self
+
+use strict;
+use warnings;
+use utf8;
+use feature 'unicode_strings';
+use open qw< :encoding(UTF-8) >;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+use Unicode::Normalize;
+use Glib qw/TRUE FALSE/;
+use Gtk2 '-init';
+use Getopt::Long;
+use Pod::Usage;
+use Scalar::Util qw(weaken);
+use File::Basename qw(dirname);
+use File::Spec::Functions qw(rel2abs file_name_is_absolute);
+
+# takes a string of words separated by '$' and returns a new string in the
+# same format with duplicates removed
+sub get_unique_words {
+	my ($word, $config) = @_;
+	my %tmp;
+	my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word;
+	return join $config->{choicesep}, @words_uniq;
+}
+
+# Adds all words in $words to $trie
+# Automatically combines duplicate words with "$" inbetween
+sub add_to_trie {
+	my ($table_name, $trie, $words, $args, $config) = @_;
+	foreach my $word (keys %$words) {
+		my $cur_node = $trie;
+		foreach my $char (split //, $word) {
+			if (!exists($cur_node->{$char})) {
+				$cur_node->{$char}->{"parent"} = $cur_node;
+				# This is required to avoid circular references
+				# (otherwise, the garbage collector doesn't ever
+				# destroy these nodes, leading to the memory
+				# consumption growing without restraint if
+				# "Reload config" is used)
+				weaken($cur_node->{$char}->{"parent"});
+			}
+			$cur_node = $cur_node->{$char};
+		}
+		if (exists($cur_node->{"final"})) {
+			if ($args->{"checkduplicates"}) {
+				warn "WARNING: Duplicate word \"$word\". Last occurrence as " .
+					"\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " .
+					"current occurrence as \"$words->{$word}\" in " .
+					"table \"$table_name\.\n";
+			}
+			$cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config);
+		} else {
+			$cur_node->{"final"} = $words->{$word};
+			if ($args->{"checkduplicates"}) {
+				$cur_node->{"table_name"} = $table_name;
+			}
+		}
+	}
+}
+
+# Prompt user when no replacement has been found for a word
+# $word is the word that was not found and $context* the context,
+# with context*_orig being the original, non-transliterated context.
+# $table_paths is a mapping of table paths (here, only the keys, i.e.
+# the actual paths, are used) to allow the user to choose a table to
+# save a new replacement to.
+# $cur_lineno is a display string to show the current line number
+# $config_error is an optional flag to specify whether an error
+# message should be displayed, informing the user that the config
+# could not be loaded (used when "Reload config" is clicked)
+# Returns: an array reference containing an action to be taken,
+# in the form ["<action name>", <optional args>].
+# See `handle_unknown_word_action` for currently accepted values
+sub prompt_unknown_word {
+	my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $table_paths, $cur_lineno, $config_error) = @_;
+	my $action;
+	my $stop = 0;
+
+	my $window = Gtk2::Window->new('toplevel');
+	$window->signal_connect(delete_event => sub {return FALSE});
+	$window->signal_connect(destroy => sub { Gtk2->main_quit; });
+	$window->set_border_width(10);
+
+	my $vbox = Gtk2::VBox->new(FALSE, 0);
+
+	my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
+	$vbox->pack_start($linelabel, FALSE, FALSE, 10);
+	$linelabel->show;
+
+	my $wordlabel = Gtk2::Label->new("Word not found: $word");
+	$wordlabel->set_alignment(0.0, 0.0);
+	$vbox->pack_start($wordlabel, FALSE, FALSE, 10);
+	$wordlabel->show;
+
+	# Make a text box with the given left and right context and label
+	# Also creates a button allowing the user to set the currently
+	# selected text as the word to be replaced - useful when only part
+	# of the entire word that was not found has to be replaced
+	my $make_context_box = sub {
+		my ($ctxtl, $ctxtr, $lbl) = @_;
+		my $hbox = Gtk2::HBox->new(FALSE, 0);
+		my $label = Gtk2::Label->new($lbl);
+		my $text = Gtk2::TextView->new;
+		my $buffer = $text->get_buffer();
+		$buffer->set_text($ctxtr);
+		my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
+		my $start = $buffer->get_start_iter();
+		$buffer->insert_with_tags($start, $word, $highlight);
+		$start = $buffer->get_start_iter();
+		$buffer->insert($start, $ctxtl);
+		my $button = Gtk2::Button->new("Use selection as word");
+		$button->signal_connect(
+			clicked => sub {
+				if (my ($start, $end) = $buffer->get_selection_bounds()) {
+					$word = $buffer->get_text($start, $end, FALSE);
+					$wordlabel->set_text("Selected: $word");
+				}
+			}, $window);
+		$hbox->pack_start($label, FALSE, FALSE, 0);
+		$hbox->pack_start($text, FALSE, FALSE, 10);
+		$label->show;
+		$text->show;
+		$vbox->pack_start($hbox, FALSE, FALSE, 10);
+		$hbox->show;
+		$hbox = Gtk2::HBox->new(FALSE, 0);
+		$hbox->pack_start($button, FALSE, FALSE, 0);
+		$button->show;
+		$vbox->pack_start($hbox, FALSE, FALSE, 0);
+		$hbox->show;
+	};
+	$make_context_box->($contextl, $contextr, "Context: ");
+	$make_context_box->($contextl_orig, $contextr_orig, "Original: ");
+
+	my $hbox = Gtk2::HBox->new(FALSE, 0);
+	my $label = Gtk2::Label->new("Ignore: ");
+	$hbox->pack_start($label, FALSE, FALSE, 0);
+	$label->show;
+	my $button = Gtk2::Button->new("This run");
+	$button->signal_connect(
+		clicked => sub {
+			$action = ["ignore", "run", $word];
+			$window->destroy;
+		}, $window);
+	$hbox->pack_start($button, FALSE, FALSE, 0);
+	$button->show;
+	$button = Gtk2::Button->new("Permanently");
+	$button->signal_connect(
+		clicked => sub {
+			$action = ["ignore", "permanent", $word];
+			$window->destroy;
+		}, $window);
+	$hbox->pack_start($button, FALSE, FALSE, 0);
+	$button->show;
+	$vbox->pack_start($hbox, FALSE, FALSE, 10);
+	$hbox->show;
+
+	$hbox = Gtk2::HBox->new(FALSE, 0);
+	$label = Gtk2::Label->new("Add to list: ");
+	$hbox->pack_start($label, FALSE, FALSE, 0);
+	$label->show;
+	my $path_list = Gtk2::ComboBox->new_text;
+	foreach my $path (keys %$table_paths) {
+		$path_list->append_text($path);
+	}
+	$hbox->pack_start($path_list, FALSE, FALSE, 0);
+	$path_list->show;
+	$vbox->pack_start($hbox, FALSE, FALSE, 10);
+	$hbox->show;
+
+	$hbox = Gtk2::HBox->new(FALSE, 0);
+	$label = Gtk2::Label->new("Replacement: ");
+	$hbox->pack_start($label, FALSE, FALSE, 0);
+	$label->show;
+	my $replace_entry = Gtk2::Entry->new;
+	$hbox->pack_start($replace_entry, FALSE, FALSE, 0);
+	$replace_entry->show;
+	$vbox->pack_start($hbox, FALSE, FALSE, 0);
+	$hbox->show;
+
+	$hbox = Gtk2::HBox->new(FALSE, 0);
+	$button = Gtk2::Button->new("Add replacement");
+	$button->signal_connect(
+		clicked => sub {
+			if ($path_list->get_active != -1) {
+				$action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text];
+				$window->destroy;
+			}
+		}, $window);
+	$hbox->pack_start($button, FALSE, FALSE, 0);
+	$button->show;
+	$vbox->pack_start($hbox, FALSE, FALSE, 0);
+	$hbox->show;
+
+	$hbox = Gtk2::HBox->new(FALSE, 0);
+	$button = Gtk2::Button->new("Stop processing");
+	$button->signal_connect(
+		clicked => sub {
+			$stop = 1;
+			$window->destroy;
+		}, $window);
+	$hbox->pack_start($button, FALSE, FALSE, 0);
+	$button->show;
+	$vbox->pack_start($hbox, FALSE, FALSE, 0);
+	$hbox->show;
+
+	$hbox = Gtk2::HBox->new(FALSE, 0);
+	$button = Gtk2::Button->new("Reload config");
+	$button->signal_connect(
+		clicked => sub {
+			$action = ["reload"];
+			$window->destroy;
+		}, $window);
+	$hbox->pack_start($button, FALSE, FALSE, 0);
+	$button->show;
+
+	if ($config_error) {
+		$label = Gtk2::Label->new("Error loading config; see terminal output for details");
+		$hbox->pack_start($label, FALSE, FALSE, 0);
+		$label->show;
+	}
+	$vbox->pack_start($hbox, FALSE, FALSE, 0);
+	$hbox->show;
+
+	$window->add($vbox);
+	$vbox->show;
+	$window->show;
+	Gtk2->main;
+
+	if ($stop) {
+		die "Processing stopped at line $cur_lineno";
+	}
+	if (!$action) {
+		# This action isn't actually handled - it doesn't make much sense,
+		# but at least nothing breaks when the window is closed without
+		# selecting an action
+		$action = ["ignore", "once", $word];
+	}
+	return $action;
+}
+
+# Prompt the user when a word has multiple replacement options (separated by $)
+# $cur_lineno - display string to show the current line number
+sub prompt_choose_word {
+	my ($substrings, $cur_lineno, $config) = @_;
+
+	# make a list of all substrings that contain multiple word options
+	my @replacements;
+	foreach (0..$#$substrings) {
+		if ($substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
+			# Format of the elements in @replacements:
+			# [<id of substrings in $substrings>, <replacement word>, <original string>]
+			push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]];
+		}
+	}
+	# no substrings have multiple options
+	return if (!@replacements);
+
+	my $stop = 0;
+	my $cur_replacement = 0;
+
+	my $window = Gtk2::Window->new('toplevel');
+	$window->signal_connect(delete_event => sub {return FALSE});
+	$window->signal_connect(destroy => sub { Gtk2->main_quit; });
+	$window->set_border_width(10);
+
+	my $vbox = Gtk2::VBox->new(FALSE, 0);
+
+	my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
+	$vbox->pack_start($linelabel, FALSE, FALSE, 0);
+	$linelabel->show;
+
+	my $wordlabel = Gtk2::Label->new("");
+	$wordlabel->set_alignment(0.0, 0.0);
+	$vbox->pack_start($wordlabel, FALSE, FALSE, 0);
+	$wordlabel->show;
+
+	my $undo = Gtk2::Button->new("Undo");
+	$vbox->pack_start($undo, FALSE, FALSE, 0);
+	$undo->show;
+	$undo->set_sensitive(FALSE);
+
+	my $button_vbox = Gtk2::VBox->new(FALSE, 0);
+	$vbox->pack_start($button_vbox, FALSE, FALSE, 0);
+	$button_vbox->show;
+
+	my $accept = Gtk2::Button->new("Accept changes?");
+	$vbox->pack_start($accept, FALSE, FALSE, 0);
+
+	my $hbox = Gtk2::HBox->new(FALSE, 0);
+	my $label = Gtk2::Label->new("Context: ");
+	my $text = Gtk2::TextView->new;
+	my $buffer = $text->get_buffer();
+	my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
+	$text->set_editable(FALSE);
+	$hbox->pack_start($label, FALSE, FALSE, 0);
+	$hbox->pack_start($text, FALSE, FALSE, 10);
+	$label->show;
+	$text->show;
+	$vbox->pack_start($hbox, FALSE, FALSE, 10);
+	$hbox->show;
+
+	$hbox = Gtk2::HBox->new(FALSE, 0);
+	my $stop_button = Gtk2::Button->new("Stop processing");
+	$hbox->pack_start($stop_button, FALSE, FALSE, 0);
+	$stop_button->show;
+	$vbox->pack_start($hbox, FALSE, FALSE, 0);
+	$hbox->show;
+
+	$window->add($vbox);
+	$vbox->show;
+	$window->show;
+
+	# generate the context to the left and to the right of the current word being replaced
+	my $get_context = sub {
+		my ($contextl, $contextr) = ("", "");
+		my $tmp_replacement = 0;
+		foreach (0..$#$substrings) {
+			my $word = $substrings->[$_]->[1];
+			if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) {
+				$word = $replacements[$tmp_replacement]->[1];
+				$tmp_replacement++;
+			}
+			# When nothing is left to replace, the entire string is in $contextl
+			if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) {
+				$contextl .= $word;
+			} elsif ($_ > $replacements[$cur_replacement]->[0]) {
+				$contextr .= $word;
+			}
+		}
+		return ($contextl, $contextr);
+	};
+
+	# fill the text buffer with the context and current word, highlighting the word
+	# if $cur_replacement is after the end of @replacements, don't highlight anythiing
+	# (this happens when all words have been replaced and the user only needs to accept the changes)
+	my $fill_text_buffer = sub {
+		my $start = $buffer->get_start_iter();
+		my $end = $buffer->get_end_iter();
+		$buffer->delete($start, $end);
+		my ($contextl, $contextr) = $get_context->();
+		$buffer->set_text($contextr);
+		if ($cur_replacement <= $#replacements) {
+			$start = $buffer->get_start_iter();
+			$buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight);
+		}
+		$start = $buffer->get_start_iter();
+		$buffer->insert($start, $contextl);
+	};
+
+	# fill $button_vbox with the word options for the current word
+	my $fill_button_vbox;
+	$fill_button_vbox = sub {
+		$button_vbox->foreach(sub {my $child = shift; $child->destroy();});
+		my $word = $replacements[$cur_replacement]->[1];
+		$wordlabel->set_text("Word \"$word\" has multiple replacement options:");
+		my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1];
+		foreach my $word_choice (@choices) {
+			my $button = Gtk2::Button->new($word_choice);
+			$button->signal_connect(
+				clicked => sub {
+					$replacements[$cur_replacement]->[1] = $word_choice;
+					$undo->set_sensitive(TRUE);
+					$cur_replacement++;
+					$fill_text_buffer->();
+					if ($cur_replacement > $#replacements) {
+						$button_vbox->foreach(sub {my $child = shift; $child->destroy();});
+						$accept->show;
+						$accept->grab_focus;
+						$wordlabel->set_text("");
+						return;
+					}
+					$fill_button_vbox->();
+				}, $window);
+			$button_vbox->pack_start($button, FALSE, FALSE, 0);
+			$button->show;
+		}
+	};
+
+	$undo->signal_connect(
+		clicked => sub {
+			if ($cur_replacement > 0) {
+				$cur_replacement--;
+				if ($cur_replacement == 0) {
+					$undo->set_sensitive(FALSE);
+				}
+				$replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2];
+				$fill_button_vbox->();
+				$fill_text_buffer->();
+				$accept->hide;
+				my $word = $replacements[$cur_replacement]->[1];
+				$wordlabel->set_text("Word \"$word\" has multiple replacement options:");
+			}
+		}, $window);
+
+	$accept->signal_connect(
+		clicked => sub {
+			# write the changes to the original $substrings
+			foreach (@replacements) {
+				$substrings->[$_->[0]]->[1] = $_->[1];
+			}
+			$window->destroy;
+		}, $window);
+
+	$stop_button->signal_connect(
+		clicked => sub {
+			$stop = 1;
+			$window->destroy;
+		}, $window);
+
+	$fill_button_vbox->();
+	$fill_text_buffer->();
+
+	Gtk2->main;
+	if ($stop) {
+		die "Processing stopped at line $cur_lineno";
+	}
+}
+
+my $ID = 0;
+my $STRING = 1;
+
+# Parse the configuration file into data type (currently only ID and STRING)
+sub parse_config {
+	my $f = shift;
+	my $fh;
+	if (!open($fh, "<", $f)) {
+		warn "Can't open config file \"$f\"!\n";
+		return undef;
+	}
+	my $line;
+	my @commands;
+	my $state = 0;
+	my $IN_ID = 1;
+	my $IN_STR = 2;
+	my $cur_val = "";
+	while ($line = <$fh>) {
+		chomp($line);
+		$state = 0;
+		push(@commands, []);
+		foreach my $char (split(//, $line)) {
+			if ($char eq "#") {
+				last;
+			} elsif ($char eq '"') {
+				if ($state & $IN_STR) {
+					push(@{$commands[-1]}, {type => $STRING, value => $cur_val});
+					$cur_val = "";
+					$state &= ~$IN_STR;
+				} else {
+					$cur_val = "";
+					$state |= $IN_STR;
+				}
+			} elsif ($char eq " ") {
+				if ($state & $IN_ID) {
+					push(@{$commands[-1]}, {type => $ID, value => $cur_val});
+					$state &= ~$IN_ID;
+					$cur_val = "";
+				} elsif ($state) {
+					$cur_val .= $char;
+				}
+			} else {
+				if (!$state) {
+					$state |= $IN_ID;
+				}
+				$cur_val .= $char;
+			}
+		}
+		if ($state & $IN_STR) {
+			warn "ERROR: Unterminated string in config:\n$line";
+			return undef;
+		} elsif ($cur_val) {
+			push(@{$commands[-1]}, {type => $ID, value => $cur_val});
+			$cur_val = "";
+		}
+		# FIXME: check if this works
+		if ($#{$commands[-1]} == -1) {
+			pop(@commands);
+		}
+	}
+	close($fh);
+
+	return \@commands;
+}
+
+# Load a file of replacement words into a hash table
+sub load_table {
+	my ($filename, $args, $config) = @_;
+	my $fh;
+	# if the paths are relative, find their absolute location based
+	# on the location of the config file
+	if (!file_name_is_absolute $filename) {
+		my $config_dir = dirname $args->{"config"};
+		$filename = rel2abs($filename, $config_dir);
+	}
+	if (!open($fh, "<", $filename)) {
+		warn "Can't open table file \"$filename\"!\n";
+		return undef;
+	}
+	my $line;
+	my @words;
+	my %table;
+	while ($line = <$fh>) {
+		chomp $line;
+		next if (!$line);
+		@words = split(/\Q$config->{tablesep}\E/, $line);
+		if (@words != 2) {
+			warn "ERROR: Malformed line in file \"$filename\":\n$line\n";
+			close $fh;
+			return undef;
+		}
+		my $key = NFD($words[0]);
+		if (exists $table{$key}) {
+			if ($args->{"checkduplicates"}) {
+				warn "WARNING: Duplicate word in file \"$filename\": " .
+					"\"$key\", with replacement \"$words[1]\", " .
+					"already exists with replacement \"$table{$key}\".\n";
+			}
+			$table{$key} = get_unique_words($table{$key} . $config->{choicesep} . $words[1], $config);
+		} else {
+			$table{$key} = $words[1];
+		}
+	}
+	close $fh;
+	return \%table;
+}
+
+# Load table for words to ignore - only the keys matter, since there is no replacement
+sub load_ignore_table {
+	my ($filename, $args) = @_;
+	my $line;
+	my %table;
+	if (!file_name_is_absolute $filename) {
+		my $config_dir = dirname $args->{"config"};
+		$filename = rel2abs($filename, $config_dir);
+	}
+	my $fh;
+	if (!open($fh, "<", $filename)) {
+		warn "Can't open ignore file \"$filename\"!\n";
+		return undef;;
+	}
+	while ($line = <$fh>) {
+		chomp $line;
+		if ($line) {
+			$table{NFD($line)} = "";
+		}
+	}
+	close($fh);
+	return \%table;
+}
+
+# Generate all forms of a word by combining it with endings
+# Returns:
+# 0 - an error occurred
+# 1 - everything's fine
+sub expand_table {
+	my ($cmd, $tables, $config) = @_;
+	my $table_name = $cmd->[1]->{"value"};
+	my $new_table_name = $cmd->[2]->{"value"};
+	my $forms_name = $cmd->[3]->{"value"};
+	if ($new_table_name eq "same") {
+		$new_table_name = $table_name;
+	}
+	if (!exists($tables->{$table_name})) {
+		warn "expand_table: table \"$table_name\" does not exist.\n";
+		return 0;
+	}
+	if (!exists($tables->{$forms_name})) {
+		warn "expand_table: table \"$forms_name\" does not exist.\n";
+		return 0;
+	}
+	my $table = $tables->{$table_name};
+	my $forms = $tables->{$forms_name};
+
+	my $noroot = 0;
+	if ($#$cmd >= 4 && $cmd->[4]->{"value"} eq "noroot") {
+		$noroot = 1;
+	}
+
+	my %new_table;
+	foreach my $word (keys %$table) {
+		foreach my $ending (keys %$forms) {
+			# Some words and/or endings have multiple options, separated by $config->{choicesep}
+			# These must be temporarily separated in order to properly generate the forms
+			my @word_options;
+			my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word});
+			my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending});
+			foreach my $stem_option (@stem_options) {
+				foreach my $ending_option (@ending_options) {
+					push(@word_options, $stem_option . $ending_option);
+				}
+			}
+			$new_table{$word . $ending} = join($config->{choicesep}, @word_options);
+		}
+		if (!$noroot) {
+			$new_table{$word} = $table->{$word};
+		}
+	}
+	$tables->{$new_table_name} = \%new_table;
+	return 1;
+}
+
+# Check if the number and types of arguments given to a config command are right
+# Returns:
+# 0 - an error occurred
+# 1 - everything's fine
+sub check_args {
+	my ($args, $cmd) = @_;
+	my $cmd_name = $cmd->[0]->{"value"};
+	if ($#$cmd - 1 < $#$args) {
+		my $err = "ERROR: not enough arguments for command \"$cmd_name\":";
+		foreach my $arg (@{$cmd}[1..$#$cmd]) {
+			$err .= " " . $arg->{"value"}
+		}
+		warn "$err\n";
+		return 0;
+	}
+	my $arg_num = 0;
+	while ($arg_num <= $#$args) {
+		if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) {
+			my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n";
+			$err .= "Expected:";
+			foreach my $arg_type (@$args) {
+				$err .= " ID" if ($arg_type == $ID);
+				$err .= " STRING" if ($arg_type == $STRING);
+			}
+			$err .= "\nReceived:";
+			foreach my $arg (@{$cmd}[1..$#$cmd]) {
+				$err .= " ID" if ($arg->{"type"} == $ID);
+				$err .= " STRING" if ($arg->{"type"} == $STRING);
+			}
+			warn "$err\n";
+			return 0;
+		}
+		$arg_num++;
+	}
+	return 1;
+}
+
+# Interpret the config file - load and expand tables, etc.
+# $config_list - the list returned by parse_config
+sub interpret_config {
+	my ($config_list, $args) = @_;
+	my %tables;
+	my %config;
+	$config{"table_paths"} = {};
+	$config{"ignore"} = "";
+	$config{"ignore_words"} = {};
+	$config{"split"} = "\\s";
+	$config{"beforeword"} = "\\s";
+	$config{"afterword"} = "\\s";
+	$config{"tablesep"} = "\t";
+	$config{"choicesep"} = "\$";
+	$config{"replacements"} = [];
+	my %tmp_table_paths;
+	my %mandatory_args = (
+		"ignore" => [$STRING],
+		"table" => [$ID],
+		"expand" => [$ID, $ID, $ID],
+		"match" => [$STRING, $STRING],
+		"matchignore" => [$STRING],
+		"replace" => [$ID],
+		"split" => [$STRING],
+		"beforeword" => [$STRING],
+		"afterword" => [$STRING],
+		"tablesep" => [$STRING],
+		"choicesep" => [$STRING],
+		"group" => [],
+		"endgroup" => []
+	);
+	my $in_group = 0;
+	foreach my $cmd (@$config_list) {
+		# All load statements must be before expand statements
+		# All expand, beforeword, and afterword statements must be before replace statements
+		if ($cmd->[0]->{"type"} == $ID) {
+			if (!exists($mandatory_args{$cmd->[0]->{"value"}})) {
+				warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n";
+				return undef;
+			}
+			if (!check_args($mandatory_args{$cmd->[0]->{"value"}}, $cmd)) {
+				return undef;
+			}
+			if ($cmd->[0]->{"value"} eq "table") {
+				my $table = load_table $cmd->[2]->{"value"}, $args, \%config;
+				if (!$table) {
+					return undef;
+				}
+				$tables{$cmd->[1]->{"value"}} = $table;
+				$tmp_table_paths{$cmd->[1]->{"value"}} = $cmd->[2]->{"value"};
+			} elsif ($cmd->[0]->{"value"} eq "expand") {
+				# FIXME: need to handle table paths when a new table name is used for the expansion
+				if (!expand_table($cmd, \%tables, \%config)) {
+					return undef;
+				}
+			} elsif ($cmd->[0]->{"value"} eq "group") {
+				if ($in_group) {
+					warn "ERROR: New group started without ending last one in config\n";
+					return undef;
+				}
+				push @{$config{"replacements"}}, {"type" => "group", "words" => {}};
+				for (1..$#$cmd) {
+					$config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+				}
+				$in_group = 1;
+			} elsif ($cmd->[0]->{"value"} eq "endgroup") {
+				if (!$in_group) {
+					warn "ERROR: endgroup command called while not in group\n";
+					return undef;
+				}
+				$in_group = 0;
+			} elsif ($cmd->[0]->{"value"} eq "match") {
+				if ($in_group) {
+					warn "ERROR: match command is invalid inside group\n";
+					return undef;
+				}
+				push @{$config{"replacements"}}, {
+					"type" => "match",
+					"search" => NFD($cmd->[1]->{"value"}),
+					"replace" => $cmd->[2]->{"value"}};
+				for (3..$#$cmd) {
+					$config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+				}
+			} elsif ($cmd->[0]->{"value"} eq "matchignore") {
+				if ($in_group) {
+					warn "ERROR: matchignore command is invalid inside group\n";
+					return undef;
+				}
+				push @{$config{"replacements"}}, {"type" => "match", "search" => NFD($cmd->[1]->{"value"})};
+				for (2..$#$cmd) {
+					$config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+				}
+			} elsif ($cmd->[0]->{"value"} eq "replace") {
+				if (!$in_group) {
+					warn "ERROR: replace command called while not in group\n";
+					return undef;
+				}
+				my $table = $cmd->[1]->{"value"};
+				if (!exists($tables{$table})) {
+					warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
+					return undef;
+				}
+				if (exists($tmp_table_paths{$table})) {
+					$config{"table_paths"}->{$tmp_table_paths{$table}} = [$#{$config{"replacements"}}, $table];
+				}
+				# Note: we don't need to check if $table{"choicesep"} was defined
+				# here since we can't ever get this far without first having
+				# loaded a table anyways
+				add_to_trie($table, $config{"replacements"}->[-1]->{"words"}, $tables{$table}, $args, \%config);
+			} elsif ($cmd->[0]->{"value"} eq "split") {
+				$config{"split"} = $cmd->[1]->{"value"};
+			} elsif ($cmd->[0]->{"value"} eq "beforeword") {
+				$config{"beforeword"} = $cmd->[1]->{"value"};
+			} elsif ($cmd->[0]->{"value"} eq "afterword") {
+				$config{"afterword"} = $cmd->[1]->{"value"};
+			} elsif ($cmd->[0]->{"value"} eq "tablesep") {
+				$config{"tablesep"} = $cmd->[1]->{"value"};
+			} elsif ($cmd->[0]->{"value"} eq "choicesep") {
+				$config{"choicesep"} = $cmd->[1]->{"value"};
+			} elsif ($cmd->[0]->{"value"} eq "ignore") {
+				$config{"ignore"} = $cmd->[1]->{"value"};
+				my $table = load_ignore_table $cmd->[1]->{"value"}, $args;
+				if (!$table) {
+					return undef;
+				}
+				$config{"ignore_words"} = $table;
+			} else {
+				warn "ERROR: unknown command \"" . $cmd->[0]->{"value"} . "\" in config.\n";
+				return undef;
+			}
+		} else {
+			my $err =  "ERROR: line does not start with command:\n";
+			foreach my $cmd_part (@$cmd) {
+				$err .= $cmd_part->{"value"};
+			}
+			warn "$err\n";
+			return undef;
+		}
+	}
+	if ($in_group) {
+		warn "ERROR: unclosed group in config\n";
+		return undef;
+	}
+	if (!$config{"ignore"}) {
+		warn "ERROR: no file of words to ignore specified.\n";
+		return undef;
+	}
+	return \%config;
+}
+
+# load the config file
+# Returns:
+# the config hash or undef if an error occurred
+sub load_config {
+	my $args = shift;
+	my $config_list = parse_config($args->{"config"});
+	if (!$config_list) {
+		return undef;
+	}
+	return interpret_config $config_list, $args;
+}
+
+# Handle the action returned by `prompt_unknown_word`
+# $config - the current program config
+# $args - the command line arguments
+# Currently accepted values for $action:
+# ["ignore", "run", $word] - only ignore $word for the rest of this run
+# ["ignore", "permanent", $word] - write $word to the permanent ignore file
+# ["add", $word, $replace_word, $table_path] - add $word to the table 
+# 	corresponding to $table_path with $replace_word as its replacement. Note that
+# 	only tables directly corresponding to paths work here - tables that only
+# 	were created through "expand" in the config aren't ever shown separately
+# 	in `prompt_unknown_word`
+# ["reload"] - reload the configuration file
+# Returns:
+# 0 - nothing needs to be done
+# 1 - the current line needs to be re-transliterated with the new config
+# 2 - an error occurred while reloading the config
+sub handle_unknown_word_action {
+	my ($action, $config, $args) = @_;
+	if ($action->[0] eq "ignore") {
+		$config->{"ignore_words"}->{$action->[2]} = "";
+		if ($action->[1] eq "permanent") {
+			my $fh;
+			if (!open($fh, ">>", $config->{"ignore"})) {
+				warn "ERROR: Can't open ignore file for appending.\n";
+				return 1;
+			}
+			print($fh $action->[2] . "\n");
+			close($fh);
+		} elsif ($action->[1] eq "run") {
+			# Print to error file if ignore isn't permanent
+			return 0 if ($args->{"errors"} eq "");
+			my $fh;
+			if (!open($fh, ">>", $args->{"errors"})) {
+				warn "ERROR: Can't open error file \"$args->{errors}\".\n";
+				return 0;
+			}
+			print($fh $action->[2] . "\n");
+			close($fh);
+		}
+		return 0;
+	} elsif ($action->[0] eq "add") {
+		my $table_path = $action->[3];
+		my $word = $action->[1];
+		my $replace_word = $action->[2];
+		my $fh;
+		if (!open($fh, ">>", $table_path)) {
+			warn "ERROR: Can't open table file \"$table_path\" for appending.\n";
+			return 1;
+		}
+		print($fh $word . "," . $replace_word . "\n");
+		close($fh);
+		my $replacement_id = $config->{"table_paths"}->{$table_path}->[0];
+		my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
+		add_to_trie($config->{"table_paths"}->{$table_path}->[1], $trie, {$word => $replace_word});
+		return 1;
+	} elsif ($action->[0] eq "reload") {
+		my $new_config = load_config $args;
+		if ($new_config) {
+			%$config = %$new_config;
+			return 1;
+		} else {
+			return 2;
+		}
+	}
+}
+
+# Split $substrings into single words based on the "split" option
+# in $config.
+# $substrings can already be split at this point; only the
+# ones that haven't been transliterated yet are modified
+sub split_words {
+	my ($config, $substrings) = @_;
+	my @substrings_new;
+	my $split_re = qr/($config->{"split"})/;
+	foreach my $cur_substr (@$substrings) {
+		if ($cur_substr->[0] == 1) {
+			push(@substrings_new, $cur_substr);
+			next;
+		}
+
+		my @words = split(/$split_re/, $cur_substr->[1]);
+		for my $i (0..$#words) {
+			# Word is not delimiter
+			# Split produces an empty field at the beginning if the string
+			# starts with the delimiter
+			if ($i % 2 == 0) {
+				push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
+			} else {
+				# Delimiters can count as already replaced
+				push(@substrings_new, [1, $words[$i], $words[$i]]);
+			}
+		}
+	}
+	@$substrings = @substrings_new;
+}
+
+# small helper function to add a untransliterated string to the last substring
+# if that is not transliterated yet, or push it onto @$substrings otherwise
+# -> used to keep all untransliterated text in one piece
+# since this is also used for the "nofinal" attribute on "match", it takes
+# an original and replaced string (since, when using "match" and "nofinal",
+# the original string was replaced, but is still marked as unknown)
+sub push_unknown {
+	my ($substrings, $orig, $replaced) = @_;
+	$replaced //= $orig;
+	if (@$substrings && !$substrings->[-1]->[0]) {
+		$substrings->[-1]->[1] .= $replaced;
+		$substrings->[-1]->[2] .= $orig;
+	} else {
+		push(@$substrings, [0, $replaced, $orig]);
+	}
+}
+
+# Replace a word in $substrings based on $replace_config using regex
+# $replace_config->{"search"} is the word to replace
+# $replace_config->{"replace"} is the replacement word
+#	if $replace_config->{"replace"} is undefined, just splits
+#	$substrings at the the match and marks that the match has
+#	been transliterated - currently used for "matchignore"
+# $replace_config->{"beginword"}, $replace_config->{"afterword"} -
+#	specifies if the match is only valid when $config->{"beforeword"}
+#	or $config->{"afterword"} occur before or after it, respectively
+sub replace_match {
+	my ($config, $replace_config, $substrings) = @_;
+	my $beginword = exists $replace_config->{"beginword"};
+	my $endword = exists $replace_config->{"endword"};
+	my $fullword = $beginword && $endword;
+	my $beforeword = $config->{"beforeword"};
+	my $afterword = $config->{"afterword"};
+	my $word = $replace_config->{"search"};
+	my $replace_word = $replace_config->{"replace"};
+	if ($fullword) {
+		$word = qr/(\A|$beforeword)$word(\z|$afterword)/;
+	} elsif ($beginword) {
+		$word = qr/(\A|$beforeword)$word/;
+	} elsif ($endword) {
+		$word = qr/$word(\z|$afterword)/;
+	} else {
+		$word = qr/$word/;
+	}
+
+	my @substrings_new;
+	# only modify $substrings at all if the word was found
+	my $found_word = 0;
+	my $last_idx;
+	# @substrings_new is only used if needed to improve efficiency
+	foreach my $i (0..$#$substrings) {
+		if ($substrings->[$i]->[0]) {
+			# FIXME: is there a way to make it more efficient by keeping the old array?
+			# This is a major bottleneck
+			# Note: the above statement *may* be a bit exaggerated
+			if ($found_word) {
+				push(@substrings_new, $substrings->[$i]);
+			}
+			next;
+		}
+		$last_idx = 0;
+		my $i0 = 0;
+		my $i1 = 0;
+		while ($substrings->[$i]->[1] =~ m/$word/g) {
+			if (!$found_word) {
+				$found_word = 1;
+				if ($i != 0) {
+					push(@substrings_new, @{$substrings}[0..$i-1]);
+				}
+			}
+			# This mess is needed to reliably match $beforeword and $afterword and put the captured
+			# "splitting" characters back into the text. This would be much easier just using
+			# a lookbehind and lookahead, but I couldn't find a way to also match beginning and
+			# end of string that way.
+			$i0 = $-[0];
+			$i1 = $+[0];
+			if ($fullword) {
+				$i0 += length($1);
+				$i1 -= length($2);
+				# pos need to be decreased so that matches still work right next to each other
+				pos($substrings->[$i]->[1]) -= length($2);
+			} elsif ($beginword) {
+				$i0 += length($1);
+			} elsif ($endword) {
+				$i1 -= length($1);
+				pos($substrings->[$i]->[1]) -= length($1);
+			}
+			if ($last_idx != $i0) {
+				my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx);
+				push_unknown \@substrings_new, $unknown;
+			}
+			my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
+			my $replace_str = $replace_word // $orig_str;
+			if ($replace_config->{"nofinal"}) {
+				push_unknown \@substrings_new, $orig_str, $replace_str;
+			} else {
+				push(@substrings_new, [1, $replace_str, $orig_str]);
+			}
+			$last_idx = $i1;
+		}
+		if ($last_idx < length($substrings->[$i]->[1]) && $found_word) {
+			my $unknown = substr($substrings->[$i]->[1], $last_idx);
+			push_unknown \@substrings_new, $unknown;
+		}
+	}
+	if ($found_word) {
+		@$substrings = @substrings_new;
+	}
+}
+
+# Replace a group, i.e. replace all the words in a trie
+# $replace_config->{"words"} - the root node of the trie
+# $replace_config->{"beginword"}, $replace_config->{"endword"} -
+#	same as in `replace_match`
+sub replace_group {
+	my ($config, $replace_config, $substrings) = @_;
+	my @substrings_new;
+	# Recurse backwords towards the root node of the trie to find the first
+	# node with a key "final" which satisfies the ending condition (if "endword" is set)
+	# Returns the id *after* the last match and the node that was found
+	# with the key "final" (or undef, if nothing matched)
+	my $find_final = sub {
+		my ($i, $tmp_cur_node, $s) = @_;
+		do {
+			my $after_ch = substr($s->[1], $i, 1);
+			if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"endword"}) ||
+				$after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
+				return ($i, $tmp_cur_node);
+			}
+			$i--;
+		} while ($tmp_cur_node = $tmp_cur_node->{"parent"});
+		# none of the points were appropriate for breaking the word, so
+		# $tmp_cur_node now points to the nonexistent parent node of the
+		# root node
+		return ($i, undef);
+	};
+	foreach my $s (@$substrings) {
+		if ($s->[0]) {
+			push(@substrings_new, $s);
+			next;
+		}
+		my $cur_node = $replace_config->{"words"};
+		my $start_i = 0;
+		my $i = 0;
+		# This deliberately goes off the end of the string! $cur_node is always "one behind" $i
+		# since the node is only advanced in the iteration *after* $i has increased, meaning that
+		# $i has to already be after the end of the string for the first if statement to definitely
+		# fail, causing the elsif statement to handle that case
+		while ($i <= length($s->[1])) {
+			# This works even when $i is one index after the end of the string - it just returns "" then
+			my $ch = substr($s->[1], $i, 1);
+			if (exists $cur_node->{$ch}) {
+				if ($cur_node == $replace_config->{"words"}) {
+					my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
+					if (exists($replace_config->{"beginword"}) &&
+						$before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
+						push_unknown \@substrings_new, $ch;
+						$i++;
+						next;
+					}
+					$start_i = $i;
+				}
+				$cur_node = $cur_node->{$ch};
+			} elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) {
+				my $tmp_cur_node = $cur_node;
+				($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s);
+				if (!defined($tmp_cur_node)) {
+					push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
+					$i += 2;
+				} else {
+					push(@substrings_new, [1, $tmp_cur_node->{"final"}, substr($s->[1], $start_i, $i-$start_i)]);
+				}
+				$cur_node = $replace_config->{"words"};
+				next;
+			} else {
+				push_unknown \@substrings_new, $ch;
+			}
+			$i++;
+		}
+	}
+	@$substrings = @substrings_new;
+}
+
+# Perform all replacements on $line based on $config
+# $substrings: array of arrays - each one has three elements:
+# first 0 or 1, indicating if the substring following it has already
+# been replaced or not (1 means it has been replaced), then the
+# transliterated string, and lastly the original string.
+# If the first element is 0, the second two elements are obviously same
+sub replace_line {
+	my ($config, $line) = @_;
+	my $substrings = [[0, $line, $line]];
+	foreach my $replacement (@{$config->{"replacements"}}) {
+		if ($replacement->{"type"} eq "match") {
+			replace_match($config, $replacement, $substrings);
+		} elsif ($replacement->{"type"} eq "group") {
+			replace_group($config, $replacement, $substrings);
+		}
+	}
+	# splits all words at the end so that the splitting characters
+	# aren't taken as unknown words and the unknown words are (hopefully)
+	# in better chunks for prompting the user about them
+	split_words($config, $substrings);
+
+	return $substrings;
+}
+
+# NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST!
+# If an ignore word is added which is attached to a word that should have a replacement
+# added and just that word is selected to ignore, you never get a chance to add a
+# replacement for the other word that it is attached to
+
+# Handle unknown words
+# $substrings - the current substrings with unknown words
+# $config - the program config
+# $args - the command line args
+# $cur_lineno - display string to show the user the current line number
+# Returns:
+# 1 - the current line must be replaced again with the new config
+# 0 - everything's fine
+sub get_unknown_words {
+	my ($substrings, $config, $args, $cur_lineno) = @_;
+	foreach my $i (0..$#$substrings) {
+		my $word = $substrings->[$i];
+		if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
+			my $contextl = "";
+			my $contextl_orig = "";
+			foreach my $j (0..$i-1) {
+				$contextl .= $substrings->[$j]->[1];
+				$contextl_orig .= $substrings->[$j]->[2];
+			}
+			my $contextr = "";
+			my $contextr_orig = "";
+			foreach my $j ($i+1..$#$substrings) {
+				$contextr .= $substrings->[$j]->[1];
+				$contextr_orig .= $substrings->[$j]->[2];
+			}
+			my $action = prompt_unknown_word($contextl, $contextl_orig,
+				$word->[1], $contextr, $contextr_orig,
+				$config->{"table_paths"}, "$cur_lineno"
+			);
+			# if $ret == 2, config could not be loaded
+			# if $ret == 1, line must be redone with new config
+			my $ret = handle_unknown_word_action($action, $config, $args);
+			# keep retrying until the user chooses an action which
+			# didn't throw an error
+			while ($ret == 2) {
+				$action = prompt_unknown_word($contextl, $contextl_orig,
+					$word->[1], $contextr, $contextr_orig,
+					$config->{"table_paths"}, "$cur_lineno", 1);
+				$ret = handle_unknown_word_action($action, $config, $args);
+			}
+			# re-transliterate the line with the new config
+			if ($ret == 1) {
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+# Main replacement function
+# Opens the input file ($args->{"input"}) and writes the transliterated text
+# to the file handle $outputfh, prompting the user for unknown words or
+# word choices (if those aren't disabled on the command line)
+sub replace {
+	my ($config, $args, $outputfh) = @_;
+	# Is there *really* no more efficient way to get the total number of lines?
+	open my $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n";
+	my $total_lines = 0;
+	while (<$fh>) {$total_lines++};
+	close $fh;
+	open $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n";
+	while (my $line = <$fh>) {
+		next if $. < $args->{"start"};
+		my $nfd_line = NFD($line);
+		my $substrings = replace_line($config, $nfd_line);
+
+		if (!$args->{"nounknowns"}) {
+			# re-transliterate the string if the config was reloaded
+			while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
+				$substrings = replace_line($config, $nfd_line);
+			}
+		} elsif ($args->{"debug"}) {
+			foreach my $s (@$substrings) {
+				if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
+					warn "Unknown word: \"$s->[1]\"\n";
+				}
+			}
+		}
+		if (!$args->{"nochoices"}) {
+			prompt_choose_word($substrings, "$./$total_lines", $config);
+		} elsif ($args->{"debug"}) {
+			foreach my $s (@$substrings) {
+				if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
+					my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
+					warn "Word \"$s->[1]\" with $num_choices word choices.\n";
+				}
+			}
+		}
+
+		foreach (@$substrings) {
+			print $outputfh $_->[1];
+		}
+	}
+	close $fh;
+}
+
+my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
+GetOptions(
+	\%args, "debug",
+	"nochoices", "nounknowns",
+	"force", "start=i",
+	"output=s", "config=s",
+	"errors=s", "help",
+	"checkduplicates") or pod2usage(1);
+
+pod2usage(-exitval => 0, -verbose => 2) if $args{"help"};
+pod2usage(1) if $#ARGV != 0 && !$args{"checkduplicates"};
+
+if (!-f $args{"config"}) {
+	die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n";
+}
+my $config = load_config \%args;
+if (!$config) {
+	die "ERROR: Invalid config\n";
+}
+exit 0 if ($args{"checkduplicates"});
+
+my $input = $ARGV[0];
+if (!-f $input) {
+	die "ERROR: input file \"$input\" does not exist or is not a file.\n";
+}
+$args{"input"} = $input;
+
+if (-f $args{"errors"} && !$args{"force"}) {
+	my $choice = "";
+	while ($choice !~ /^[yn]$/) {
+		print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? ";
+		$choice = <STDIN>;
+		chomp $choice;
+	}
+	die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y";
+}
+
+my $outputfh;
+if ($args{"output"} eq "") {
+	warn "WARNING: no output file supplied; printing to STDOUT\n";
+	open $outputfh, ">&STDOUT";
+} elsif (-f $args{"output"} && !$args{"force"}) {
+	my $choice = "";
+	while ($choice !~ /^[aoe]$/) {
+		print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? ";
+		$choice = <STDIN>;
+		chomp $choice;
+	}
+	if ($choice eq "a") {
+		open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
+	} elsif ($choice eq "e") {
+		die "ERROR: \"$args{output}\" already exists.\n";
+	} else {
+		open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
+	}
+} else {
+	open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
+}
+
+replace($config, \%args, $outputfh);
+close $outputfh;
+
+__END__
+
+=head1 NAME
+
+transliterate.pl - Transliterate text files
+
+=head1 SYNOPSIS
+
+transliterate.pl [options][input file]
+
+Start the transliteration engine with the given file as input.
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<< --output <filename> >>
+
+Sets the output file to print to.
+
+If the file exists already and C<--force> is not set, the user is asked
+if the file should be overwritten or appended to.
+
+B<Default:> STDOUT (print to terminal)
+
+=item B<< --config <filename> >>
+
+Sets the configuration file to use.
+
+B<Default:> "config"
+
+=item B<--checkduplicates>
+
+Prints all duplicate words within single table files and across tables
+that are replaced within the same group, then exits the program.
+
+Note that this simply prints B<all> duplicates, even ones that are
+legitimate. When duplicates are found during normal operation of
+the program, they are simply combined in exactly the same way as the
+regular word choices.
+
+=item B<--nochoices>
+
+Disables prompting for the right word when multiple replacement words exist.
+
+This can be used to "weed out" all the unknown words before
+commencing the laborious task of choosing the right word every time
+multiple options exist.
+
+=item B<--nounknowns>
+
+Disables prompting for the right word when a word is not found in the database.
+
+This can be used together with B<--nochoices> to perform a quick test of how
+well the actual engine is working without having to click through all the
+prompts.
+
+=item B<--debug>
+
+This option is only useful for automatic testing of the transliteration engine.
+
+If C<--nochoices> is enabled, each word in the input with multiple choices will
+be output, along with the number of choices (can be used to test the proper
+functioning of C<choicesep> in the config file).
+
+If C<--nounknowns> is enabled, each unknown word in the input is printed
+(can be used to test that the C<ignore> options are working correctly).
+
+=item B<--force>
+
+Always overwrites the output and error file without asking.
+
+=item B<< --start <line number> >>
+
+Starts at the given line number instead of the beginning of the file.
+
+Note: when "Stop processing" is pressed, the current line number is
+printed out. This is the current line that was being processed, so it
+has not been printed to the output file yet and thus the program must
+be resumed at that line, not the one afterwards.
+
+=item B<< --errors <filename> >>
+
+Specifies a file to write errors in. Note that this does not refer to
+actual errors, but to any words that were temporarily ignored
+(i.e. words for which "Ignore: This run" was clicked).
+
+If no file is specified, nothing is written. If a file is specified
+that already exists and C<--force> is not set, the user is prompted
+for action.
+
+=item B<--help>
+
+Display the full documentation.
+
+=back
+
+=head1 DESCRIPTION
+
+B<transliterate.pl> will read the given input file and transliterate it
+based on the given configuration file, prompting the user for action if
+a word has multiple replacement options or is not found in the database.
+
+See L</"CONFIGURATION"> for details on what is possible.
+
+=head1 WORD CHOICE WINDOW
+
+The word choice window is opened any time one word has multiple replacement
+options and prompts the user to choose one.
+
+For each word with multiple options, the user must choose the right option
+and then press "Accept changes" to finalize the transliteration of the
+current line. Before the line is finalized, the user may press "Undo" to
+undo any changes on the current line.
+
+"Stop processing" will exit the program and print the line number that was
+currently being processed.
+
+=head1 UNKNOWN WORD WINDOW
+
+The unknown word window is opened any time a word could not be replaced.
+
+Both the context from the original language and the context from the
+transliterated version (so far) is shown. If a part of the text is
+selected in one of the text boxes and "Use selection as word" is
+pressed for the appropriate box, the selected text is used for the
+action that is taken subsequently.
+
+The possible actions are:
+
+=over 8
+
+=item Ignore
+
+"This run" only ignores the word until the program exist, while
+"Permanently" saves the word in the ignore file specified in the
+configuration.
+
+=item Add to list
+
+Adds the word typed in the text box beside "Add replacement" to the
+selected table file and re-runs the replacement on the current line.
+
+The filtering for which table files are actually shown here is
+currently a bit rudimentary. First, all paths that are used in the
+C<table> statements in the config are put into a list. Then, the
+paths corresponding to any tables used for word endings in the
+C<expand> statements are removed from the list, and that is what
+is shown in this window. The reason for removing those paths from
+the list is that it gets somewhat confusing when all the tables
+that are only used for word endings are also in the list, and it
+is very unlikely that an unknown word would need to be written to
+one of those files. If necessary, the word can always be added
+manually and the config reloaded. Note also that only actual
+table paths are shown here, not the tables themselves - this is
+not necessarily a one-to-one mapping since new tables can be
+generated with the C<expand> statements.
+
+=item Reload config
+
+Reload the configuration file along with all tables an re-runs the
+replacement on the current line. Note that this can take a short
+while since the entire word database has to be reloaded.
+
+=item Stop processing
+
+Prints the current line number to the terminal and exits the program.
+
+=back
+
+=head1 CONFIGURATION
+
+These are the commands accepted in the configuration file.
+Any parameters in square brackets are optional.
+
+The C<match>, C<matchignore>, and C<replace> commands are executed in
+the order they are specified, except that all C<replace> commands within
+the same group are replaced together.
+
+Note that any duplicate words found will cause the user to be prompted
+to choose one option every time the word is replaced in the input text.
+
+=over 8
+
+=item B<split> <regex string>
+
+Sets the RegEx string to be used for splitting words. This is only used
+for splitting the words which couldn't be replaced after all replacement
+has been done, before prompting the user for unknown words.
+
+Note that C<split> should probably always contain at least C<\n>, since
+otherwise all of the newlines will be marked as unknown words. Usually,
+this will be included anyways through C<\s>.
+
+Note also that C<split> should probably include the C<+> RegEx-quantifier
+since that allows the splitting function in the end to ignore several
+splitting characters right after each other (e.g. several spaces) in one
+go instead of splitting the string again for every single one of them.
+This shouldn't actually make any difference functionality-wise, though.
+
+B<Default:> \s (all whitespace)
+
+=item B<beforeword> <regex string>
+
+Sets the RegEx string to be matched before a word if C<beginword> is set.
+
+B<Default:> \s
+
+=item B<afterword> <regex string>
+
+Sets the RegEx string to be matched after a word if C<endword> is set.
+
+Note that C<afterword> should probably always contain at least C<\n>,
+since otherwise words with C<endword> set will not be matched at the
+end of a line.
+
+C<beforeword> and C<afterword> will often be exactly the same, but
+they are left as separate options in case more fine-tuning is needed.
+
+B<Default:> \s
+
+=item B<tablesep> <string>
+
+Sets the separator used to split the lines in the table files into the
+original and replacement word.
+
+B<Default:> Tab
+
+=item B<choicesep> <string>
+
+Sets the separator used to split replacement words into multiple choices for
+prompting the user.
+
+B<Default:> $
+
+=item B<ignore> <filename>
+
+Sets the file of words to ignore.
+
+This has to be set even if the file is just empty because the user can
+add words to it from the unknown word window.
+
+=item B<table> <table identifier> <filename>
+
+Load the table from C<< <filename> >>, making it available for later use in the
+C<expand> and C<replace> commands using the identifier C<< <table identifier> >>.
+
+Note that if C<< <filename> >> is not an absolute path, it is taken to be relative
+to the location of the configuration file.
+
+The table files simply consist of C<tablesep>-separated values, with the word in the
+original language first and the replacement word second. The replacement word
+can optionally have several parts separated by C<choicesep>, which will cause the
+user to be prompted to choose one of the options.
+
+=item B<expand> <table identifier> <new table identifier> <word ending table> [noroot]
+
+Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
+the word endings in C<< <word ending table> >>, saving the result as a table with the
+identifier C<< <new table identifier> >>. If C<same> is specified as C<< <new table
+identifier> >>, the original C<< <table identifier> >> is used instead.
+
+If C<noroot> is set, the root forms of the words are not kept.
+
+If the replacement for a word ending contains C<choicesep>, it is split and each part
+is combined with the root form separately and the user is prompted to choose one of
+the options later. it is thus possible to allow multiple choices for the ending if
+there is a distinction in the replacement language but not in the source language.
+Note that each of the root words is also split into its choices (if necessary)
+during the expanding, so it is possible to use C<choicesep> in both the endings
+and root words.
+
+Note that the paths of all tables used for <word ending table> are removed from the
+list of paths that is later shown in the unknown word window. See L</"UNKNOWN WORD
+WINDOW"> for details.
+
+=item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]
+
+Perform a RegEx match using the given C<< <regex string> >>, replacing it with
+C<< <replacement string> >>. Note that the replacement cannot contain any RegEx
+(e.g. groups) in it. C<beginword> and C<endword> specify whether the match must
+be at the beginning or ending of a word, respectively, using the RegEx specified
+in C<beforeword> and C<afterword>. If C<nofinal> is set, the string is not marked
+as transliterated after the replacement, allowing it to be modified by subsequent
+C<match> or C<replace> commands.
+
+=item B<matchignore> <regex string> [beginword] [endword]
+
+Performs a RegEx match in the same manner as C<match>, except that the original
+match is used as the replacement instead of specifying a replacement string.
+
+=item B<group> [beginword] [endword]
+
+Begins a replacement group. All C<replace> commands must occur between C<group>
+and C<endgroup>, since they are then grouped together and replaced in one go.
+C<beginword> and C<endword> act in the same way as specified for C<match> and
+apply to all C<replace> statements in this group.
+
+=item B<replace> <table identifier>
+
+Replace all words in the table with the identifier C<< <table identifier> >>,
+using the C<beginword> and C<endword> settings specified by the current group.
+
+Note that a table must have been loaded (or generated using C<expand>)
+before being used in a C<replace> statement.
+
+=item B<endgroup>
+
+End a replacement group.
+
+=back
+
+=head1 SEE ALSO
+
+perlre, perlretut
+
+=head1 LICENSE
+
+Copyright (c) 2019, 2020 lumidify <nobody[at]lumidify.org>
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+=cut

	transliterate Transliteration engine
	git clone git://lumidify.org/git/transliterate.git
	Log \| Files \| Refs \| README

A	.gitignore	\|	1	+
A	README	\|	5	+++++
A	tests/alltests.sh	\|	4	++++
A	tests/runtest.sh	\|	13	+++++++++++++
A	tests/test1/config	\|	15	+++++++++++++++
A	tests/test1/descr.txt	\|	1	+
A	tests/test1/err.txt	\|	6	++++++
A	tests/test1/expected.txt	\|	6	++++++
A	tests/test1/input.txt	\|	6	++++++
A	tests/test2/config	\|	19	+++++++++++++++++++
A	tests/test2/descr.txt	\|	1	+
A	tests/test2/err.txt	\|	3	+++
A	tests/test2/expected.txt	\|	2	++
A	tests/test2/input.txt	\|	2	++
A	transliterate.pl	\|	1605	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++