commit 410d20484d0ceda89cfcad455b727bac02d071d9
Author: lumidify <nobody@lumidify.org>
Date: Thu, 26 Mar 2020 07:37:37 +0100
Copy data from old repository
Diffstat:
15 files changed, 1689 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/README b/README
@@ -0,0 +1,5 @@
+See the perldoc in transliterate.pl for documentation (run perldoc -F transliterate.pl).
+
+The git history on this repository is not complete because the original
+repository contained a lot of private information that I didn't want to
+make available here.
diff --git a/tests/alltests.sh b/tests/alltests.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+./runtest.sh test1
+./runtest.sh test2
diff --git a/tests/runtest.sh b/tests/runtest.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debug "$1/input.txt" > runtest_err.txt 2>&1
+diff "$1/expected.txt" runtest.txt > /dev/null
+err1=$?
+diff "$1/err.txt" runtest_err.txt > /dev/null
+if [ $? -eq 0 -a $err1 -eq 0 ]; then
+ echo "OK `cat $1/descr.txt`"
+else
+ echo "FAILED `cat $1/descr.txt`"
+fi
+rm runtest.txt
+rm runtest_err.txt
diff --git a/tests/test1/config b/tests/test1/config
@@ -0,0 +1,15 @@
+split "[ \n]"
+beforeword " "
+afterword "[ \n]"
+
+ignore "../data/ignore.txt"
+table words "../data/words.txt"
+table endings "../data/endings.txt"
+
+expand words same endings
+
+match "\d+" "num_replaced" beginword
+
+group beginword endword
+replace words
+endgroup
diff --git a/tests/test1/descr.txt b/tests/test1/descr.txt
@@ -0,0 +1 @@
+Basic test
diff --git a/tests/test1/err.txt b/tests/test1/err.txt
@@ -0,0 +1,6 @@
+Unknown word: "word20"
+Unknown word: "word01231"
+Word "word0_replaced$word0_replaced2" with 2 word choices.
+Unknown word: "aword1"
+Unknown word: "end3"
+Word "word0_replacedend3_replaced$word0_replaced2end3_replaced" with 2 word choices.
diff --git a/tests/test1/expected.txt b/tests/test1/expected.txt
@@ -0,0 +1,6 @@
+word1_replaced word2_replaced
+num_replacedword1_replaced word9_replaced num_replaced word4_replaced
+word20 word01231 word0_replaced$word0_replaced2
+aword1
+word1_replacedend1_replaced word0_replacedend3_replaced$word0_replaced2end3_replaced end3
+num_replacedword2_replacedend1_replaced
diff --git a/tests/test1/input.txt b/tests/test1/input.txt
@@ -0,0 +1,6 @@
+word1 word2
+123word1 word9 123 word4
+word20 word01231 word0
+aword1
+word1end1 word0end3 end3
+432word2end1
diff --git a/tests/test2/config b/tests/test2/config
@@ -0,0 +1,19 @@
+split "[ \n]+"
+beforeword " "
+afterword "[ \n]"
+tablesep ","
+choicesep "|"
+
+ignore "../data/ignore.txt"
+table words "../data/words1.txt"
+table endings "../data/endings_choices.txt"
+
+expand words other_name endings
+
+match "-\d" "-r" endword
+match "w ord" "word" beginword nofinal
+matchignore "\dhi\d"
+
+group beginword endword
+replace other_name
+endgroup
diff --git a/tests/test2/descr.txt b/tests/test2/descr.txt
@@ -0,0 +1 @@
+Different tablesep and linesep; match nofinal; ending choices
diff --git a/tests/test2/err.txt b/tests/test2/err.txt
@@ -0,0 +1,3 @@
+Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices.
+Unknown word: "4"
+Word "word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2" with 4 word choices.
diff --git a/tests/test2/expected.txt b/tests/test2/expected.txt
@@ -0,0 +1,2 @@
+5hi3 word9_replaced-r word1_replacedend1r1|word1_replacedend1r2-r
+45hi1 word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2
diff --git a/tests/test2/input.txt b/tests/test2/input.txt
@@ -0,0 +1,2 @@
+5hi3 w ord9-0 w ord1end1-1
+45hi1 word0end1
diff --git a/transliterate.pl b/transliterate.pl
@@ -0,0 +1,1605 @@
+#!/usr/bin/env perl
+
+# Proudly written using vi (OpenBSD nvi)
+
+# NOTE: If you're wondering why the error codes used by the functions are so
+# inconsistent, go ask my former self
+
+use strict;
+use warnings;
+use utf8;
+use feature 'unicode_strings';
+use open qw< :encoding(UTF-8) >;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+use Unicode::Normalize;
+use Glib qw/TRUE FALSE/;
+use Gtk2 '-init';
+use Getopt::Long;
+use Pod::Usage;
+use Scalar::Util qw(weaken);
+use File::Basename qw(dirname);
+use File::Spec::Functions qw(rel2abs file_name_is_absolute);
+
+# takes a string of words separated by '$' and returns a new string in the
+# same format with duplicates removed
+sub get_unique_words {
+ my ($word, $config) = @_;
+ my %tmp;
+ my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word;
+ return join $config->{choicesep}, @words_uniq;
+}
+
+# Adds all words in $words to $trie
+# Automatically combines duplicate words with "$" inbetween
+sub add_to_trie {
+ my ($table_name, $trie, $words, $args, $config) = @_;
+ foreach my $word (keys %$words) {
+ my $cur_node = $trie;
+ foreach my $char (split //, $word) {
+ if (!exists($cur_node->{$char})) {
+ $cur_node->{$char}->{"parent"} = $cur_node;
+ # This is required to avoid circular references
+ # (otherwise, the garbage collector doesn't ever
+ # destroy these nodes, leading to the memory
+ # consumption growing without restraint if
+ # "Reload config" is used)
+ weaken($cur_node->{$char}->{"parent"});
+ }
+ $cur_node = $cur_node->{$char};
+ }
+ if (exists($cur_node->{"final"})) {
+ if ($args->{"checkduplicates"}) {
+ warn "WARNING: Duplicate word \"$word\". Last occurrence as " .
+ "\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " .
+ "current occurrence as \"$words->{$word}\" in " .
+ "table \"$table_name\.\n";
+ }
+ $cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config);
+ } else {
+ $cur_node->{"final"} = $words->{$word};
+ if ($args->{"checkduplicates"}) {
+ $cur_node->{"table_name"} = $table_name;
+ }
+ }
+ }
+}
+
+# Prompt user when no replacement has been found for a word
+# $word is the word that was not found and $context* the context,
+# with context*_orig being the original, non-transliterated context.
+# $table_paths is a mapping of table paths (here, only the keys, i.e.
+# the actual paths, are used) to allow the user to choose a table to
+# save a new replacement to.
+# $cur_lineno is a display string to show the current line number
+# $config_error is an optional flag to specify whether an error
+# message should be displayed, informing the user that the config
+# could not be loaded (used when "Reload config" is clicked)
+# Returns: an array reference containing an action to be taken,
+# in the form ["<action name>", <optional args>].
+# See `handle_unknown_word_action` for currently accepted values
+sub prompt_unknown_word {
+ my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $table_paths, $cur_lineno, $config_error) = @_;
+ my $action;
+ my $stop = 0;
+
+ my $window = Gtk2::Window->new('toplevel');
+ $window->signal_connect(delete_event => sub {return FALSE});
+ $window->signal_connect(destroy => sub { Gtk2->main_quit; });
+ $window->set_border_width(10);
+
+ my $vbox = Gtk2::VBox->new(FALSE, 0);
+
+ my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
+ $vbox->pack_start($linelabel, FALSE, FALSE, 10);
+ $linelabel->show;
+
+ my $wordlabel = Gtk2::Label->new("Word not found: $word");
+ $wordlabel->set_alignment(0.0, 0.0);
+ $vbox->pack_start($wordlabel, FALSE, FALSE, 10);
+ $wordlabel->show;
+
+ # Make a text box with the given left and right context and label
+ # Also creates a button allowing the user to set the currently
+ # selected text as the word to be replaced - useful when only part
+ # of the entire word that was not found has to be replaced
+ my $make_context_box = sub {
+ my ($ctxtl, $ctxtr, $lbl) = @_;
+ my $hbox = Gtk2::HBox->new(FALSE, 0);
+ my $label = Gtk2::Label->new($lbl);
+ my $text = Gtk2::TextView->new;
+ my $buffer = $text->get_buffer();
+ $buffer->set_text($ctxtr);
+ my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
+ my $start = $buffer->get_start_iter();
+ $buffer->insert_with_tags($start, $word, $highlight);
+ $start = $buffer->get_start_iter();
+ $buffer->insert($start, $ctxtl);
+ my $button = Gtk2::Button->new("Use selection as word");
+ $button->signal_connect(
+ clicked => sub {
+ if (my ($start, $end) = $buffer->get_selection_bounds()) {
+ $word = $buffer->get_text($start, $end, FALSE);
+ $wordlabel->set_text("Selected: $word");
+ }
+ }, $window);
+ $hbox->pack_start($label, FALSE, FALSE, 0);
+ $hbox->pack_start($text, FALSE, FALSE, 10);
+ $label->show;
+ $text->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 10);
+ $hbox->show;
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ $hbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 0);
+ $hbox->show;
+ };
+ $make_context_box->($contextl, $contextr, "Context: ");
+ $make_context_box->($contextl_orig, $contextr_orig, "Original: ");
+
+ my $hbox = Gtk2::HBox->new(FALSE, 0);
+ my $label = Gtk2::Label->new("Ignore: ");
+ $hbox->pack_start($label, FALSE, FALSE, 0);
+ $label->show;
+ my $button = Gtk2::Button->new("This run");
+ $button->signal_connect(
+ clicked => sub {
+ $action = ["ignore", "run", $word];
+ $window->destroy;
+ }, $window);
+ $hbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+ $button = Gtk2::Button->new("Permanently");
+ $button->signal_connect(
+ clicked => sub {
+ $action = ["ignore", "permanent", $word];
+ $window->destroy;
+ }, $window);
+ $hbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 10);
+ $hbox->show;
+
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ $label = Gtk2::Label->new("Add to list: ");
+ $hbox->pack_start($label, FALSE, FALSE, 0);
+ $label->show;
+ my $path_list = Gtk2::ComboBox->new_text;
+ foreach my $path (keys %$table_paths) {
+ $path_list->append_text($path);
+ }
+ $hbox->pack_start($path_list, FALSE, FALSE, 0);
+ $path_list->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 10);
+ $hbox->show;
+
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ $label = Gtk2::Label->new("Replacement: ");
+ $hbox->pack_start($label, FALSE, FALSE, 0);
+ $label->show;
+ my $replace_entry = Gtk2::Entry->new;
+ $hbox->pack_start($replace_entry, FALSE, FALSE, 0);
+ $replace_entry->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 0);
+ $hbox->show;
+
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ $button = Gtk2::Button->new("Add replacement");
+ $button->signal_connect(
+ clicked => sub {
+ if ($path_list->get_active != -1) {
+ $action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text];
+ $window->destroy;
+ }
+ }, $window);
+ $hbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 0);
+ $hbox->show;
+
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ $button = Gtk2::Button->new("Stop processing");
+ $button->signal_connect(
+ clicked => sub {
+ $stop = 1;
+ $window->destroy;
+ }, $window);
+ $hbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 0);
+ $hbox->show;
+
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ $button = Gtk2::Button->new("Reload config");
+ $button->signal_connect(
+ clicked => sub {
+ $action = ["reload"];
+ $window->destroy;
+ }, $window);
+ $hbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+
+ if ($config_error) {
+ $label = Gtk2::Label->new("Error loading config; see terminal output for details");
+ $hbox->pack_start($label, FALSE, FALSE, 0);
+ $label->show;
+ }
+ $vbox->pack_start($hbox, FALSE, FALSE, 0);
+ $hbox->show;
+
+ $window->add($vbox);
+ $vbox->show;
+ $window->show;
+ Gtk2->main;
+
+ if ($stop) {
+ die "Processing stopped at line $cur_lineno";
+ }
+ if (!$action) {
+ # This action isn't actually handled - it doesn't make much sense,
+ # but at least nothing breaks when the window is closed without
+ # selecting an action
+ $action = ["ignore", "once", $word];
+ }
+ return $action;
+}
+
+# Prompt the user when a word has multiple replacement options (separated by $)
+# $cur_lineno - display string to show the current line number
+sub prompt_choose_word {
+ my ($substrings, $cur_lineno, $config) = @_;
+
+ # make a list of all substrings that contain multiple word options
+ my @replacements;
+ foreach (0..$#$substrings) {
+ if ($substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) {
+ # Format of the elements in @replacements:
+ # [<id of substrings in $substrings>, <replacement word>, <original string>]
+ push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]];
+ }
+ }
+ # no substrings have multiple options
+ return if (!@replacements);
+
+ my $stop = 0;
+ my $cur_replacement = 0;
+
+ my $window = Gtk2::Window->new('toplevel');
+ $window->signal_connect(delete_event => sub {return FALSE});
+ $window->signal_connect(destroy => sub { Gtk2->main_quit; });
+ $window->set_border_width(10);
+
+ my $vbox = Gtk2::VBox->new(FALSE, 0);
+
+ my $linelabel = Gtk2::Label->new("Current line: $cur_lineno");
+ $vbox->pack_start($linelabel, FALSE, FALSE, 0);
+ $linelabel->show;
+
+ my $wordlabel = Gtk2::Label->new("");
+ $wordlabel->set_alignment(0.0, 0.0);
+ $vbox->pack_start($wordlabel, FALSE, FALSE, 0);
+ $wordlabel->show;
+
+ my $undo = Gtk2::Button->new("Undo");
+ $vbox->pack_start($undo, FALSE, FALSE, 0);
+ $undo->show;
+ $undo->set_sensitive(FALSE);
+
+ my $button_vbox = Gtk2::VBox->new(FALSE, 0);
+ $vbox->pack_start($button_vbox, FALSE, FALSE, 0);
+ $button_vbox->show;
+
+ my $accept = Gtk2::Button->new("Accept changes?");
+ $vbox->pack_start($accept, FALSE, FALSE, 0);
+
+ my $hbox = Gtk2::HBox->new(FALSE, 0);
+ my $label = Gtk2::Label->new("Context: ");
+ my $text = Gtk2::TextView->new;
+ my $buffer = $text->get_buffer();
+ my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow");
+ $text->set_editable(FALSE);
+ $hbox->pack_start($label, FALSE, FALSE, 0);
+ $hbox->pack_start($text, FALSE, FALSE, 10);
+ $label->show;
+ $text->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 10);
+ $hbox->show;
+
+ $hbox = Gtk2::HBox->new(FALSE, 0);
+ my $stop_button = Gtk2::Button->new("Stop processing");
+ $hbox->pack_start($stop_button, FALSE, FALSE, 0);
+ $stop_button->show;
+ $vbox->pack_start($hbox, FALSE, FALSE, 0);
+ $hbox->show;
+
+ $window->add($vbox);
+ $vbox->show;
+ $window->show;
+
+ # generate the context to the left and to the right of the current word being replaced
+ my $get_context = sub {
+ my ($contextl, $contextr) = ("", "");
+ my $tmp_replacement = 0;
+ foreach (0..$#$substrings) {
+ my $word = $substrings->[$_]->[1];
+ if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) {
+ $word = $replacements[$tmp_replacement]->[1];
+ $tmp_replacement++;
+ }
+ # When nothing is left to replace, the entire string is in $contextl
+ if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) {
+ $contextl .= $word;
+ } elsif ($_ > $replacements[$cur_replacement]->[0]) {
+ $contextr .= $word;
+ }
+ }
+ return ($contextl, $contextr);
+ };
+
+ # fill the text buffer with the context and current word, highlighting the word
+ # if $cur_replacement is after the end of @replacements, don't highlight anythiing
+ # (this happens when all words have been replaced and the user only needs to accept the changes)
+ my $fill_text_buffer = sub {
+ my $start = $buffer->get_start_iter();
+ my $end = $buffer->get_end_iter();
+ $buffer->delete($start, $end);
+ my ($contextl, $contextr) = $get_context->();
+ $buffer->set_text($contextr);
+ if ($cur_replacement <= $#replacements) {
+ $start = $buffer->get_start_iter();
+ $buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight);
+ }
+ $start = $buffer->get_start_iter();
+ $buffer->insert($start, $contextl);
+ };
+
+ # fill $button_vbox with the word options for the current word
+ my $fill_button_vbox;
+ $fill_button_vbox = sub {
+ $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
+ my $word = $replacements[$cur_replacement]->[1];
+ $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
+ my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1];
+ foreach my $word_choice (@choices) {
+ my $button = Gtk2::Button->new($word_choice);
+ $button->signal_connect(
+ clicked => sub {
+ $replacements[$cur_replacement]->[1] = $word_choice;
+ $undo->set_sensitive(TRUE);
+ $cur_replacement++;
+ $fill_text_buffer->();
+ if ($cur_replacement > $#replacements) {
+ $button_vbox->foreach(sub {my $child = shift; $child->destroy();});
+ $accept->show;
+ $accept->grab_focus;
+ $wordlabel->set_text("");
+ return;
+ }
+ $fill_button_vbox->();
+ }, $window);
+ $button_vbox->pack_start($button, FALSE, FALSE, 0);
+ $button->show;
+ }
+ };
+
+ $undo->signal_connect(
+ clicked => sub {
+ if ($cur_replacement > 0) {
+ $cur_replacement--;
+ if ($cur_replacement == 0) {
+ $undo->set_sensitive(FALSE);
+ }
+ $replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2];
+ $fill_button_vbox->();
+ $fill_text_buffer->();
+ $accept->hide;
+ my $word = $replacements[$cur_replacement]->[1];
+ $wordlabel->set_text("Word \"$word\" has multiple replacement options:");
+ }
+ }, $window);
+
+ $accept->signal_connect(
+ clicked => sub {
+ # write the changes to the original $substrings
+ foreach (@replacements) {
+ $substrings->[$_->[0]]->[1] = $_->[1];
+ }
+ $window->destroy;
+ }, $window);
+
+ $stop_button->signal_connect(
+ clicked => sub {
+ $stop = 1;
+ $window->destroy;
+ }, $window);
+
+ $fill_button_vbox->();
+ $fill_text_buffer->();
+
+ Gtk2->main;
+ if ($stop) {
+ die "Processing stopped at line $cur_lineno";
+ }
+}
+
+my $ID = 0;
+my $STRING = 1;
+
+# Parse the configuration file into data type (currently only ID and STRING)
+sub parse_config {
+ my $f = shift;
+ my $fh;
+ if (!open($fh, "<", $f)) {
+ warn "Can't open config file \"$f\"!\n";
+ return undef;
+ }
+ my $line;
+ my @commands;
+ my $state = 0;
+ my $IN_ID = 1;
+ my $IN_STR = 2;
+ my $cur_val = "";
+ while ($line = <$fh>) {
+ chomp($line);
+ $state = 0;
+ push(@commands, []);
+ foreach my $char (split(//, $line)) {
+ if ($char eq "#") {
+ last;
+ } elsif ($char eq '"') {
+ if ($state & $IN_STR) {
+ push(@{$commands[-1]}, {type => $STRING, value => $cur_val});
+ $cur_val = "";
+ $state &= ~$IN_STR;
+ } else {
+ $cur_val = "";
+ $state |= $IN_STR;
+ }
+ } elsif ($char eq " ") {
+ if ($state & $IN_ID) {
+ push(@{$commands[-1]}, {type => $ID, value => $cur_val});
+ $state &= ~$IN_ID;
+ $cur_val = "";
+ } elsif ($state) {
+ $cur_val .= $char;
+ }
+ } else {
+ if (!$state) {
+ $state |= $IN_ID;
+ }
+ $cur_val .= $char;
+ }
+ }
+ if ($state & $IN_STR) {
+ warn "ERROR: Unterminated string in config:\n$line";
+ return undef;
+ } elsif ($cur_val) {
+ push(@{$commands[-1]}, {type => $ID, value => $cur_val});
+ $cur_val = "";
+ }
+ # FIXME: check if this works
+ if ($#{$commands[-1]} == -1) {
+ pop(@commands);
+ }
+ }
+ close($fh);
+
+ return \@commands;
+}
+
+# Load a file of replacement words into a hash table
+sub load_table {
+ my ($filename, $args, $config) = @_;
+ my $fh;
+ # if the paths are relative, find their absolute location based
+ # on the location of the config file
+ if (!file_name_is_absolute $filename) {
+ my $config_dir = dirname $args->{"config"};
+ $filename = rel2abs($filename, $config_dir);
+ }
+ if (!open($fh, "<", $filename)) {
+ warn "Can't open table file \"$filename\"!\n";
+ return undef;
+ }
+ my $line;
+ my @words;
+ my %table;
+ while ($line = <$fh>) {
+ chomp $line;
+ next if (!$line);
+ @words = split(/\Q$config->{tablesep}\E/, $line);
+ if (@words != 2) {
+ warn "ERROR: Malformed line in file \"$filename\":\n$line\n";
+ close $fh;
+ return undef;
+ }
+ my $key = NFD($words[0]);
+ if (exists $table{$key}) {
+ if ($args->{"checkduplicates"}) {
+ warn "WARNING: Duplicate word in file \"$filename\": " .
+ "\"$key\", with replacement \"$words[1]\", " .
+ "already exists with replacement \"$table{$key}\".\n";
+ }
+ $table{$key} = get_unique_words($table{$key} . $config->{choicesep} . $words[1], $config);
+ } else {
+ $table{$key} = $words[1];
+ }
+ }
+ close $fh;
+ return \%table;
+}
+
+# Load table for words to ignore - only the keys matter, since there is no replacement
+sub load_ignore_table {
+ my ($filename, $args) = @_;
+ my $line;
+ my %table;
+ if (!file_name_is_absolute $filename) {
+ my $config_dir = dirname $args->{"config"};
+ $filename = rel2abs($filename, $config_dir);
+ }
+ my $fh;
+ if (!open($fh, "<", $filename)) {
+ warn "Can't open ignore file \"$filename\"!\n";
+ return undef;;
+ }
+ while ($line = <$fh>) {
+ chomp $line;
+ if ($line) {
+ $table{NFD($line)} = "";
+ }
+ }
+ close($fh);
+ return \%table;
+}
+
+# Generate all forms of a word by combining it with endings
+# Returns:
+# 0 - an error occurred
+# 1 - everything's fine
+sub expand_table {
+ my ($cmd, $tables, $config) = @_;
+ my $table_name = $cmd->[1]->{"value"};
+ my $new_table_name = $cmd->[2]->{"value"};
+ my $forms_name = $cmd->[3]->{"value"};
+ if ($new_table_name eq "same") {
+ $new_table_name = $table_name;
+ }
+ if (!exists($tables->{$table_name})) {
+ warn "expand_table: table \"$table_name\" does not exist.\n";
+ return 0;
+ }
+ if (!exists($tables->{$forms_name})) {
+ warn "expand_table: table \"$forms_name\" does not exist.\n";
+ return 0;
+ }
+ my $table = $tables->{$table_name};
+ my $forms = $tables->{$forms_name};
+
+ my $noroot = 0;
+ if ($#$cmd >= 4 && $cmd->[4]->{"value"} eq "noroot") {
+ $noroot = 1;
+ }
+
+ my %new_table;
+ foreach my $word (keys %$table) {
+ foreach my $ending (keys %$forms) {
+ # Some words and/or endings have multiple options, separated by $config->{choicesep}
+ # These must be temporarily separated in order to properly generate the forms
+ my @word_options;
+ my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word});
+ my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending});
+ foreach my $stem_option (@stem_options) {
+ foreach my $ending_option (@ending_options) {
+ push(@word_options, $stem_option . $ending_option);
+ }
+ }
+ $new_table{$word . $ending} = join($config->{choicesep}, @word_options);
+ }
+ if (!$noroot) {
+ $new_table{$word} = $table->{$word};
+ }
+ }
+ $tables->{$new_table_name} = \%new_table;
+ return 1;
+}
+
+# Check if the number and types of arguments given to a config command are right
+# Returns:
+# 0 - an error occurred
+# 1 - everything's fine
+sub check_args {
+ my ($args, $cmd) = @_;
+ my $cmd_name = $cmd->[0]->{"value"};
+ if ($#$cmd - 1 < $#$args) {
+ my $err = "ERROR: not enough arguments for command \"$cmd_name\":";
+ foreach my $arg (@{$cmd}[1..$#$cmd]) {
+ $err .= " " . $arg->{"value"}
+ }
+ warn "$err\n";
+ return 0;
+ }
+ my $arg_num = 0;
+ while ($arg_num <= $#$args) {
+ if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) {
+ my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n";
+ $err .= "Expected:";
+ foreach my $arg_type (@$args) {
+ $err .= " ID" if ($arg_type == $ID);
+ $err .= " STRING" if ($arg_type == $STRING);
+ }
+ $err .= "\nReceived:";
+ foreach my $arg (@{$cmd}[1..$#$cmd]) {
+ $err .= " ID" if ($arg->{"type"} == $ID);
+ $err .= " STRING" if ($arg->{"type"} == $STRING);
+ }
+ warn "$err\n";
+ return 0;
+ }
+ $arg_num++;
+ }
+ return 1;
+}
+
+# Interpret the config file - load and expand tables, etc.
+# $config_list - the list returned by parse_config
+sub interpret_config {
+ my ($config_list, $args) = @_;
+ my %tables;
+ my %config;
+ $config{"table_paths"} = {};
+ $config{"ignore"} = "";
+ $config{"ignore_words"} = {};
+ $config{"split"} = "\\s";
+ $config{"beforeword"} = "\\s";
+ $config{"afterword"} = "\\s";
+ $config{"tablesep"} = "\t";
+ $config{"choicesep"} = "\$";
+ $config{"replacements"} = [];
+ my %tmp_table_paths;
+ my %mandatory_args = (
+ "ignore" => [$STRING],
+ "table" => [$ID],
+ "expand" => [$ID, $ID, $ID],
+ "match" => [$STRING, $STRING],
+ "matchignore" => [$STRING],
+ "replace" => [$ID],
+ "split" => [$STRING],
+ "beforeword" => [$STRING],
+ "afterword" => [$STRING],
+ "tablesep" => [$STRING],
+ "choicesep" => [$STRING],
+ "group" => [],
+ "endgroup" => []
+ );
+ my $in_group = 0;
+ foreach my $cmd (@$config_list) {
+ # All load statements must be before expand statements
+ # All expand, beforeword, and afterword statements must be before replace statements
+ if ($cmd->[0]->{"type"} == $ID) {
+ if (!exists($mandatory_args{$cmd->[0]->{"value"}})) {
+ warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n";
+ return undef;
+ }
+ if (!check_args($mandatory_args{$cmd->[0]->{"value"}}, $cmd)) {
+ return undef;
+ }
+ if ($cmd->[0]->{"value"} eq "table") {
+ my $table = load_table $cmd->[2]->{"value"}, $args, \%config;
+ if (!$table) {
+ return undef;
+ }
+ $tables{$cmd->[1]->{"value"}} = $table;
+ $tmp_table_paths{$cmd->[1]->{"value"}} = $cmd->[2]->{"value"};
+ } elsif ($cmd->[0]->{"value"} eq "expand") {
+ # FIXME: need to handle table paths when a new table name is used for the expansion
+ if (!expand_table($cmd, \%tables, \%config)) {
+ return undef;
+ }
+ } elsif ($cmd->[0]->{"value"} eq "group") {
+ if ($in_group) {
+ warn "ERROR: New group started without ending last one in config\n";
+ return undef;
+ }
+ push @{$config{"replacements"}}, {"type" => "group", "words" => {}};
+ for (1..$#$cmd) {
+ $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+ }
+ $in_group = 1;
+ } elsif ($cmd->[0]->{"value"} eq "endgroup") {
+ if (!$in_group) {
+ warn "ERROR: endgroup command called while not in group\n";
+ return undef;
+ }
+ $in_group = 0;
+ } elsif ($cmd->[0]->{"value"} eq "match") {
+ if ($in_group) {
+ warn "ERROR: match command is invalid inside group\n";
+ return undef;
+ }
+ push @{$config{"replacements"}}, {
+ "type" => "match",
+ "search" => NFD($cmd->[1]->{"value"}),
+ "replace" => $cmd->[2]->{"value"}};
+ for (3..$#$cmd) {
+ $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+ }
+ } elsif ($cmd->[0]->{"value"} eq "matchignore") {
+ if ($in_group) {
+ warn "ERROR: matchignore command is invalid inside group\n";
+ return undef;
+ }
+ push @{$config{"replacements"}}, {"type" => "match", "search" => NFD($cmd->[1]->{"value"})};
+ for (2..$#$cmd) {
+ $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1;
+ }
+ } elsif ($cmd->[0]->{"value"} eq "replace") {
+ if (!$in_group) {
+ warn "ERROR: replace command called while not in group\n";
+ return undef;
+ }
+ my $table = $cmd->[1]->{"value"};
+ if (!exists($tables{$table})) {
+ warn "ERROR: nonexistent table \"$table\" in replace statement.\n";
+ return undef;
+ }
+ if (exists($tmp_table_paths{$table})) {
+ $config{"table_paths"}->{$tmp_table_paths{$table}} = [$#{$config{"replacements"}}, $table];
+ }
+ # Note: we don't need to check if $table{"choicesep"} was defined
+ # here since we can't ever get this far without first having
+ # loaded a table anyways
+ add_to_trie($table, $config{"replacements"}->[-1]->{"words"}, $tables{$table}, $args, \%config);
+ } elsif ($cmd->[0]->{"value"} eq "split") {
+ $config{"split"} = $cmd->[1]->{"value"};
+ } elsif ($cmd->[0]->{"value"} eq "beforeword") {
+ $config{"beforeword"} = $cmd->[1]->{"value"};
+ } elsif ($cmd->[0]->{"value"} eq "afterword") {
+ $config{"afterword"} = $cmd->[1]->{"value"};
+ } elsif ($cmd->[0]->{"value"} eq "tablesep") {
+ $config{"tablesep"} = $cmd->[1]->{"value"};
+ } elsif ($cmd->[0]->{"value"} eq "choicesep") {
+ $config{"choicesep"} = $cmd->[1]->{"value"};
+ } elsif ($cmd->[0]->{"value"} eq "ignore") {
+ $config{"ignore"} = $cmd->[1]->{"value"};
+ my $table = load_ignore_table $cmd->[1]->{"value"}, $args;
+ if (!$table) {
+ return undef;
+ }
+ $config{"ignore_words"} = $table;
+ } else {
+ warn "ERROR: unknown command \"" . $cmd->[0]->{"value"} . "\" in config.\n";
+ return undef;
+ }
+ } else {
+ my $err = "ERROR: line does not start with command:\n";
+ foreach my $cmd_part (@$cmd) {
+ $err .= $cmd_part->{"value"};
+ }
+ warn "$err\n";
+ return undef;
+ }
+ }
+ if ($in_group) {
+ warn "ERROR: unclosed group in config\n";
+ return undef;
+ }
+ if (!$config{"ignore"}) {
+ warn "ERROR: no file of words to ignore specified.\n";
+ return undef;
+ }
+ return \%config;
+}
+
+# load the config file
+# Returns:
+# the config hash or undef if an error occurred
+sub load_config {
+ my $args = shift;
+ my $config_list = parse_config($args->{"config"});
+ if (!$config_list) {
+ return undef;
+ }
+ return interpret_config $config_list, $args;
+}
+
+# Handle the action returned by `prompt_unknown_word`
+# $config - the current program config
+# $args - the command line arguments
+# Currently accepted values for $action:
+# ["ignore", "run", $word] - only ignore $word for the rest of this run
+# ["ignore", "permanent", $word] - write $word to the permanent ignore file
+# ["add", $word, $replace_word, $table_path] - add $word to the table
+# corresponding to $table_path with $replace_word as its replacement. Note that
+# only tables directly corresponding to paths work here - tables that only
+# were created through "expand" in the config aren't ever shown separately
+# in `prompt_unknown_word`
+# ["reload"] - reload the configuration file
+# Returns:
+# 0 - nothing needs to be done
+# 1 - the current line needs to be re-transliterated with the new config
+# 2 - an error occurred while reloading the config
+sub handle_unknown_word_action {
+ my ($action, $config, $args) = @_;
+ if ($action->[0] eq "ignore") {
+ $config->{"ignore_words"}->{$action->[2]} = "";
+ if ($action->[1] eq "permanent") {
+ my $fh;
+ if (!open($fh, ">>", $config->{"ignore"})) {
+ warn "ERROR: Can't open ignore file for appending.\n";
+ return 1;
+ }
+ print($fh $action->[2] . "\n");
+ close($fh);
+ } elsif ($action->[1] eq "run") {
+ # Print to error file if ignore isn't permanent
+ return 0 if ($args->{"errors"} eq "");
+ my $fh;
+ if (!open($fh, ">>", $args->{"errors"})) {
+ warn "ERROR: Can't open error file \"$args->{errors}\".\n";
+ return 0;
+ }
+ print($fh $action->[2] . "\n");
+ close($fh);
+ }
+ return 0;
+ } elsif ($action->[0] eq "add") {
+ my $table_path = $action->[3];
+ my $word = $action->[1];
+ my $replace_word = $action->[2];
+ my $fh;
+ if (!open($fh, ">>", $table_path)) {
+ warn "ERROR: Can't open table file \"$table_path\" for appending.\n";
+ return 1;
+ }
+ print($fh $word . "," . $replace_word . "\n");
+ close($fh);
+ my $replacement_id = $config->{"table_paths"}->{$table_path}->[0];
+ my $trie = $config->{"replacements"}->[$replacement_id]->{"words"};
+ add_to_trie($config->{"table_paths"}->{$table_path}->[1], $trie, {$word => $replace_word});
+ return 1;
+ } elsif ($action->[0] eq "reload") {
+ my $new_config = load_config $args;
+ if ($new_config) {
+ %$config = %$new_config;
+ return 1;
+ } else {
+ return 2;
+ }
+ }
+}
+
+# Split $substrings into single words based on the "split" option
+# in $config.
+# $substrings can already be split at this point; only the
+# ones that haven't been transliterated yet are modified
+sub split_words {
+ my ($config, $substrings) = @_;
+ my @substrings_new;
+ my $split_re = qr/($config->{"split"})/;
+ foreach my $cur_substr (@$substrings) {
+ if ($cur_substr->[0] == 1) {
+ push(@substrings_new, $cur_substr);
+ next;
+ }
+
+ my @words = split(/$split_re/, $cur_substr->[1]);
+ for my $i (0..$#words) {
+ # Word is not delimiter
+ # Split produces an empty field at the beginning if the string
+ # starts with the delimiter
+ if ($i % 2 == 0) {
+ push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne '');
+ } else {
+ # Delimiters can count as already replaced
+ push(@substrings_new, [1, $words[$i], $words[$i]]);
+ }
+ }
+ }
+ @$substrings = @substrings_new;
+}
+
+# small helper function to add a untransliterated string to the last substring
+# if that is not transliterated yet, or push it onto @$substrings otherwise
+# -> used to keep all untransliterated text in one piece
+# since this is also used for the "nofinal" attribute on "match", it takes
+# an original and replaced string (since, when using "match" and "nofinal",
+# the original string was replaced, but is still marked as unknown)
+sub push_unknown {
+ my ($substrings, $orig, $replaced) = @_;
+ $replaced //= $orig;
+ if (@$substrings && !$substrings->[-1]->[0]) {
+ $substrings->[-1]->[1] .= $replaced;
+ $substrings->[-1]->[2] .= $orig;
+ } else {
+ push(@$substrings, [0, $replaced, $orig]);
+ }
+}
+
+# Replace a word in $substrings based on $replace_config using regex
+# $replace_config->{"search"} is the word to replace
+# $replace_config->{"replace"} is the replacement word
+# if $replace_config->{"replace"} is undefined, just splits
+# $substrings at the the match and marks that the match has
+# been transliterated - currently used for "matchignore"
+# $replace_config->{"beginword"}, $replace_config->{"afterword"} -
+# specifies if the match is only valid when $config->{"beforeword"}
+# or $config->{"afterword"} occur before or after it, respectively
+sub replace_match {
+ my ($config, $replace_config, $substrings) = @_;
+ my $beginword = exists $replace_config->{"beginword"};
+ my $endword = exists $replace_config->{"endword"};
+ my $fullword = $beginword && $endword;
+ my $beforeword = $config->{"beforeword"};
+ my $afterword = $config->{"afterword"};
+ my $word = $replace_config->{"search"};
+ my $replace_word = $replace_config->{"replace"};
+ if ($fullword) {
+ $word = qr/(\A|$beforeword)$word(\z|$afterword)/;
+ } elsif ($beginword) {
+ $word = qr/(\A|$beforeword)$word/;
+ } elsif ($endword) {
+ $word = qr/$word(\z|$afterword)/;
+ } else {
+ $word = qr/$word/;
+ }
+
+ my @substrings_new;
+ # only modify $substrings at all if the word was found
+ my $found_word = 0;
+ my $last_idx;
+ # @substrings_new is only used if needed to improve efficiency
+ foreach my $i (0..$#$substrings) {
+ if ($substrings->[$i]->[0]) {
+ # FIXME: is there a way to make it more efficient by keeping the old array?
+ # This is a major bottleneck
+ # Note: the above statement *may* be a bit exaggerated
+ if ($found_word) {
+ push(@substrings_new, $substrings->[$i]);
+ }
+ next;
+ }
+ $last_idx = 0;
+ my $i0 = 0;
+ my $i1 = 0;
+ while ($substrings->[$i]->[1] =~ m/$word/g) {
+ if (!$found_word) {
+ $found_word = 1;
+ if ($i != 0) {
+ push(@substrings_new, @{$substrings}[0..$i-1]);
+ }
+ }
+ # This mess is needed to reliably match $beforeword and $afterword and put the captured
+ # "splitting" characters back into the text. This would be much easier just using
+ # a lookbehind and lookahead, but I couldn't find a way to also match beginning and
+ # end of string that way.
+ $i0 = $-[0];
+ $i1 = $+[0];
+ if ($fullword) {
+ $i0 += length($1);
+ $i1 -= length($2);
+ # pos need to be decreased so that matches still work right next to each other
+ pos($substrings->[$i]->[1]) -= length($2);
+ } elsif ($beginword) {
+ $i0 += length($1);
+ } elsif ($endword) {
+ $i1 -= length($1);
+ pos($substrings->[$i]->[1]) -= length($1);
+ }
+ if ($last_idx != $i0) {
+ my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx);
+ push_unknown \@substrings_new, $unknown;
+ }
+ my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0);
+ my $replace_str = $replace_word // $orig_str;
+ if ($replace_config->{"nofinal"}) {
+ push_unknown \@substrings_new, $orig_str, $replace_str;
+ } else {
+ push(@substrings_new, [1, $replace_str, $orig_str]);
+ }
+ $last_idx = $i1;
+ }
+ if ($last_idx < length($substrings->[$i]->[1]) && $found_word) {
+ my $unknown = substr($substrings->[$i]->[1], $last_idx);
+ push_unknown \@substrings_new, $unknown;
+ }
+ }
+ if ($found_word) {
+ @$substrings = @substrings_new;
+ }
+}
+
+# Replace a group, i.e. replace all the words in a trie
+# $replace_config->{"words"} - the root node of the trie
+# $replace_config->{"beginword"}, $replace_config->{"endword"} -
+# same as in `replace_match`
+sub replace_group {
+ my ($config, $replace_config, $substrings) = @_;
+ my @substrings_new;
+ # Recurse backwords towards the root node of the trie to find the first
+ # node with a key "final" which satisfies the ending condition (if "endword" is set)
+ # Returns the id *after* the last match and the node that was found
+ # with the key "final" (or undef, if nothing matched)
+ my $find_final = sub {
+ my ($i, $tmp_cur_node, $s) = @_;
+ do {
+ my $after_ch = substr($s->[1], $i, 1);
+ if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"endword"}) ||
+ $after_ch eq "" || $after_ch =~ $config->{"afterword"})) {
+ return ($i, $tmp_cur_node);
+ }
+ $i--;
+ } while ($tmp_cur_node = $tmp_cur_node->{"parent"});
+ # none of the points were appropriate for breaking the word, so
+ # $tmp_cur_node now points to the nonexistent parent node of the
+ # root node
+ return ($i, undef);
+ };
+ foreach my $s (@$substrings) {
+ if ($s->[0]) {
+ push(@substrings_new, $s);
+ next;
+ }
+ my $cur_node = $replace_config->{"words"};
+ my $start_i = 0;
+ my $i = 0;
+ # This deliberately goes off the end of the string! $cur_node is always "one behind" $i
+ # since the node is only advanced in the iteration *after* $i has increased, meaning that
+ # $i has to already be after the end of the string for the first if statement to definitely
+ # fail, causing the elsif statement to handle that case
+ while ($i <= length($s->[1])) {
+ # This works even when $i is one index after the end of the string - it just returns "" then
+ my $ch = substr($s->[1], $i, 1);
+ if (exists $cur_node->{$ch}) {
+ if ($cur_node == $replace_config->{"words"}) {
+ my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : "";
+ if (exists($replace_config->{"beginword"}) &&
+ $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) {
+ push_unknown \@substrings_new, $ch;
+ $i++;
+ next;
+ }
+ $start_i = $i;
+ }
+ $cur_node = $cur_node->{$ch};
+ } elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) {
+ my $tmp_cur_node = $cur_node;
+ ($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s);
+ if (!defined($tmp_cur_node)) {
+ push_unknown \@substrings_new, substr($s->[1], $i + 1, 1);
+ $i += 2;
+ } else {
+ push(@substrings_new, [1, $tmp_cur_node->{"final"}, substr($s->[1], $start_i, $i-$start_i)]);
+ }
+ $cur_node = $replace_config->{"words"};
+ next;
+ } else {
+ push_unknown \@substrings_new, $ch;
+ }
+ $i++;
+ }
+ }
+ @$substrings = @substrings_new;
+}
+
+# Perform all replacements on $line based on $config
+# $substrings: array of arrays - each one has three elements:
+# first 0 or 1, indicating if the substring following it has already
+# been replaced or not (1 means it has been replaced), then the
+# transliterated string, and lastly the original string.
+# If the first element is 0, the second two elements are obviously same
+sub replace_line {
+ my ($config, $line) = @_;
+ my $substrings = [[0, $line, $line]];
+ foreach my $replacement (@{$config->{"replacements"}}) {
+ if ($replacement->{"type"} eq "match") {
+ replace_match($config, $replacement, $substrings);
+ } elsif ($replacement->{"type"} eq "group") {
+ replace_group($config, $replacement, $substrings);
+ }
+ }
+ # splits all words at the end so that the splitting characters
+ # aren't taken as unknown words and the unknown words are (hopefully)
+ # in better chunks for prompting the user about them
+ split_words($config, $substrings);
+
+ return $substrings;
+}
+
+# NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST!
+# If an ignore word is added which is attached to a word that should have a replacement
+# added and just that word is selected to ignore, you never get a chance to add a
+# replacement for the other word that it is attached to
+
+# Handle unknown words
+# $substrings - the current substrings with unknown words
+# $config - the program config
+# $args - the command line args
+# $cur_lineno - display string to show the user the current line number
+# Returns:
+# 1 - the current line must be replaced again with the new config
+# 0 - everything's fine
+sub get_unknown_words {
+ my ($substrings, $config, $args, $cur_lineno) = @_;
+ foreach my $i (0..$#$substrings) {
+ my $word = $substrings->[$i];
+ if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) {
+ my $contextl = "";
+ my $contextl_orig = "";
+ foreach my $j (0..$i-1) {
+ $contextl .= $substrings->[$j]->[1];
+ $contextl_orig .= $substrings->[$j]->[2];
+ }
+ my $contextr = "";
+ my $contextr_orig = "";
+ foreach my $j ($i+1..$#$substrings) {
+ $contextr .= $substrings->[$j]->[1];
+ $contextr_orig .= $substrings->[$j]->[2];
+ }
+ my $action = prompt_unknown_word($contextl, $contextl_orig,
+ $word->[1], $contextr, $contextr_orig,
+ $config->{"table_paths"}, "$cur_lineno"
+ );
+ # if $ret == 2, config could not be loaded
+ # if $ret == 1, line must be redone with new config
+ my $ret = handle_unknown_word_action($action, $config, $args);
+ # keep retrying until the user chooses an action which
+ # didn't throw an error
+ while ($ret == 2) {
+ $action = prompt_unknown_word($contextl, $contextl_orig,
+ $word->[1], $contextr, $contextr_orig,
+ $config->{"table_paths"}, "$cur_lineno", 1);
+ $ret = handle_unknown_word_action($action, $config, $args);
+ }
+ # re-transliterate the line with the new config
+ if ($ret == 1) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+# Main replacement function
+# Opens the input file ($args->{"input"}) and writes the transliterated text
+# to the file handle $outputfh, prompting the user for unknown words or
+# word choices (if those aren't disabled on the command line)
+sub replace {
+ my ($config, $args, $outputfh) = @_;
+ # Is there *really* no more efficient way to get the total number of lines?
+ open my $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n";
+ my $total_lines = 0;
+ while (<$fh>) {$total_lines++};
+ close $fh;
+ open $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n";
+ while (my $line = <$fh>) {
+ next if $. < $args->{"start"};
+ my $nfd_line = NFD($line);
+ my $substrings = replace_line($config, $nfd_line);
+
+ if (!$args->{"nounknowns"}) {
+ # re-transliterate the string if the config was reloaded
+ while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) {
+ $substrings = replace_line($config, $nfd_line);
+ }
+ } elsif ($args->{"debug"}) {
+ foreach my $s (@$substrings) {
+ if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) {
+ warn "Unknown word: \"$s->[1]\"\n";
+ }
+ }
+ }
+ if (!$args->{"nochoices"}) {
+ prompt_choose_word($substrings, "$./$total_lines", $config);
+ } elsif ($args->{"debug"}) {
+ foreach my $s (@$substrings) {
+ if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) {
+ my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1];
+ warn "Word \"$s->[1]\" with $num_choices word choices.\n";
+ }
+ }
+ }
+
+ foreach (@$substrings) {
+ print $outputfh $_->[1];
+ }
+ }
+ close $fh;
+}
+
+my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => "");
+GetOptions(
+ \%args, "debug",
+ "nochoices", "nounknowns",
+ "force", "start=i",
+ "output=s", "config=s",
+ "errors=s", "help",
+ "checkduplicates") or pod2usage(1);
+
+pod2usage(-exitval => 0, -verbose => 2) if $args{"help"};
+pod2usage(1) if $#ARGV != 0 && !$args{"checkduplicates"};
+
+if (!-f $args{"config"}) {
+ die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n";
+}
+my $config = load_config \%args;
+if (!$config) {
+ die "ERROR: Invalid config\n";
+}
+exit 0 if ($args{"checkduplicates"});
+
+my $input = $ARGV[0];
+if (!-f $input) {
+ die "ERROR: input file \"$input\" does not exist or is not a file.\n";
+}
+$args{"input"} = $input;
+
+if (-f $args{"errors"} && !$args{"force"}) {
+ my $choice = "";
+ while ($choice !~ /^[yn]$/) {
+ print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? ";
+ $choice = <STDIN>;
+ chomp $choice;
+ }
+ die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y";
+}
+
+my $outputfh;
+if ($args{"output"} eq "") {
+ warn "WARNING: no output file supplied; printing to STDOUT\n";
+ open $outputfh, ">&STDOUT";
+} elsif (-f $args{"output"} && !$args{"force"}) {
+ my $choice = "";
+ while ($choice !~ /^[aoe]$/) {
+ print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? ";
+ $choice = <STDIN>;
+ chomp $choice;
+ }
+ if ($choice eq "a") {
+ open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
+ } elsif ($choice eq "e") {
+ die "ERROR: \"$args{output}\" already exists.\n";
+ } else {
+ open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
+ }
+} else {
+ open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n";
+}
+
+replace($config, \%args, $outputfh);
+close $outputfh;
+
+__END__
+
+=head1 NAME
+
+transliterate.pl - Transliterate text files
+
+=head1 SYNOPSIS
+
+transliterate.pl [options][input file]
+
+Start the transliteration engine with the given file as input.
+
+=head1 OPTIONS
+
+=over 8
+
+=item B<< --output <filename> >>
+
+Sets the output file to print to.
+
+If the file exists already and C<--force> is not set, the user is asked
+if the file should be overwritten or appended to.
+
+B<Default:> STDOUT (print to terminal)
+
+=item B<< --config <filename> >>
+
+Sets the configuration file to use.
+
+B<Default:> "config"
+
+=item B<--checkduplicates>
+
+Prints all duplicate words within single table files and across tables
+that are replaced within the same group, then exits the program.
+
+Note that this simply prints B<all> duplicates, even ones that are
+legitimate. When duplicates are found during normal operation of
+the program, they are simply combined in exactly the same way as the
+regular word choices.
+
+=item B<--nochoices>
+
+Disables prompting for the right word when multiple replacement words exist.
+
+This can be used to "weed out" all the unknown words before
+commencing the laborious task of choosing the right word every time
+multiple options exist.
+
+=item B<--nounknowns>
+
+Disables prompting for the right word when a word is not found in the database.
+
+This can be used together with B<--nochoices> to perform a quick test of how
+well the actual engine is working without having to click through all the
+prompts.
+
+=item B<--debug>
+
+This option is only useful for automatic testing of the transliteration engine.
+
+If C<--nochoices> is enabled, each word in the input with multiple choices will
+be output, along with the number of choices (can be used to test the proper
+functioning of C<choicesep> in the config file).
+
+If C<--nounknowns> is enabled, each unknown word in the input is printed
+(can be used to test that the C<ignore> options are working correctly).
+
+=item B<--force>
+
+Always overwrites the output and error file without asking.
+
+=item B<< --start <line number> >>
+
+Starts at the given line number instead of the beginning of the file.
+
+Note: when "Stop processing" is pressed, the current line number is
+printed out. This is the current line that was being processed, so it
+has not been printed to the output file yet and thus the program must
+be resumed at that line, not the one afterwards.
+
+=item B<< --errors <filename> >>
+
+Specifies a file to write errors in. Note that this does not refer to
+actual errors, but to any words that were temporarily ignored
+(i.e. words for which "Ignore: This run" was clicked).
+
+If no file is specified, nothing is written. If a file is specified
+that already exists and C<--force> is not set, the user is prompted
+for action.
+
+=item B<--help>
+
+Display the full documentation.
+
+=back
+
+=head1 DESCRIPTION
+
+B<transliterate.pl> will read the given input file and transliterate it
+based on the given configuration file, prompting the user for action if
+a word has multiple replacement options or is not found in the database.
+
+See L</"CONFIGURATION"> for details on what is possible.
+
+=head1 WORD CHOICE WINDOW
+
+The word choice window is opened any time one word has multiple replacement
+options and prompts the user to choose one.
+
+For each word with multiple options, the user must choose the right option
+and then press "Accept changes" to finalize the transliteration of the
+current line. Before the line is finalized, the user may press "Undo" to
+undo any changes on the current line.
+
+"Stop processing" will exit the program and print the line number that was
+currently being processed.
+
+=head1 UNKNOWN WORD WINDOW
+
+The unknown word window is opened any time a word could not be replaced.
+
+Both the context from the original language and the context from the
+transliterated version (so far) is shown. If a part of the text is
+selected in one of the text boxes and "Use selection as word" is
+pressed for the appropriate box, the selected text is used for the
+action that is taken subsequently.
+
+The possible actions are:
+
+=over 8
+
+=item Ignore
+
+"This run" only ignores the word until the program exist, while
+"Permanently" saves the word in the ignore file specified in the
+configuration.
+
+=item Add to list
+
+Adds the word typed in the text box beside "Add replacement" to the
+selected table file and re-runs the replacement on the current line.
+
+The filtering for which table files are actually shown here is
+currently a bit rudimentary. First, all paths that are used in the
+C<table> statements in the config are put into a list. Then, the
+paths corresponding to any tables used for word endings in the
+C<expand> statements are removed from the list, and that is what
+is shown in this window. The reason for removing those paths from
+the list is that it gets somewhat confusing when all the tables
+that are only used for word endings are also in the list, and it
+is very unlikely that an unknown word would need to be written to
+one of those files. If necessary, the word can always be added
+manually and the config reloaded. Note also that only actual
+table paths are shown here, not the tables themselves - this is
+not necessarily a one-to-one mapping since new tables can be
+generated with the C<expand> statements.
+
+=item Reload config
+
+Reload the configuration file along with all tables an re-runs the
+replacement on the current line. Note that this can take a short
+while since the entire word database has to be reloaded.
+
+=item Stop processing
+
+Prints the current line number to the terminal and exits the program.
+
+=back
+
+=head1 CONFIGURATION
+
+These are the commands accepted in the configuration file.
+Any parameters in square brackets are optional.
+
+The C<match>, C<matchignore>, and C<replace> commands are executed in
+the order they are specified, except that all C<replace> commands within
+the same group are replaced together.
+
+Note that any duplicate words found will cause the user to be prompted
+to choose one option every time the word is replaced in the input text.
+
+=over 8
+
+=item B<split> <regex string>
+
+Sets the RegEx string to be used for splitting words. This is only used
+for splitting the words which couldn't be replaced after all replacement
+has been done, before prompting the user for unknown words.
+
+Note that C<split> should probably always contain at least C<\n>, since
+otherwise all of the newlines will be marked as unknown words. Usually,
+this will be included anyways through C<\s>.
+
+Note also that C<split> should probably include the C<+> RegEx-quantifier
+since that allows the splitting function in the end to ignore several
+splitting characters right after each other (e.g. several spaces) in one
+go instead of splitting the string again for every single one of them.
+This shouldn't actually make any difference functionality-wise, though.
+
+B<Default:> \s (all whitespace)
+
+=item B<beforeword> <regex string>
+
+Sets the RegEx string to be matched before a word if C<beginword> is set.
+
+B<Default:> \s
+
+=item B<afterword> <regex string>
+
+Sets the RegEx string to be matched after a word if C<endword> is set.
+
+Note that C<afterword> should probably always contain at least C<\n>,
+since otherwise words with C<endword> set will not be matched at the
+end of a line.
+
+C<beforeword> and C<afterword> will often be exactly the same, but
+they are left as separate options in case more fine-tuning is needed.
+
+B<Default:> \s
+
+=item B<tablesep> <string>
+
+Sets the separator used to split the lines in the table files into the
+original and replacement word.
+
+B<Default:> Tab
+
+=item B<choicesep> <string>
+
+Sets the separator used to split replacement words into multiple choices for
+prompting the user.
+
+B<Default:> $
+
+=item B<ignore> <filename>
+
+Sets the file of words to ignore.
+
+This has to be set even if the file is just empty because the user can
+add words to it from the unknown word window.
+
+=item B<table> <table identifier> <filename>
+
+Load the table from C<< <filename> >>, making it available for later use in the
+C<expand> and C<replace> commands using the identifier C<< <table identifier> >>.
+
+Note that if C<< <filename> >> is not an absolute path, it is taken to be relative
+to the location of the configuration file.
+
+The table files simply consist of C<tablesep>-separated values, with the word in the
+original language first and the replacement word second. The replacement word
+can optionally have several parts separated by C<choicesep>, which will cause the
+user to be prompted to choose one of the options.
+
+=item B<expand> <table identifier> <new table identifier> <word ending table> [noroot]
+
+Expand the table C<< <table identifier> >>, i.e. generate all the word forms using
+the word endings in C<< <word ending table> >>, saving the result as a table with the
+identifier C<< <new table identifier> >>. If C<same> is specified as C<< <new table
+identifier> >>, the original C<< <table identifier> >> is used instead.
+
+If C<noroot> is set, the root forms of the words are not kept.
+
+If the replacement for a word ending contains C<choicesep>, it is split and each part
+is combined with the root form separately and the user is prompted to choose one of
+the options later. it is thus possible to allow multiple choices for the ending if
+there is a distinction in the replacement language but not in the source language.
+Note that each of the root words is also split into its choices (if necessary)
+during the expanding, so it is possible to use C<choicesep> in both the endings
+and root words.
+
+Note that the paths of all tables used for <word ending table> are removed from the
+list of paths that is later shown in the unknown word window. See L</"UNKNOWN WORD
+WINDOW"> for details.
+
+=item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal]
+
+Perform a RegEx match using the given C<< <regex string> >>, replacing it with
+C<< <replacement string> >>. Note that the replacement cannot contain any RegEx
+(e.g. groups) in it. C<beginword> and C<endword> specify whether the match must
+be at the beginning or ending of a word, respectively, using the RegEx specified
+in C<beforeword> and C<afterword>. If C<nofinal> is set, the string is not marked
+as transliterated after the replacement, allowing it to be modified by subsequent
+C<match> or C<replace> commands.
+
+=item B<matchignore> <regex string> [beginword] [endword]
+
+Performs a RegEx match in the same manner as C<match>, except that the original
+match is used as the replacement instead of specifying a replacement string.
+
+=item B<group> [beginword] [endword]
+
+Begins a replacement group. All C<replace> commands must occur between C<group>
+and C<endgroup>, since they are then grouped together and replaced in one go.
+C<beginword> and C<endword> act in the same way as specified for C<match> and
+apply to all C<replace> statements in this group.
+
+=item B<replace> <table identifier>
+
+Replace all words in the table with the identifier C<< <table identifier> >>,
+using the C<beginword> and C<endword> settings specified by the current group.
+
+Note that a table must have been loaded (or generated using C<expand>)
+before being used in a C<replace> statement.
+
+=item B<endgroup>
+
+End a replacement group.
+
+=back
+
+=head1 SEE ALSO
+
+perlre, perlretut
+
+=head1 LICENSE
+
+Copyright (c) 2019, 2020 lumidify <nobody[at]lumidify.org>
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+=cut