transliterate

Transliteration engine
git clone git://lumidify.org/git/transliterate.git
Log | Files | Refs | README

commit 410d20484d0ceda89cfcad455b727bac02d071d9
Author: lumidify <nobody@lumidify.org>
Date:   Thu, 26 Mar 2020 07:37:37 +0100

Copy data from old repository

Diffstat:
A.gitignore | 1+
AREADME | 5+++++
Atests/alltests.sh | 4++++
Atests/runtest.sh | 13+++++++++++++
Atests/test1/config | 15+++++++++++++++
Atests/test1/descr.txt | 1+
Atests/test1/err.txt | 6++++++
Atests/test1/expected.txt | 6++++++
Atests/test1/input.txt | 6++++++
Atests/test2/config | 19+++++++++++++++++++
Atests/test2/descr.txt | 1+
Atests/test2/err.txt | 3+++
Atests/test2/expected.txt | 2++
Atests/test2/input.txt | 2++
Atransliterate.pl | 1605+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
15 files changed, 1689 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1 @@ +data diff --git a/README b/README @@ -0,0 +1,5 @@ +See the perldoc in transliterate.pl for documentation (run perldoc -F transliterate.pl). + +The git history on this repository is not complete because the original +repository contained a lot of private information that I didn't want to +make available here. diff --git a/tests/alltests.sh b/tests/alltests.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +./runtest.sh test1 +./runtest.sh test2 diff --git a/tests/runtest.sh b/tests/runtest.sh @@ -0,0 +1,13 @@ +#!/bin/sh + +../transliterate.pl --force --config "$1/config" --output runtest.txt --nounknowns --nochoices --debug "$1/input.txt" > runtest_err.txt 2>&1 +diff "$1/expected.txt" runtest.txt > /dev/null +err1=$? +diff "$1/err.txt" runtest_err.txt > /dev/null +if [ $? -eq 0 -a $err1 -eq 0 ]; then + echo "OK `cat $1/descr.txt`" +else + echo "FAILED `cat $1/descr.txt`" +fi +rm runtest.txt +rm runtest_err.txt diff --git a/tests/test1/config b/tests/test1/config @@ -0,0 +1,15 @@ +split "[ \n]" +beforeword " " +afterword "[ \n]" + +ignore "../data/ignore.txt" +table words "../data/words.txt" +table endings "../data/endings.txt" + +expand words same endings + +match "\d+" "num_replaced" beginword + +group beginword endword +replace words +endgroup diff --git a/tests/test1/descr.txt b/tests/test1/descr.txt @@ -0,0 +1 @@ +Basic test diff --git a/tests/test1/err.txt b/tests/test1/err.txt @@ -0,0 +1,6 @@ +Unknown word: "word20" +Unknown word: "word01231" +Word "word0_replaced$word0_replaced2" with 2 word choices. +Unknown word: "aword1" +Unknown word: "end3" +Word "word0_replacedend3_replaced$word0_replaced2end3_replaced" with 2 word choices. diff --git a/tests/test1/expected.txt b/tests/test1/expected.txt @@ -0,0 +1,6 @@ +word1_replaced word2_replaced +num_replacedword1_replaced word9_replaced num_replaced word4_replaced +word20 word01231 word0_replaced$word0_replaced2 +aword1 +word1_replacedend1_replaced word0_replacedend3_replaced$word0_replaced2end3_replaced end3 +num_replacedword2_replacedend1_replaced diff --git a/tests/test1/input.txt b/tests/test1/input.txt @@ -0,0 +1,6 @@ +word1 word2 +123word1 word9 123 word4 +word20 word01231 word0 +aword1 +word1end1 word0end3 end3 +432word2end1 diff --git a/tests/test2/config b/tests/test2/config @@ -0,0 +1,19 @@ +split "[ \n]+" +beforeword " " +afterword "[ \n]" +tablesep "," +choicesep "|" + +ignore "../data/ignore.txt" +table words "../data/words1.txt" +table endings "../data/endings_choices.txt" + +expand words other_name endings + +match "-\d" "-r" endword +match "w ord" "word" beginword nofinal +matchignore "\dhi\d" + +group beginword endword +replace other_name +endgroup diff --git a/tests/test2/descr.txt b/tests/test2/descr.txt @@ -0,0 +1 @@ +Different tablesep and linesep; match nofinal; ending choices diff --git a/tests/test2/err.txt b/tests/test2/err.txt @@ -0,0 +1,3 @@ +Word "word1_replacedend1r1|word1_replacedend1r2" with 2 word choices. +Unknown word: "4" +Word "word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2" with 4 word choices. diff --git a/tests/test2/expected.txt b/tests/test2/expected.txt @@ -0,0 +1,2 @@ +5hi3 word9_replaced-r word1_replacedend1r1|word1_replacedend1r2-r +45hi1 word0_replacedend1r1|word0_replacedend1r2|word0_replaced2end1r1|word0_replaced2end1r2 diff --git a/tests/test2/input.txt b/tests/test2/input.txt @@ -0,0 +1,2 @@ +5hi3 w ord9-0 w ord1end1-1 +45hi1 word0end1 diff --git a/transliterate.pl b/transliterate.pl @@ -0,0 +1,1605 @@ +#!/usr/bin/env perl + +# Proudly written using vi (OpenBSD nvi) + +# NOTE: If you're wondering why the error codes used by the functions are so +# inconsistent, go ask my former self + +use strict; +use warnings; +use utf8; +use feature 'unicode_strings'; +use open qw< :encoding(UTF-8) >; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +use Unicode::Normalize; +use Glib qw/TRUE FALSE/; +use Gtk2 '-init'; +use Getopt::Long; +use Pod::Usage; +use Scalar::Util qw(weaken); +use File::Basename qw(dirname); +use File::Spec::Functions qw(rel2abs file_name_is_absolute); + +# takes a string of words separated by '$' and returns a new string in the +# same format with duplicates removed +sub get_unique_words { + my ($word, $config) = @_; + my %tmp; + my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word; + return join $config->{choicesep}, @words_uniq; +} + +# Adds all words in $words to $trie +# Automatically combines duplicate words with "$" inbetween +sub add_to_trie { + my ($table_name, $trie, $words, $args, $config) = @_; + foreach my $word (keys %$words) { + my $cur_node = $trie; + foreach my $char (split //, $word) { + if (!exists($cur_node->{$char})) { + $cur_node->{$char}->{"parent"} = $cur_node; + # This is required to avoid circular references + # (otherwise, the garbage collector doesn't ever + # destroy these nodes, leading to the memory + # consumption growing without restraint if + # "Reload config" is used) + weaken($cur_node->{$char}->{"parent"}); + } + $cur_node = $cur_node->{$char}; + } + if (exists($cur_node->{"final"})) { + if ($args->{"checkduplicates"}) { + warn "WARNING: Duplicate word \"$word\". Last occurrence as " . + "\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " . + "current occurrence as \"$words->{$word}\" in " . + "table \"$table_name\.\n"; + } + $cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config); + } else { + $cur_node->{"final"} = $words->{$word}; + if ($args->{"checkduplicates"}) { + $cur_node->{"table_name"} = $table_name; + } + } + } +} + +# Prompt user when no replacement has been found for a word +# $word is the word that was not found and $context* the context, +# with context*_orig being the original, non-transliterated context. +# $table_paths is a mapping of table paths (here, only the keys, i.e. +# the actual paths, are used) to allow the user to choose a table to +# save a new replacement to. +# $cur_lineno is a display string to show the current line number +# $config_error is an optional flag to specify whether an error +# message should be displayed, informing the user that the config +# could not be loaded (used when "Reload config" is clicked) +# Returns: an array reference containing an action to be taken, +# in the form ["<action name>", <optional args>]. +# See `handle_unknown_word_action` for currently accepted values +sub prompt_unknown_word { + my ($contextl, $contextl_orig, $word, $contextr, $contextr_orig, $table_paths, $cur_lineno, $config_error) = @_; + my $action; + my $stop = 0; + + my $window = Gtk2::Window->new('toplevel'); + $window->signal_connect(delete_event => sub {return FALSE}); + $window->signal_connect(destroy => sub { Gtk2->main_quit; }); + $window->set_border_width(10); + + my $vbox = Gtk2::VBox->new(FALSE, 0); + + my $linelabel = Gtk2::Label->new("Current line: $cur_lineno"); + $vbox->pack_start($linelabel, FALSE, FALSE, 10); + $linelabel->show; + + my $wordlabel = Gtk2::Label->new("Word not found: $word"); + $wordlabel->set_alignment(0.0, 0.0); + $vbox->pack_start($wordlabel, FALSE, FALSE, 10); + $wordlabel->show; + + # Make a text box with the given left and right context and label + # Also creates a button allowing the user to set the currently + # selected text as the word to be replaced - useful when only part + # of the entire word that was not found has to be replaced + my $make_context_box = sub { + my ($ctxtl, $ctxtr, $lbl) = @_; + my $hbox = Gtk2::HBox->new(FALSE, 0); + my $label = Gtk2::Label->new($lbl); + my $text = Gtk2::TextView->new; + my $buffer = $text->get_buffer(); + $buffer->set_text($ctxtr); + my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow"); + my $start = $buffer->get_start_iter(); + $buffer->insert_with_tags($start, $word, $highlight); + $start = $buffer->get_start_iter(); + $buffer->insert($start, $ctxtl); + my $button = Gtk2::Button->new("Use selection as word"); + $button->signal_connect( + clicked => sub { + if (my ($start, $end) = $buffer->get_selection_bounds()) { + $word = $buffer->get_text($start, $end, FALSE); + $wordlabel->set_text("Selected: $word"); + } + }, $window); + $hbox->pack_start($label, FALSE, FALSE, 0); + $hbox->pack_start($text, FALSE, FALSE, 10); + $label->show; + $text->show; + $vbox->pack_start($hbox, FALSE, FALSE, 10); + $hbox->show; + $hbox = Gtk2::HBox->new(FALSE, 0); + $hbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox->show; + }; + $make_context_box->($contextl, $contextr, "Context: "); + $make_context_box->($contextl_orig, $contextr_orig, "Original: "); + + my $hbox = Gtk2::HBox->new(FALSE, 0); + my $label = Gtk2::Label->new("Ignore: "); + $hbox->pack_start($label, FALSE, FALSE, 0); + $label->show; + my $button = Gtk2::Button->new("This run"); + $button->signal_connect( + clicked => sub { + $action = ["ignore", "run", $word]; + $window->destroy; + }, $window); + $hbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + $button = Gtk2::Button->new("Permanently"); + $button->signal_connect( + clicked => sub { + $action = ["ignore", "permanent", $word]; + $window->destroy; + }, $window); + $hbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + $vbox->pack_start($hbox, FALSE, FALSE, 10); + $hbox->show; + + $hbox = Gtk2::HBox->new(FALSE, 0); + $label = Gtk2::Label->new("Add to list: "); + $hbox->pack_start($label, FALSE, FALSE, 0); + $label->show; + my $path_list = Gtk2::ComboBox->new_text; + foreach my $path (keys %$table_paths) { + $path_list->append_text($path); + } + $hbox->pack_start($path_list, FALSE, FALSE, 0); + $path_list->show; + $vbox->pack_start($hbox, FALSE, FALSE, 10); + $hbox->show; + + $hbox = Gtk2::HBox->new(FALSE, 0); + $label = Gtk2::Label->new("Replacement: "); + $hbox->pack_start($label, FALSE, FALSE, 0); + $label->show; + my $replace_entry = Gtk2::Entry->new; + $hbox->pack_start($replace_entry, FALSE, FALSE, 0); + $replace_entry->show; + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox->show; + + $hbox = Gtk2::HBox->new(FALSE, 0); + $button = Gtk2::Button->new("Add replacement"); + $button->signal_connect( + clicked => sub { + if ($path_list->get_active != -1) { + $action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text]; + $window->destroy; + } + }, $window); + $hbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox->show; + + $hbox = Gtk2::HBox->new(FALSE, 0); + $button = Gtk2::Button->new("Stop processing"); + $button->signal_connect( + clicked => sub { + $stop = 1; + $window->destroy; + }, $window); + $hbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox->show; + + $hbox = Gtk2::HBox->new(FALSE, 0); + $button = Gtk2::Button->new("Reload config"); + $button->signal_connect( + clicked => sub { + $action = ["reload"]; + $window->destroy; + }, $window); + $hbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + + if ($config_error) { + $label = Gtk2::Label->new("Error loading config; see terminal output for details"); + $hbox->pack_start($label, FALSE, FALSE, 0); + $label->show; + } + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox->show; + + $window->add($vbox); + $vbox->show; + $window->show; + Gtk2->main; + + if ($stop) { + die "Processing stopped at line $cur_lineno"; + } + if (!$action) { + # This action isn't actually handled - it doesn't make much sense, + # but at least nothing breaks when the window is closed without + # selecting an action + $action = ["ignore", "once", $word]; + } + return $action; +} + +# Prompt the user when a word has multiple replacement options (separated by $) +# $cur_lineno - display string to show the current line number +sub prompt_choose_word { + my ($substrings, $cur_lineno, $config) = @_; + + # make a list of all substrings that contain multiple word options + my @replacements; + foreach (0..$#$substrings) { + if ($substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) { + # Format of the elements in @replacements: + # [<id of substrings in $substrings>, <replacement word>, <original string>] + push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]]; + } + } + # no substrings have multiple options + return if (!@replacements); + + my $stop = 0; + my $cur_replacement = 0; + + my $window = Gtk2::Window->new('toplevel'); + $window->signal_connect(delete_event => sub {return FALSE}); + $window->signal_connect(destroy => sub { Gtk2->main_quit; }); + $window->set_border_width(10); + + my $vbox = Gtk2::VBox->new(FALSE, 0); + + my $linelabel = Gtk2::Label->new("Current line: $cur_lineno"); + $vbox->pack_start($linelabel, FALSE, FALSE, 0); + $linelabel->show; + + my $wordlabel = Gtk2::Label->new(""); + $wordlabel->set_alignment(0.0, 0.0); + $vbox->pack_start($wordlabel, FALSE, FALSE, 0); + $wordlabel->show; + + my $undo = Gtk2::Button->new("Undo"); + $vbox->pack_start($undo, FALSE, FALSE, 0); + $undo->show; + $undo->set_sensitive(FALSE); + + my $button_vbox = Gtk2::VBox->new(FALSE, 0); + $vbox->pack_start($button_vbox, FALSE, FALSE, 0); + $button_vbox->show; + + my $accept = Gtk2::Button->new("Accept changes?"); + $vbox->pack_start($accept, FALSE, FALSE, 0); + + my $hbox = Gtk2::HBox->new(FALSE, 0); + my $label = Gtk2::Label->new("Context: "); + my $text = Gtk2::TextView->new; + my $buffer = $text->get_buffer(); + my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow"); + $text->set_editable(FALSE); + $hbox->pack_start($label, FALSE, FALSE, 0); + $hbox->pack_start($text, FALSE, FALSE, 10); + $label->show; + $text->show; + $vbox->pack_start($hbox, FALSE, FALSE, 10); + $hbox->show; + + $hbox = Gtk2::HBox->new(FALSE, 0); + my $stop_button = Gtk2::Button->new("Stop processing"); + $hbox->pack_start($stop_button, FALSE, FALSE, 0); + $stop_button->show; + $vbox->pack_start($hbox, FALSE, FALSE, 0); + $hbox->show; + + $window->add($vbox); + $vbox->show; + $window->show; + + # generate the context to the left and to the right of the current word being replaced + my $get_context = sub { + my ($contextl, $contextr) = ("", ""); + my $tmp_replacement = 0; + foreach (0..$#$substrings) { + my $word = $substrings->[$_]->[1]; + if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) { + $word = $replacements[$tmp_replacement]->[1]; + $tmp_replacement++; + } + # When nothing is left to replace, the entire string is in $contextl + if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) { + $contextl .= $word; + } elsif ($_ > $replacements[$cur_replacement]->[0]) { + $contextr .= $word; + } + } + return ($contextl, $contextr); + }; + + # fill the text buffer with the context and current word, highlighting the word + # if $cur_replacement is after the end of @replacements, don't highlight anythiing + # (this happens when all words have been replaced and the user only needs to accept the changes) + my $fill_text_buffer = sub { + my $start = $buffer->get_start_iter(); + my $end = $buffer->get_end_iter(); + $buffer->delete($start, $end); + my ($contextl, $contextr) = $get_context->(); + $buffer->set_text($contextr); + if ($cur_replacement <= $#replacements) { + $start = $buffer->get_start_iter(); + $buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight); + } + $start = $buffer->get_start_iter(); + $buffer->insert($start, $contextl); + }; + + # fill $button_vbox with the word options for the current word + my $fill_button_vbox; + $fill_button_vbox = sub { + $button_vbox->foreach(sub {my $child = shift; $child->destroy();}); + my $word = $replacements[$cur_replacement]->[1]; + $wordlabel->set_text("Word \"$word\" has multiple replacement options:"); + my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1]; + foreach my $word_choice (@choices) { + my $button = Gtk2::Button->new($word_choice); + $button->signal_connect( + clicked => sub { + $replacements[$cur_replacement]->[1] = $word_choice; + $undo->set_sensitive(TRUE); + $cur_replacement++; + $fill_text_buffer->(); + if ($cur_replacement > $#replacements) { + $button_vbox->foreach(sub {my $child = shift; $child->destroy();}); + $accept->show; + $accept->grab_focus; + $wordlabel->set_text(""); + return; + } + $fill_button_vbox->(); + }, $window); + $button_vbox->pack_start($button, FALSE, FALSE, 0); + $button->show; + } + }; + + $undo->signal_connect( + clicked => sub { + if ($cur_replacement > 0) { + $cur_replacement--; + if ($cur_replacement == 0) { + $undo->set_sensitive(FALSE); + } + $replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2]; + $fill_button_vbox->(); + $fill_text_buffer->(); + $accept->hide; + my $word = $replacements[$cur_replacement]->[1]; + $wordlabel->set_text("Word \"$word\" has multiple replacement options:"); + } + }, $window); + + $accept->signal_connect( + clicked => sub { + # write the changes to the original $substrings + foreach (@replacements) { + $substrings->[$_->[0]]->[1] = $_->[1]; + } + $window->destroy; + }, $window); + + $stop_button->signal_connect( + clicked => sub { + $stop = 1; + $window->destroy; + }, $window); + + $fill_button_vbox->(); + $fill_text_buffer->(); + + Gtk2->main; + if ($stop) { + die "Processing stopped at line $cur_lineno"; + } +} + +my $ID = 0; +my $STRING = 1; + +# Parse the configuration file into data type (currently only ID and STRING) +sub parse_config { + my $f = shift; + my $fh; + if (!open($fh, "<", $f)) { + warn "Can't open config file \"$f\"!\n"; + return undef; + } + my $line; + my @commands; + my $state = 0; + my $IN_ID = 1; + my $IN_STR = 2; + my $cur_val = ""; + while ($line = <$fh>) { + chomp($line); + $state = 0; + push(@commands, []); + foreach my $char (split(//, $line)) { + if ($char eq "#") { + last; + } elsif ($char eq '"') { + if ($state & $IN_STR) { + push(@{$commands[-1]}, {type => $STRING, value => $cur_val}); + $cur_val = ""; + $state &= ~$IN_STR; + } else { + $cur_val = ""; + $state |= $IN_STR; + } + } elsif ($char eq " ") { + if ($state & $IN_ID) { + push(@{$commands[-1]}, {type => $ID, value => $cur_val}); + $state &= ~$IN_ID; + $cur_val = ""; + } elsif ($state) { + $cur_val .= $char; + } + } else { + if (!$state) { + $state |= $IN_ID; + } + $cur_val .= $char; + } + } + if ($state & $IN_STR) { + warn "ERROR: Unterminated string in config:\n$line"; + return undef; + } elsif ($cur_val) { + push(@{$commands[-1]}, {type => $ID, value => $cur_val}); + $cur_val = ""; + } + # FIXME: check if this works + if ($#{$commands[-1]} == -1) { + pop(@commands); + } + } + close($fh); + + return \@commands; +} + +# Load a file of replacement words into a hash table +sub load_table { + my ($filename, $args, $config) = @_; + my $fh; + # if the paths are relative, find their absolute location based + # on the location of the config file + if (!file_name_is_absolute $filename) { + my $config_dir = dirname $args->{"config"}; + $filename = rel2abs($filename, $config_dir); + } + if (!open($fh, "<", $filename)) { + warn "Can't open table file \"$filename\"!\n"; + return undef; + } + my $line; + my @words; + my %table; + while ($line = <$fh>) { + chomp $line; + next if (!$line); + @words = split(/\Q$config->{tablesep}\E/, $line); + if (@words != 2) { + warn "ERROR: Malformed line in file \"$filename\":\n$line\n"; + close $fh; + return undef; + } + my $key = NFD($words[0]); + if (exists $table{$key}) { + if ($args->{"checkduplicates"}) { + warn "WARNING: Duplicate word in file \"$filename\": " . + "\"$key\", with replacement \"$words[1]\", " . + "already exists with replacement \"$table{$key}\".\n"; + } + $table{$key} = get_unique_words($table{$key} . $config->{choicesep} . $words[1], $config); + } else { + $table{$key} = $words[1]; + } + } + close $fh; + return \%table; +} + +# Load table for words to ignore - only the keys matter, since there is no replacement +sub load_ignore_table { + my ($filename, $args) = @_; + my $line; + my %table; + if (!file_name_is_absolute $filename) { + my $config_dir = dirname $args->{"config"}; + $filename = rel2abs($filename, $config_dir); + } + my $fh; + if (!open($fh, "<", $filename)) { + warn "Can't open ignore file \"$filename\"!\n"; + return undef;; + } + while ($line = <$fh>) { + chomp $line; + if ($line) { + $table{NFD($line)} = ""; + } + } + close($fh); + return \%table; +} + +# Generate all forms of a word by combining it with endings +# Returns: +# 0 - an error occurred +# 1 - everything's fine +sub expand_table { + my ($cmd, $tables, $config) = @_; + my $table_name = $cmd->[1]->{"value"}; + my $new_table_name = $cmd->[2]->{"value"}; + my $forms_name = $cmd->[3]->{"value"}; + if ($new_table_name eq "same") { + $new_table_name = $table_name; + } + if (!exists($tables->{$table_name})) { + warn "expand_table: table \"$table_name\" does not exist.\n"; + return 0; + } + if (!exists($tables->{$forms_name})) { + warn "expand_table: table \"$forms_name\" does not exist.\n"; + return 0; + } + my $table = $tables->{$table_name}; + my $forms = $tables->{$forms_name}; + + my $noroot = 0; + if ($#$cmd >= 4 && $cmd->[4]->{"value"} eq "noroot") { + $noroot = 1; + } + + my %new_table; + foreach my $word (keys %$table) { + foreach my $ending (keys %$forms) { + # Some words and/or endings have multiple options, separated by $config->{choicesep} + # These must be temporarily separated in order to properly generate the forms + my @word_options; + my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word}); + my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending}); + foreach my $stem_option (@stem_options) { + foreach my $ending_option (@ending_options) { + push(@word_options, $stem_option . $ending_option); + } + } + $new_table{$word . $ending} = join($config->{choicesep}, @word_options); + } + if (!$noroot) { + $new_table{$word} = $table->{$word}; + } + } + $tables->{$new_table_name} = \%new_table; + return 1; +} + +# Check if the number and types of arguments given to a config command are right +# Returns: +# 0 - an error occurred +# 1 - everything's fine +sub check_args { + my ($args, $cmd) = @_; + my $cmd_name = $cmd->[0]->{"value"}; + if ($#$cmd - 1 < $#$args) { + my $err = "ERROR: not enough arguments for command \"$cmd_name\":"; + foreach my $arg (@{$cmd}[1..$#$cmd]) { + $err .= " " . $arg->{"value"} + } + warn "$err\n"; + return 0; + } + my $arg_num = 0; + while ($arg_num <= $#$args) { + if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) { + my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n"; + $err .= "Expected:"; + foreach my $arg_type (@$args) { + $err .= " ID" if ($arg_type == $ID); + $err .= " STRING" if ($arg_type == $STRING); + } + $err .= "\nReceived:"; + foreach my $arg (@{$cmd}[1..$#$cmd]) { + $err .= " ID" if ($arg->{"type"} == $ID); + $err .= " STRING" if ($arg->{"type"} == $STRING); + } + warn "$err\n"; + return 0; + } + $arg_num++; + } + return 1; +} + +# Interpret the config file - load and expand tables, etc. +# $config_list - the list returned by parse_config +sub interpret_config { + my ($config_list, $args) = @_; + my %tables; + my %config; + $config{"table_paths"} = {}; + $config{"ignore"} = ""; + $config{"ignore_words"} = {}; + $config{"split"} = "\\s"; + $config{"beforeword"} = "\\s"; + $config{"afterword"} = "\\s"; + $config{"tablesep"} = "\t"; + $config{"choicesep"} = "\$"; + $config{"replacements"} = []; + my %tmp_table_paths; + my %mandatory_args = ( + "ignore" => [$STRING], + "table" => [$ID], + "expand" => [$ID, $ID, $ID], + "match" => [$STRING, $STRING], + "matchignore" => [$STRING], + "replace" => [$ID], + "split" => [$STRING], + "beforeword" => [$STRING], + "afterword" => [$STRING], + "tablesep" => [$STRING], + "choicesep" => [$STRING], + "group" => [], + "endgroup" => [] + ); + my $in_group = 0; + foreach my $cmd (@$config_list) { + # All load statements must be before expand statements + # All expand, beforeword, and afterword statements must be before replace statements + if ($cmd->[0]->{"type"} == $ID) { + if (!exists($mandatory_args{$cmd->[0]->{"value"}})) { + warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n"; + return undef; + } + if (!check_args($mandatory_args{$cmd->[0]->{"value"}}, $cmd)) { + return undef; + } + if ($cmd->[0]->{"value"} eq "table") { + my $table = load_table $cmd->[2]->{"value"}, $args, \%config; + if (!$table) { + return undef; + } + $tables{$cmd->[1]->{"value"}} = $table; + $tmp_table_paths{$cmd->[1]->{"value"}} = $cmd->[2]->{"value"}; + } elsif ($cmd->[0]->{"value"} eq "expand") { + # FIXME: need to handle table paths when a new table name is used for the expansion + if (!expand_table($cmd, \%tables, \%config)) { + return undef; + } + } elsif ($cmd->[0]->{"value"} eq "group") { + if ($in_group) { + warn "ERROR: New group started without ending last one in config\n"; + return undef; + } + push @{$config{"replacements"}}, {"type" => "group", "words" => {}}; + for (1..$#$cmd) { + $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; + } + $in_group = 1; + } elsif ($cmd->[0]->{"value"} eq "endgroup") { + if (!$in_group) { + warn "ERROR: endgroup command called while not in group\n"; + return undef; + } + $in_group = 0; + } elsif ($cmd->[0]->{"value"} eq "match") { + if ($in_group) { + warn "ERROR: match command is invalid inside group\n"; + return undef; + } + push @{$config{"replacements"}}, { + "type" => "match", + "search" => NFD($cmd->[1]->{"value"}), + "replace" => $cmd->[2]->{"value"}}; + for (3..$#$cmd) { + $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; + } + } elsif ($cmd->[0]->{"value"} eq "matchignore") { + if ($in_group) { + warn "ERROR: matchignore command is invalid inside group\n"; + return undef; + } + push @{$config{"replacements"}}, {"type" => "match", "search" => NFD($cmd->[1]->{"value"})}; + for (2..$#$cmd) { + $config{"replacements"}->[-1]->{$cmd->[$_]->{"value"}} = 1; + } + } elsif ($cmd->[0]->{"value"} eq "replace") { + if (!$in_group) { + warn "ERROR: replace command called while not in group\n"; + return undef; + } + my $table = $cmd->[1]->{"value"}; + if (!exists($tables{$table})) { + warn "ERROR: nonexistent table \"$table\" in replace statement.\n"; + return undef; + } + if (exists($tmp_table_paths{$table})) { + $config{"table_paths"}->{$tmp_table_paths{$table}} = [$#{$config{"replacements"}}, $table]; + } + # Note: we don't need to check if $table{"choicesep"} was defined + # here since we can't ever get this far without first having + # loaded a table anyways + add_to_trie($table, $config{"replacements"}->[-1]->{"words"}, $tables{$table}, $args, \%config); + } elsif ($cmd->[0]->{"value"} eq "split") { + $config{"split"} = $cmd->[1]->{"value"}; + } elsif ($cmd->[0]->{"value"} eq "beforeword") { + $config{"beforeword"} = $cmd->[1]->{"value"}; + } elsif ($cmd->[0]->{"value"} eq "afterword") { + $config{"afterword"} = $cmd->[1]->{"value"}; + } elsif ($cmd->[0]->{"value"} eq "tablesep") { + $config{"tablesep"} = $cmd->[1]->{"value"}; + } elsif ($cmd->[0]->{"value"} eq "choicesep") { + $config{"choicesep"} = $cmd->[1]->{"value"}; + } elsif ($cmd->[0]->{"value"} eq "ignore") { + $config{"ignore"} = $cmd->[1]->{"value"}; + my $table = load_ignore_table $cmd->[1]->{"value"}, $args; + if (!$table) { + return undef; + } + $config{"ignore_words"} = $table; + } else { + warn "ERROR: unknown command \"" . $cmd->[0]->{"value"} . "\" in config.\n"; + return undef; + } + } else { + my $err = "ERROR: line does not start with command:\n"; + foreach my $cmd_part (@$cmd) { + $err .= $cmd_part->{"value"}; + } + warn "$err\n"; + return undef; + } + } + if ($in_group) { + warn "ERROR: unclosed group in config\n"; + return undef; + } + if (!$config{"ignore"}) { + warn "ERROR: no file of words to ignore specified.\n"; + return undef; + } + return \%config; +} + +# load the config file +# Returns: +# the config hash or undef if an error occurred +sub load_config { + my $args = shift; + my $config_list = parse_config($args->{"config"}); + if (!$config_list) { + return undef; + } + return interpret_config $config_list, $args; +} + +# Handle the action returned by `prompt_unknown_word` +# $config - the current program config +# $args - the command line arguments +# Currently accepted values for $action: +# ["ignore", "run", $word] - only ignore $word for the rest of this run +# ["ignore", "permanent", $word] - write $word to the permanent ignore file +# ["add", $word, $replace_word, $table_path] - add $word to the table +# corresponding to $table_path with $replace_word as its replacement. Note that +# only tables directly corresponding to paths work here - tables that only +# were created through "expand" in the config aren't ever shown separately +# in `prompt_unknown_word` +# ["reload"] - reload the configuration file +# Returns: +# 0 - nothing needs to be done +# 1 - the current line needs to be re-transliterated with the new config +# 2 - an error occurred while reloading the config +sub handle_unknown_word_action { + my ($action, $config, $args) = @_; + if ($action->[0] eq "ignore") { + $config->{"ignore_words"}->{$action->[2]} = ""; + if ($action->[1] eq "permanent") { + my $fh; + if (!open($fh, ">>", $config->{"ignore"})) { + warn "ERROR: Can't open ignore file for appending.\n"; + return 1; + } + print($fh $action->[2] . "\n"); + close($fh); + } elsif ($action->[1] eq "run") { + # Print to error file if ignore isn't permanent + return 0 if ($args->{"errors"} eq ""); + my $fh; + if (!open($fh, ">>", $args->{"errors"})) { + warn "ERROR: Can't open error file \"$args->{errors}\".\n"; + return 0; + } + print($fh $action->[2] . "\n"); + close($fh); + } + return 0; + } elsif ($action->[0] eq "add") { + my $table_path = $action->[3]; + my $word = $action->[1]; + my $replace_word = $action->[2]; + my $fh; + if (!open($fh, ">>", $table_path)) { + warn "ERROR: Can't open table file \"$table_path\" for appending.\n"; + return 1; + } + print($fh $word . "," . $replace_word . "\n"); + close($fh); + my $replacement_id = $config->{"table_paths"}->{$table_path}->[0]; + my $trie = $config->{"replacements"}->[$replacement_id]->{"words"}; + add_to_trie($config->{"table_paths"}->{$table_path}->[1], $trie, {$word => $replace_word}); + return 1; + } elsif ($action->[0] eq "reload") { + my $new_config = load_config $args; + if ($new_config) { + %$config = %$new_config; + return 1; + } else { + return 2; + } + } +} + +# Split $substrings into single words based on the "split" option +# in $config. +# $substrings can already be split at this point; only the +# ones that haven't been transliterated yet are modified +sub split_words { + my ($config, $substrings) = @_; + my @substrings_new; + my $split_re = qr/($config->{"split"})/; + foreach my $cur_substr (@$substrings) { + if ($cur_substr->[0] == 1) { + push(@substrings_new, $cur_substr); + next; + } + + my @words = split(/$split_re/, $cur_substr->[1]); + for my $i (0..$#words) { + # Word is not delimiter + # Split produces an empty field at the beginning if the string + # starts with the delimiter + if ($i % 2 == 0) { + push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne ''); + } else { + # Delimiters can count as already replaced + push(@substrings_new, [1, $words[$i], $words[$i]]); + } + } + } + @$substrings = @substrings_new; +} + +# small helper function to add a untransliterated string to the last substring +# if that is not transliterated yet, or push it onto @$substrings otherwise +# -> used to keep all untransliterated text in one piece +# since this is also used for the "nofinal" attribute on "match", it takes +# an original and replaced string (since, when using "match" and "nofinal", +# the original string was replaced, but is still marked as unknown) +sub push_unknown { + my ($substrings, $orig, $replaced) = @_; + $replaced //= $orig; + if (@$substrings && !$substrings->[-1]->[0]) { + $substrings->[-1]->[1] .= $replaced; + $substrings->[-1]->[2] .= $orig; + } else { + push(@$substrings, [0, $replaced, $orig]); + } +} + +# Replace a word in $substrings based on $replace_config using regex +# $replace_config->{"search"} is the word to replace +# $replace_config->{"replace"} is the replacement word +# if $replace_config->{"replace"} is undefined, just splits +# $substrings at the the match and marks that the match has +# been transliterated - currently used for "matchignore" +# $replace_config->{"beginword"}, $replace_config->{"afterword"} - +# specifies if the match is only valid when $config->{"beforeword"} +# or $config->{"afterword"} occur before or after it, respectively +sub replace_match { + my ($config, $replace_config, $substrings) = @_; + my $beginword = exists $replace_config->{"beginword"}; + my $endword = exists $replace_config->{"endword"}; + my $fullword = $beginword && $endword; + my $beforeword = $config->{"beforeword"}; + my $afterword = $config->{"afterword"}; + my $word = $replace_config->{"search"}; + my $replace_word = $replace_config->{"replace"}; + if ($fullword) { + $word = qr/(\A|$beforeword)$word(\z|$afterword)/; + } elsif ($beginword) { + $word = qr/(\A|$beforeword)$word/; + } elsif ($endword) { + $word = qr/$word(\z|$afterword)/; + } else { + $word = qr/$word/; + } + + my @substrings_new; + # only modify $substrings at all if the word was found + my $found_word = 0; + my $last_idx; + # @substrings_new is only used if needed to improve efficiency + foreach my $i (0..$#$substrings) { + if ($substrings->[$i]->[0]) { + # FIXME: is there a way to make it more efficient by keeping the old array? + # This is a major bottleneck + # Note: the above statement *may* be a bit exaggerated + if ($found_word) { + push(@substrings_new, $substrings->[$i]); + } + next; + } + $last_idx = 0; + my $i0 = 0; + my $i1 = 0; + while ($substrings->[$i]->[1] =~ m/$word/g) { + if (!$found_word) { + $found_word = 1; + if ($i != 0) { + push(@substrings_new, @{$substrings}[0..$i-1]); + } + } + # This mess is needed to reliably match $beforeword and $afterword and put the captured + # "splitting" characters back into the text. This would be much easier just using + # a lookbehind and lookahead, but I couldn't find a way to also match beginning and + # end of string that way. + $i0 = $-[0]; + $i1 = $+[0]; + if ($fullword) { + $i0 += length($1); + $i1 -= length($2); + # pos need to be decreased so that matches still work right next to each other + pos($substrings->[$i]->[1]) -= length($2); + } elsif ($beginword) { + $i0 += length($1); + } elsif ($endword) { + $i1 -= length($1); + pos($substrings->[$i]->[1]) -= length($1); + } + if ($last_idx != $i0) { + my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx); + push_unknown \@substrings_new, $unknown; + } + my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0); + my $replace_str = $replace_word // $orig_str; + if ($replace_config->{"nofinal"}) { + push_unknown \@substrings_new, $orig_str, $replace_str; + } else { + push(@substrings_new, [1, $replace_str, $orig_str]); + } + $last_idx = $i1; + } + if ($last_idx < length($substrings->[$i]->[1]) && $found_word) { + my $unknown = substr($substrings->[$i]->[1], $last_idx); + push_unknown \@substrings_new, $unknown; + } + } + if ($found_word) { + @$substrings = @substrings_new; + } +} + +# Replace a group, i.e. replace all the words in a trie +# $replace_config->{"words"} - the root node of the trie +# $replace_config->{"beginword"}, $replace_config->{"endword"} - +# same as in `replace_match` +sub replace_group { + my ($config, $replace_config, $substrings) = @_; + my @substrings_new; + # Recurse backwords towards the root node of the trie to find the first + # node with a key "final" which satisfies the ending condition (if "endword" is set) + # Returns the id *after* the last match and the node that was found + # with the key "final" (or undef, if nothing matched) + my $find_final = sub { + my ($i, $tmp_cur_node, $s) = @_; + do { + my $after_ch = substr($s->[1], $i, 1); + if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"endword"}) || + $after_ch eq "" || $after_ch =~ $config->{"afterword"})) { + return ($i, $tmp_cur_node); + } + $i--; + } while ($tmp_cur_node = $tmp_cur_node->{"parent"}); + # none of the points were appropriate for breaking the word, so + # $tmp_cur_node now points to the nonexistent parent node of the + # root node + return ($i, undef); + }; + foreach my $s (@$substrings) { + if ($s->[0]) { + push(@substrings_new, $s); + next; + } + my $cur_node = $replace_config->{"words"}; + my $start_i = 0; + my $i = 0; + # This deliberately goes off the end of the string! $cur_node is always "one behind" $i + # since the node is only advanced in the iteration *after* $i has increased, meaning that + # $i has to already be after the end of the string for the first if statement to definitely + # fail, causing the elsif statement to handle that case + while ($i <= length($s->[1])) { + # This works even when $i is one index after the end of the string - it just returns "" then + my $ch = substr($s->[1], $i, 1); + if (exists $cur_node->{$ch}) { + if ($cur_node == $replace_config->{"words"}) { + my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : ""; + if (exists($replace_config->{"beginword"}) && + $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) { + push_unknown \@substrings_new, $ch; + $i++; + next; + } + $start_i = $i; + } + $cur_node = $cur_node->{$ch}; + } elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) { + my $tmp_cur_node = $cur_node; + ($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s); + if (!defined($tmp_cur_node)) { + push_unknown \@substrings_new, substr($s->[1], $i + 1, 1); + $i += 2; + } else { + push(@substrings_new, [1, $tmp_cur_node->{"final"}, substr($s->[1], $start_i, $i-$start_i)]); + } + $cur_node = $replace_config->{"words"}; + next; + } else { + push_unknown \@substrings_new, $ch; + } + $i++; + } + } + @$substrings = @substrings_new; +} + +# Perform all replacements on $line based on $config +# $substrings: array of arrays - each one has three elements: +# first 0 or 1, indicating if the substring following it has already +# been replaced or not (1 means it has been replaced), then the +# transliterated string, and lastly the original string. +# If the first element is 0, the second two elements are obviously same +sub replace_line { + my ($config, $line) = @_; + my $substrings = [[0, $line, $line]]; + foreach my $replacement (@{$config->{"replacements"}}) { + if ($replacement->{"type"} eq "match") { + replace_match($config, $replacement, $substrings); + } elsif ($replacement->{"type"} eq "group") { + replace_group($config, $replacement, $substrings); + } + } + # splits all words at the end so that the splitting characters + # aren't taken as unknown words and the unknown words are (hopefully) + # in better chunks for prompting the user about them + split_words($config, $substrings); + + return $substrings; +} + +# NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST! +# If an ignore word is added which is attached to a word that should have a replacement +# added and just that word is selected to ignore, you never get a chance to add a +# replacement for the other word that it is attached to + +# Handle unknown words +# $substrings - the current substrings with unknown words +# $config - the program config +# $args - the command line args +# $cur_lineno - display string to show the user the current line number +# Returns: +# 1 - the current line must be replaced again with the new config +# 0 - everything's fine +sub get_unknown_words { + my ($substrings, $config, $args, $cur_lineno) = @_; + foreach my $i (0..$#$substrings) { + my $word = $substrings->[$i]; + if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) { + my $contextl = ""; + my $contextl_orig = ""; + foreach my $j (0..$i-1) { + $contextl .= $substrings->[$j]->[1]; + $contextl_orig .= $substrings->[$j]->[2]; + } + my $contextr = ""; + my $contextr_orig = ""; + foreach my $j ($i+1..$#$substrings) { + $contextr .= $substrings->[$j]->[1]; + $contextr_orig .= $substrings->[$j]->[2]; + } + my $action = prompt_unknown_word($contextl, $contextl_orig, + $word->[1], $contextr, $contextr_orig, + $config->{"table_paths"}, "$cur_lineno" + ); + # if $ret == 2, config could not be loaded + # if $ret == 1, line must be redone with new config + my $ret = handle_unknown_word_action($action, $config, $args); + # keep retrying until the user chooses an action which + # didn't throw an error + while ($ret == 2) { + $action = prompt_unknown_word($contextl, $contextl_orig, + $word->[1], $contextr, $contextr_orig, + $config->{"table_paths"}, "$cur_lineno", 1); + $ret = handle_unknown_word_action($action, $config, $args); + } + # re-transliterate the line with the new config + if ($ret == 1) { + return 1; + } + } + } + return 0; +} + +# Main replacement function +# Opens the input file ($args->{"input"}) and writes the transliterated text +# to the file handle $outputfh, prompting the user for unknown words or +# word choices (if those aren't disabled on the command line) +sub replace { + my ($config, $args, $outputfh) = @_; + # Is there *really* no more efficient way to get the total number of lines? + open my $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n"; + my $total_lines = 0; + while (<$fh>) {$total_lines++}; + close $fh; + open $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n"; + while (my $line = <$fh>) { + next if $. < $args->{"start"}; + my $nfd_line = NFD($line); + my $substrings = replace_line($config, $nfd_line); + + if (!$args->{"nounknowns"}) { + # re-transliterate the string if the config was reloaded + while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) { + $substrings = replace_line($config, $nfd_line); + } + } elsif ($args->{"debug"}) { + foreach my $s (@$substrings) { + if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) { + warn "Unknown word: \"$s->[1]\"\n"; + } + } + } + if (!$args->{"nochoices"}) { + prompt_choose_word($substrings, "$./$total_lines", $config); + } elsif ($args->{"debug"}) { + foreach my $s (@$substrings) { + if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) { + my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1]; + warn "Word \"$s->[1]\" with $num_choices word choices.\n"; + } + } + } + + foreach (@$substrings) { + print $outputfh $_->[1]; + } + } + close $fh; +} + +my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => ""); +GetOptions( + \%args, "debug", + "nochoices", "nounknowns", + "force", "start=i", + "output=s", "config=s", + "errors=s", "help", + "checkduplicates") or pod2usage(1); + +pod2usage(-exitval => 0, -verbose => 2) if $args{"help"}; +pod2usage(1) if $#ARGV != 0 && !$args{"checkduplicates"}; + +if (!-f $args{"config"}) { + die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n"; +} +my $config = load_config \%args; +if (!$config) { + die "ERROR: Invalid config\n"; +} +exit 0 if ($args{"checkduplicates"}); + +my $input = $ARGV[0]; +if (!-f $input) { + die "ERROR: input file \"$input\" does not exist or is not a file.\n"; +} +$args{"input"} = $input; + +if (-f $args{"errors"} && !$args{"force"}) { + my $choice = ""; + while ($choice !~ /^[yn]$/) { + print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? "; + $choice = <STDIN>; + chomp $choice; + } + die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y"; +} + +my $outputfh; +if ($args{"output"} eq "") { + warn "WARNING: no output file supplied; printing to STDOUT\n"; + open $outputfh, ">&STDOUT"; +} elsif (-f $args{"output"} && !$args{"force"}) { + my $choice = ""; + while ($choice !~ /^[aoe]$/) { + print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? "; + $choice = <STDIN>; + chomp $choice; + } + if ($choice eq "a") { + open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n"; + } elsif ($choice eq "e") { + die "ERROR: \"$args{output}\" already exists.\n"; + } else { + open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n"; + } +} else { + open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n"; +} + +replace($config, \%args, $outputfh); +close $outputfh; + +__END__ + +=head1 NAME + +transliterate.pl - Transliterate text files + +=head1 SYNOPSIS + +transliterate.pl [options][input file] + +Start the transliteration engine with the given file as input. + +=head1 OPTIONS + +=over 8 + +=item B<< --output <filename> >> + +Sets the output file to print to. + +If the file exists already and C<--force> is not set, the user is asked +if the file should be overwritten or appended to. + +B<Default:> STDOUT (print to terminal) + +=item B<< --config <filename> >> + +Sets the configuration file to use. + +B<Default:> "config" + +=item B<--checkduplicates> + +Prints all duplicate words within single table files and across tables +that are replaced within the same group, then exits the program. + +Note that this simply prints B<all> duplicates, even ones that are +legitimate. When duplicates are found during normal operation of +the program, they are simply combined in exactly the same way as the +regular word choices. + +=item B<--nochoices> + +Disables prompting for the right word when multiple replacement words exist. + +This can be used to "weed out" all the unknown words before +commencing the laborious task of choosing the right word every time +multiple options exist. + +=item B<--nounknowns> + +Disables prompting for the right word when a word is not found in the database. + +This can be used together with B<--nochoices> to perform a quick test of how +well the actual engine is working without having to click through all the +prompts. + +=item B<--debug> + +This option is only useful for automatic testing of the transliteration engine. + +If C<--nochoices> is enabled, each word in the input with multiple choices will +be output, along with the number of choices (can be used to test the proper +functioning of C<choicesep> in the config file). + +If C<--nounknowns> is enabled, each unknown word in the input is printed +(can be used to test that the C<ignore> options are working correctly). + +=item B<--force> + +Always overwrites the output and error file without asking. + +=item B<< --start <line number> >> + +Starts at the given line number instead of the beginning of the file. + +Note: when "Stop processing" is pressed, the current line number is +printed out. This is the current line that was being processed, so it +has not been printed to the output file yet and thus the program must +be resumed at that line, not the one afterwards. + +=item B<< --errors <filename> >> + +Specifies a file to write errors in. Note that this does not refer to +actual errors, but to any words that were temporarily ignored +(i.e. words for which "Ignore: This run" was clicked). + +If no file is specified, nothing is written. If a file is specified +that already exists and C<--force> is not set, the user is prompted +for action. + +=item B<--help> + +Display the full documentation. + +=back + +=head1 DESCRIPTION + +B<transliterate.pl> will read the given input file and transliterate it +based on the given configuration file, prompting the user for action if +a word has multiple replacement options or is not found in the database. + +See L</"CONFIGURATION"> for details on what is possible. + +=head1 WORD CHOICE WINDOW + +The word choice window is opened any time one word has multiple replacement +options and prompts the user to choose one. + +For each word with multiple options, the user must choose the right option +and then press "Accept changes" to finalize the transliteration of the +current line. Before the line is finalized, the user may press "Undo" to +undo any changes on the current line. + +"Stop processing" will exit the program and print the line number that was +currently being processed. + +=head1 UNKNOWN WORD WINDOW + +The unknown word window is opened any time a word could not be replaced. + +Both the context from the original language and the context from the +transliterated version (so far) is shown. If a part of the text is +selected in one of the text boxes and "Use selection as word" is +pressed for the appropriate box, the selected text is used for the +action that is taken subsequently. + +The possible actions are: + +=over 8 + +=item Ignore + +"This run" only ignores the word until the program exist, while +"Permanently" saves the word in the ignore file specified in the +configuration. + +=item Add to list + +Adds the word typed in the text box beside "Add replacement" to the +selected table file and re-runs the replacement on the current line. + +The filtering for which table files are actually shown here is +currently a bit rudimentary. First, all paths that are used in the +C<table> statements in the config are put into a list. Then, the +paths corresponding to any tables used for word endings in the +C<expand> statements are removed from the list, and that is what +is shown in this window. The reason for removing those paths from +the list is that it gets somewhat confusing when all the tables +that are only used for word endings are also in the list, and it +is very unlikely that an unknown word would need to be written to +one of those files. If necessary, the word can always be added +manually and the config reloaded. Note also that only actual +table paths are shown here, not the tables themselves - this is +not necessarily a one-to-one mapping since new tables can be +generated with the C<expand> statements. + +=item Reload config + +Reload the configuration file along with all tables an re-runs the +replacement on the current line. Note that this can take a short +while since the entire word database has to be reloaded. + +=item Stop processing + +Prints the current line number to the terminal and exits the program. + +=back + +=head1 CONFIGURATION + +These are the commands accepted in the configuration file. +Any parameters in square brackets are optional. + +The C<match>, C<matchignore>, and C<replace> commands are executed in +the order they are specified, except that all C<replace> commands within +the same group are replaced together. + +Note that any duplicate words found will cause the user to be prompted +to choose one option every time the word is replaced in the input text. + +=over 8 + +=item B<split> <regex string> + +Sets the RegEx string to be used for splitting words. This is only used +for splitting the words which couldn't be replaced after all replacement +has been done, before prompting the user for unknown words. + +Note that C<split> should probably always contain at least C<\n>, since +otherwise all of the newlines will be marked as unknown words. Usually, +this will be included anyways through C<\s>. + +Note also that C<split> should probably include the C<+> RegEx-quantifier +since that allows the splitting function in the end to ignore several +splitting characters right after each other (e.g. several spaces) in one +go instead of splitting the string again for every single one of them. +This shouldn't actually make any difference functionality-wise, though. + +B<Default:> \s (all whitespace) + +=item B<beforeword> <regex string> + +Sets the RegEx string to be matched before a word if C<beginword> is set. + +B<Default:> \s + +=item B<afterword> <regex string> + +Sets the RegEx string to be matched after a word if C<endword> is set. + +Note that C<afterword> should probably always contain at least C<\n>, +since otherwise words with C<endword> set will not be matched at the +end of a line. + +C<beforeword> and C<afterword> will often be exactly the same, but +they are left as separate options in case more fine-tuning is needed. + +B<Default:> \s + +=item B<tablesep> <string> + +Sets the separator used to split the lines in the table files into the +original and replacement word. + +B<Default:> Tab + +=item B<choicesep> <string> + +Sets the separator used to split replacement words into multiple choices for +prompting the user. + +B<Default:> $ + +=item B<ignore> <filename> + +Sets the file of words to ignore. + +This has to be set even if the file is just empty because the user can +add words to it from the unknown word window. + +=item B<table> <table identifier> <filename> + +Load the table from C<< <filename> >>, making it available for later use in the +C<expand> and C<replace> commands using the identifier C<< <table identifier> >>. + +Note that if C<< <filename> >> is not an absolute path, it is taken to be relative +to the location of the configuration file. + +The table files simply consist of C<tablesep>-separated values, with the word in the +original language first and the replacement word second. The replacement word +can optionally have several parts separated by C<choicesep>, which will cause the +user to be prompted to choose one of the options. + +=item B<expand> <table identifier> <new table identifier> <word ending table> [noroot] + +Expand the table C<< <table identifier> >>, i.e. generate all the word forms using +the word endings in C<< <word ending table> >>, saving the result as a table with the +identifier C<< <new table identifier> >>. If C<same> is specified as C<< <new table +identifier> >>, the original C<< <table identifier> >> is used instead. + +If C<noroot> is set, the root forms of the words are not kept. + +If the replacement for a word ending contains C<choicesep>, it is split and each part +is combined with the root form separately and the user is prompted to choose one of +the options later. it is thus possible to allow multiple choices for the ending if +there is a distinction in the replacement language but not in the source language. +Note that each of the root words is also split into its choices (if necessary) +during the expanding, so it is possible to use C<choicesep> in both the endings +and root words. + +Note that the paths of all tables used for <word ending table> are removed from the +list of paths that is later shown in the unknown word window. See L</"UNKNOWN WORD +WINDOW"> for details. + +=item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal] + +Perform a RegEx match using the given C<< <regex string> >>, replacing it with +C<< <replacement string> >>. Note that the replacement cannot contain any RegEx +(e.g. groups) in it. C<beginword> and C<endword> specify whether the match must +be at the beginning or ending of a word, respectively, using the RegEx specified +in C<beforeword> and C<afterword>. If C<nofinal> is set, the string is not marked +as transliterated after the replacement, allowing it to be modified by subsequent +C<match> or C<replace> commands. + +=item B<matchignore> <regex string> [beginword] [endword] + +Performs a RegEx match in the same manner as C<match>, except that the original +match is used as the replacement instead of specifying a replacement string. + +=item B<group> [beginword] [endword] + +Begins a replacement group. All C<replace> commands must occur between C<group> +and C<endgroup>, since they are then grouped together and replaced in one go. +C<beginword> and C<endword> act in the same way as specified for C<match> and +apply to all C<replace> statements in this group. + +=item B<replace> <table identifier> + +Replace all words in the table with the identifier C<< <table identifier> >>, +using the C<beginword> and C<endword> settings specified by the current group. + +Note that a table must have been loaded (or generated using C<expand>) +before being used in a C<replace> statement. + +=item B<endgroup> + +End a replacement group. + +=back + +=head1 SEE ALSO + +perlre, perlretut + +=head1 LICENSE + +Copyright (c) 2019, 2020 lumidify <nobody[at]lumidify.org> + +Permission to use, copy, modify, and distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +=cut