transliterate.pl (83637B)
1 #!/usr/bin/env perl 2 3 # Proudly written using vi (OpenBSD nvi) 4 5 # NOTE: If you're wondering why the error codes used by the functions are so 6 # inconsistent, go ask my former self 7 8 # NOTE 2: This codebase has grown as new features were needed, so it's quite 9 # ugly now, but I haven't had time to clean it up. 10 11 use strict; 12 use warnings; 13 use utf8; 14 use feature 'unicode_strings'; 15 use open qw< :encoding(UTF-8) >; 16 binmode STDIN, ":encoding(UTF-8)"; 17 binmode STDOUT, ":encoding(UTF-8)"; 18 binmode STDERR, ":encoding(UTF-8)"; 19 use Unicode::Normalize; 20 use Glib qw/TRUE FALSE/; 21 use Gtk3 '-init'; 22 use Getopt::Long; 23 use Pod::Usage; 24 use Scalar::Util qw(weaken); 25 use File::Basename qw(dirname); 26 use File::Spec::Functions qw(rel2abs file_name_is_absolute); 27 28 # takes a string of words separated by '$config->{choicesep}' and returns a new string in the 29 # same format with duplicates removed 30 sub get_unique_words { 31 my ($word, $config) = @_; 32 my %tmp; 33 my @words_uniq = grep !$tmp{$_}++, split /\Q$config->{choicesep}\E/, $word; 34 return join $config->{choicesep}, @words_uniq; 35 } 36 37 # Adds all words in $words to $trie 38 # Automatically combines duplicate words with "$config->{choicesep}" inbetween 39 sub add_to_trie { 40 my ($table_name, $trie, $words, $args, $config, $override) = @_; 41 foreach my $word (keys %$words) { 42 my $cur_node = $trie; 43 foreach my $char (split //, $word) { 44 if (!exists($cur_node->{$char})) { 45 $cur_node->{$char}->{"parent"} = $cur_node; 46 # This is required to avoid circular references 47 # (otherwise, the garbage collector doesn't ever 48 # destroy these nodes, leading to the memory 49 # consumption growing without restraint if 50 # "Reload config" is used) 51 weaken($cur_node->{$char}->{"parent"}); 52 } 53 $cur_node = $cur_node->{$char}; 54 } 55 if (exists($cur_node->{"final"})) { 56 if ($override) { 57 $cur_node->{"final"} = $words->{$word}; 58 next; 59 } 60 if ($args->{"checkduplicates"}) { 61 warn "WARNING: Duplicate word \"$word\". Last occurrence as " . 62 "\"$cur_node->{final}\" in table \"$cur_node->{table_name}\", " . 63 "current occurrence as \"$words->{$word}\" in " . 64 "table \"$table_name\.\n"; 65 } 66 $cur_node->{"final"} = get_unique_words($cur_node->{"final"} . $config->{choicesep} . $words->{$word}, $config); 67 } else { 68 $cur_node->{"final"} = $words->{$word}; 69 if ($args->{"checkduplicates"}) { 70 $cur_node->{"table_name"} = $table_name; 71 } 72 } 73 } 74 } 75 76 # Prompt user when no replacement has been found for a word 77 # $word is the word that was not found and $context* the context, 78 # $word_repl is the replacement word - this is only used when the window is 79 # called from the word choice window, since the original and 80 # replacement aren't the same then 81 # with context*_orig being the original, non-transliterated context. 82 # $table_paths is a mapping of table paths (here, only the keys, i.e. 83 # the actual paths, are used) to allow the user to choose a table to 84 # save a new replacement to. 85 # $cur_lineno is a display string to show the current line number 86 # $config_error is an optional flag to specify whether an error 87 # message should be displayed, informing the user that the config 88 # could not be loaded (used when "Reload config" is clicked) 89 # Returns: an array reference containing an action to be taken, 90 # in the form ["<action name>", <optional args>]. 91 # See `handle_unknown_word_action` for currently accepted values 92 sub prompt_unknown_word { 93 # yes, this function really should take fewer arguments... 94 # it would be better to just pass the substrings and an index 95 my ($contextl, $contextl_orig, $word_repl, $word, $contextr, $contextr_orig, 96 $config, $cur_lineno, $args, $config_error) = @_; 97 my $action; 98 my $stop = 0; 99 100 my $window = Gtk3::Window->new('toplevel'); 101 $window->signal_connect(delete_event => sub {return FALSE}); 102 $window->signal_connect(destroy => sub { Gtk3->main_quit; }); 103 $window->set_border_width(10); 104 105 my $vbox = Gtk3::VBox->new(FALSE, 10); 106 107 my $linelabel = Gtk3::Label->new("Current line: $cur_lineno"); 108 $vbox->pack_start($linelabel, FALSE, FALSE, 0); 109 $linelabel->show; 110 111 my $wordlabel = Gtk3::Label->new("Word not found: $word"); 112 $wordlabel->set_alignment(0.0, 0.0); 113 $vbox->pack_start($wordlabel, FALSE, FALSE, 0); 114 $wordlabel->show; 115 116 # Make a text box with the given left and right context and label 117 # Also creates a button allowing the user to set the currently 118 # selected text as the word to be replaced - useful when only part 119 # of the entire word that was not found has to be replaced 120 my $make_context_box = sub { 121 # look, don't blame me for these miserably named variables... 122 my ($ctxtl, $wrd, $ctxtr, $lbl, $btn_lbl) = @_; 123 my $hbox = Gtk3::HBox->new(FALSE, 5); 124 my $label = Gtk3::Label->new_with_mnemonic($lbl); 125 my $text = Gtk3::TextView->new; 126 $label->set_mnemonic_widget($text); 127 $text->set_wrap_mode("word"); 128 my $buffer = $text->get_buffer(); 129 $buffer->set_text($ctxtr); 130 my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow"); 131 my $start_iter = $buffer->get_start_iter(); 132 $buffer->insert_with_tags($start_iter, $wrd, $highlight); 133 $start_iter = $buffer->get_start_iter(); 134 $buffer->insert($start_iter, $ctxtl); 135 # set cursor position to beginning of word 136 my $cur_iter = $buffer->get_iter_at_offset(length($ctxtl)); 137 $buffer->place_cursor($cur_iter); 138 my $button = Gtk3::Button->new($btn_lbl); 139 $button->signal_connect( 140 clicked => sub { 141 if (my ($start, $end) = $buffer->get_selection_bounds()) { 142 $word = $buffer->get_text($start, $end, FALSE); 143 $wordlabel->set_text("Selected: $word"); 144 } 145 }, $window); 146 $hbox->pack_start($label, FALSE, FALSE, 0); 147 $hbox->pack_start($text, TRUE, TRUE, 0); 148 $vbox->pack_start($hbox, FALSE, FALSE, 0); 149 $hbox = Gtk3::HBox->new(FALSE, 5); 150 $hbox->pack_start($button, FALSE, FALSE, 0); 151 my $complete_text = $ctxtl . $wrd . $ctxtr; 152 $button = Gtk3::Button->new("Reset text"); 153 $button->signal_connect( 154 clicked => sub { 155 $buffer->set_text($complete_text); 156 }, $window); 157 $hbox->pack_start($button, FALSE, FALSE, 0); 158 $vbox->pack_start($hbox, FALSE, FALSE, 0); 159 }; 160 $make_context_box->( 161 $contextl_orig, $word, $contextr_orig, 162 "Ori_ginal context: ", 163 "Use _original selection as word" 164 ); 165 $make_context_box->( 166 $contextl, $word_repl, $contextr, 167 "Transliterated _context: ", 168 "Use _transliterated selection as word" 169 ); 170 171 my $hbox = Gtk3::HBox->new(FALSE, 5); 172 my $label = Gtk3::Label->new("Ignore: "); 173 $hbox->pack_start($label, FALSE, FALSE, 0); 174 my $button = Gtk3::Button->new("Th_is run"); 175 $button->signal_connect( 176 clicked => sub { 177 $action = ["ignore", "run", $word]; 178 $window->destroy; 179 }, $window); 180 $hbox->pack_start($button, FALSE, FALSE, 0); 181 $button = Gtk3::Button->new("_Permanently"); 182 $button->signal_connect( 183 clicked => sub { 184 $action = ["ignore", "permanent", $word]; 185 $window->destroy; 186 }, $window); 187 $hbox->pack_start($button, FALSE, FALSE, 0); 188 $button = Gtk3::Button->new("_Whole line"); 189 $button->signal_connect( 190 clicked => sub { 191 $action = ["ignore", "wholeline"]; 192 $window->destroy; 193 }, $window); 194 $hbox->pack_start($button, FALSE, FALSE, 0); 195 $vbox->pack_start($hbox, FALSE, FALSE, 0); 196 197 # AHHHH! IT BURNS!!! THE CODE IS SO HORRIBLE! 198 # Take note, kids - this is what happens when you keep adding 199 # features without rethinking your basic design. 200 201 $hbox = Gtk3::HBox->new(FALSE, 5); 202 $label = Gtk3::Label->new_with_mnemonic("Add to _list: "); 203 $hbox->pack_start($label, FALSE, FALSE, 0); 204 my $path_list = Gtk3::ComboBoxText->new; 205 $label->set_mnemonic_widget($path_list); 206 foreach my $path (sort keys %{$config->{"display_tables"}}) { 207 $path_list->append_text($path); 208 } 209 $hbox->pack_start($path_list, FALSE, FALSE, 0); 210 $vbox->pack_start($hbox, FALSE, FALSE, 0); 211 212 $hbox = Gtk3::HBox->new(FALSE, 5); 213 $label = Gtk3::Label->new_with_mnemonic("_Replacement: "); 214 $hbox->pack_start($label, FALSE, FALSE, 0); 215 my $replace_entry = Gtk3::Entry->new; 216 $label->set_mnemonic_widget($replace_entry); 217 $hbox->pack_start($replace_entry, TRUE, TRUE, 0); 218 $vbox->pack_start($hbox, FALSE, FALSE, 0); 219 220 if (exists $config->{"retrywithout"}) { 221 $hbox = Gtk3::HBox->new(FALSE, 5); 222 # Pressing Alt+e just activates the first of the retrywithout buttons 223 $label = Gtk3::Label->new_with_mnemonic("Retry without: "); 224 $hbox->pack_start($label, FALSE, FALSE, 0); 225 foreach my $without (@{$config->{"retrywithout"}}) { 226 $button = Gtk3::Button->new("$without->[0]"); 227 $button->signal_connect( 228 clicked => sub { 229 my @chars = @{$without}[1..$#$without]; 230 my $stripped = replace_strip_chars($config, $args, \@chars, $word); 231 # recombine substrings 232 my $repl_text = ""; 233 $repl_text .= $_->[1] foreach @$stripped; 234 $replace_entry->set_text($repl_text); 235 }, $window); 236 $hbox->pack_start($button, FALSE, FALSE, 0); 237 } 238 $vbox->pack_start($hbox, FALSE, FALSE, 0); 239 } 240 241 $hbox = Gtk3::HBox->new(FALSE, 0); 242 $button = Gtk3::Button->new("_Add replacement"); 243 $button->signal_connect( 244 clicked => sub { 245 if ($path_list->get_active != -1) { 246 $action = ["add", $word, $replace_entry->get_text, $path_list->get_active_text]; 247 $window->destroy; 248 } 249 }, $window); 250 $hbox->pack_start($button, FALSE, FALSE, 0); 251 $vbox->pack_start($hbox, FALSE, FALSE, 0); 252 253 $hbox = Gtk3::HBox->new(FALSE, 5); 254 $button = Gtk3::Button->new("_Stop processing"); 255 $button->signal_connect( 256 clicked => sub { 257 $stop = 1; 258 $window->destroy; 259 }, $window); 260 $hbox->pack_start($button, FALSE, FALSE, 0); 261 262 $button = Gtk3::Button->new("Reload config"); 263 $button->signal_connect( 264 clicked => sub { 265 $action = ["reload"]; 266 $window->destroy; 267 }, $window); 268 $hbox->pack_start($button, FALSE, FALSE, 0); 269 270 if ($config_error) { 271 $label = Gtk3::Label->new("Error loading config; see terminal output for details"); 272 $hbox->pack_start($label, FALSE, FALSE, 0); 273 } 274 $vbox->pack_start($hbox, FALSE, FALSE, 0); 275 276 $window->add($vbox); 277 $window->show_all; 278 Gtk3->main; 279 280 die "Processing stopped at line $cur_lineno\n" if $stop; 281 282 if (!$action) { 283 # This action isn't explicitly handled, but at least nothing 284 # breaks when the window is closed without selecting an action 285 $action = ["dummy"]; 286 } 287 return $action; 288 } 289 290 # Prompt the user when a word has multiple replacement options (separated by $config->{choicesep}) 291 # $cur_lineno - display string to show the current line number 292 # Returns: 293 # 3, if this window needs to be called again but nothing needs 294 # to be re-transliterated 295 # 1, if the line needs to be re-transliterated 296 # 0, if nothing needs to be done 297 sub prompt_choose_word { 298 my ($substrings, $config, $args, $cur_lineno) = @_; 299 300 # make a list of all substrings that contain multiple word options 301 my @replacements; 302 foreach (0..$#$substrings) { 303 if ($substrings->[$_]->[0] && $substrings->[$_]->[1] =~ /\Q$config->{choicesep}\E/) { 304 # This ugly bit of code is here as a special case for transliterating 305 # Hindi to Urdu text - if there are *exactly* two choices and one 306 # contains diacritics but the other one doesn't, the one with diacritics 307 # is automatically used instead of prompting the user. 308 if (exists $config->{"targetdiacritics"}) { 309 my @choices = split /\Q$config->{choicesep}\E/, $substrings->[$_]->[1]; 310 my @diacritics = @{$config->{"targetdiacritics"}}; 311 if (@choices == 2) { 312 @choices = map {NFD($_)} @choices; 313 my $first_matches = grep {$choices[0] =~ /$_/} @diacritics; 314 my $second_matches = grep {$choices[1] =~ /$_/} @diacritics; 315 if ($first_matches && !$second_matches) { 316 $substrings->[$_]->[1] = $choices[0]; 317 next; 318 } elsif (!$first_matches && $second_matches) { 319 $substrings->[$_]->[1] = $choices[1]; 320 next; 321 } 322 } 323 } 324 # Format of the elements in @replacements: 325 # [<id of substrings in $substrings>, <replacement word>, <original string>] 326 push @replacements, [$_, $substrings->[$_]->[1], $substrings->[$_]->[1]]; 327 } 328 } 329 # no substrings have multiple options 330 return if (!@replacements); 331 332 my $stop = 0; 333 my $open_unknown = 0; 334 my $cur_replacement = 0; 335 336 my $window = Gtk3::Window->new('toplevel'); 337 $window->signal_connect(delete_event => sub {return FALSE}); 338 $window->signal_connect(destroy => sub { Gtk3->main_quit; }); 339 $window->set_border_width(10); 340 341 my $vbox = Gtk3::VBox->new(FALSE, 0); 342 343 my $linelabel = Gtk3::Label->new("Current line: $cur_lineno"); 344 $vbox->pack_start($linelabel, FALSE, FALSE, 0); 345 346 my $wordlabel = Gtk3::Label->new(""); 347 $wordlabel->set_alignment(0.0, 0.0); 348 $vbox->pack_start($wordlabel, FALSE, FALSE, 0); 349 350 my $undo = Gtk3::Button->new("_Undo"); 351 $vbox->pack_start($undo, FALSE, FALSE, 0); 352 $undo->set_sensitive(FALSE); 353 354 my $button_vbox = Gtk3::VBox->new(FALSE, 0); 355 $vbox->pack_start($button_vbox, FALSE, FALSE, 0); 356 357 my $accept = Gtk3::Button->new("_Accept changes?"); 358 $vbox->pack_start($accept, FALSE, FALSE, 0); 359 360 my $hbox = Gtk3::HBox->new(FALSE, 0); 361 my $label = Gtk3::Label->new("Context: "); 362 my $text = Gtk3::TextView->new; 363 $text->set_wrap_mode("word"); 364 my $buffer = $text->get_buffer(); 365 my $highlight = $buffer->create_tag("yellow_bg", "background", "yellow"); 366 $text->set_editable(FALSE); 367 $hbox->pack_start($label, FALSE, FALSE, 0); 368 $hbox->pack_start($text, TRUE, TRUE, 10); 369 $vbox->pack_start($hbox, FALSE, FALSE, 10); 370 371 $hbox = Gtk3::HBox->new(FALSE, 5); 372 my $skip_button = Gtk3::Button->new("Sk_ip word"); 373 $hbox->pack_start($skip_button, FALSE, FALSE, 0); 374 my $unknown_button = Gtk3::Button->new("_Open in unknown word window"); 375 $hbox->pack_start($unknown_button, FALSE, FALSE, 0); 376 my $stop_button = Gtk3::Button->new("_Stop processing"); 377 $hbox->pack_start($stop_button, FALSE, FALSE, 0); 378 $vbox->pack_start($hbox, FALSE, FALSE, 0); 379 380 # generate the context to the left and to the right of the current word being replaced 381 my $get_context = sub { 382 my ($contextl, $contextr) = ("", ""); 383 my $tmp_replacement = 0; 384 foreach (0..$#$substrings) { 385 my $word = $substrings->[$_]->[1]; 386 if ($tmp_replacement <= $#replacements && $replacements[$tmp_replacement]->[0] == $_) { 387 $word = $replacements[$tmp_replacement]->[1]; 388 $tmp_replacement++; 389 } 390 # When nothing is left to replace, the entire string is in $contextl 391 if ($cur_replacement > $#replacements || $_ < $replacements[$cur_replacement]->[0]) { 392 $contextl .= $word; 393 } elsif ($_ > $replacements[$cur_replacement]->[0]) { 394 $contextr .= $word; 395 } 396 } 397 return ($contextl, $contextr); 398 }; 399 400 # fill the text buffer with the context and current word, highlighting the word 401 # if $cur_replacement is after the end of @replacements, don't highlight anything 402 # (this happens when all words have been replaced and the user only needs to accept the changes) 403 my $fill_text_buffer = sub { 404 my $start = $buffer->get_start_iter(); 405 my $end = $buffer->get_end_iter(); 406 $buffer->delete($start, $end); 407 my ($contextl, $contextr) = $get_context->(); 408 $buffer->set_text($contextr); 409 if ($cur_replacement <= $#replacements) { 410 $start = $buffer->get_start_iter(); 411 $buffer->insert_with_tags($start, $replacements[$cur_replacement]->[1], $highlight); 412 } 413 $start = $buffer->get_start_iter(); 414 $buffer->insert($start, $contextl); 415 }; 416 417 my $show_accept = sub { 418 $button_vbox->foreach(sub {my $child = shift; $child->destroy();}); 419 $accept->show; 420 $accept->grab_focus; 421 $wordlabel->set_text(""); 422 $skip_button->set_sensitive(FALSE); 423 $unknown_button->set_sensitive(FALSE); 424 }; 425 426 my $fill_button_vbox; # forward-declaration so it can be used here already 427 my $next_word = sub { 428 $undo->set_sensitive(TRUE); 429 $cur_replacement++; 430 $fill_text_buffer->(); 431 if ($cur_replacement > $#replacements) { 432 $show_accept->(); 433 return; 434 } 435 $fill_button_vbox->(); 436 }; 437 438 # fill $button_vbox with the word options for the current word 439 $fill_button_vbox = sub { 440 $button_vbox->foreach(sub {my $child = shift; $child->destroy();}); 441 my $word = $replacements[$cur_replacement]->[1]; 442 $wordlabel->set_text("Word \"$word\" has multiple replacement options:"); 443 my @choices = split /\Q$config->{choicesep}\E/, $replacements[$cur_replacement]->[1]; 444 if (exists $config->{"targetdiacritics"}) { 445 # This nasty bit of code finds the number of diacritics in every 446 # choice and sorts the choice in descending order based on that 447 my %choice_nums; 448 foreach my $choice (@choices) { 449 $choice_nums{$choice} = 0; 450 foreach my $diacritic (@{$config->{"targetdiacritics"}}) { 451 my @matches = NFD($choice) =~ /$diacritic/; 452 $choice_nums{$choice} += scalar @matches if @matches; 453 } 454 } 455 @choices = sort {$choice_nums{$b} <=> $choice_nums{$a}} @choices; 456 } 457 my $cur_num = 1; 458 foreach my $word_choice (@choices) { 459 # the mnemonics don't make sense when the number has more than one digit 460 my $choice_label = $cur_num <= 9 ? "_$cur_num: $word_choice" : $word_choice; 461 my $button = Gtk3::Button->new($choice_label); 462 $button->signal_connect( 463 clicked => sub { 464 $replacements[$cur_replacement]->[1] = $word_choice; 465 $next_word->(); 466 }, $window); 467 $button_vbox->pack_start($button, FALSE, FALSE, 0); 468 $button->show; 469 $cur_num++; 470 } 471 }; 472 473 $undo->signal_connect( 474 clicked => sub { 475 if ($cur_replacement > 0) { 476 $cur_replacement--; 477 if ($cur_replacement == 0) { 478 $undo->set_sensitive(FALSE); 479 } 480 $replacements[$cur_replacement]->[1] = $replacements[$cur_replacement]->[2]; 481 $fill_button_vbox->(); 482 $fill_text_buffer->(); 483 $accept->hide; 484 $skip_button->set_sensitive(TRUE); 485 $unknown_button->set_sensitive(TRUE); 486 my $word = $replacements[$cur_replacement]->[1]; 487 $wordlabel->set_text("Word \"$word\" has multiple replacement options:"); 488 } 489 }, $window); 490 491 $accept->signal_connect( 492 clicked => sub { 493 # write the changes to the original $substrings 494 foreach (@replacements) { 495 $substrings->[$_->[0]]->[1] = $_->[1]; 496 } 497 $window->destroy; 498 }, $window); 499 500 $skip_button->signal_connect(clicked => $next_word, $window); 501 502 $unknown_button->signal_connect( 503 clicked => sub { 504 $open_unknown = 1; 505 $window->destroy; 506 }, $window); 507 508 $stop_button->signal_connect( 509 clicked => sub { 510 $stop = 1; 511 $window->destroy; 512 }, $window); 513 514 $fill_button_vbox->(); 515 $fill_text_buffer->(); 516 517 $window->add($vbox); 518 $window->show_all; 519 $accept->hide; 520 Gtk3->main; 521 die "Processing stopped at line $cur_lineno\n" if $stop; 522 if ($open_unknown) { 523 my $ret = call_unknown_word_window( 524 $substrings, $replacements[$cur_replacement]->[0], 525 $config, $args, $cur_lineno); 526 # the word choice window still needs to be called again 527 # when 0 is returned 528 return 3 if $ret == 0; 529 return $ret; 530 } 531 return 0; 532 } 533 534 my $ID = 0; 535 my $STRING = 1; 536 537 # Parse the configuration file into data type (currently only ID and STRING) 538 sub parse_config { 539 my $f = shift; 540 my $fh; 541 if (!open($fh, "<", $f)) { 542 warn "Can't open config file \"$f\"!\n"; 543 return; 544 } 545 my @commands; 546 my $state = 0; 547 my $IN_ID = 1; 548 my $IN_STR = 2; 549 my $cur_val = ""; 550 while (my $line = <$fh>) { 551 chomp($line); 552 $state = 0; 553 push(@commands, []); 554 foreach my $char (split(//, $line)) { 555 if ($char eq "#" && !($state & $IN_STR)) { 556 last; 557 } elsif ($char eq '"') { 558 if ($state & $IN_STR) { 559 push(@{$commands[-1]}, {type => $STRING, value => $cur_val}); 560 $cur_val = ""; 561 $state &= ~$IN_STR; 562 } else { 563 $cur_val = ""; 564 $state |= $IN_STR; 565 } 566 } elsif ($char eq " ") { 567 if ($state & $IN_ID) { 568 push(@{$commands[-1]}, {type => $ID, value => $cur_val}); 569 $state &= ~$IN_ID; 570 $cur_val = ""; 571 } elsif ($state) { 572 $cur_val .= $char; 573 } 574 } else { 575 if (!$state) { 576 $state |= $IN_ID; 577 } 578 $cur_val .= $char; 579 } 580 } 581 if ($state & $IN_STR) { 582 warn "ERROR: Unterminated string in config:\n$line"; 583 return; 584 } elsif ($cur_val) { 585 push(@{$commands[-1]}, {type => $ID, value => $cur_val}); 586 $cur_val = ""; 587 } 588 if ($#{$commands[-1]} == -1) { 589 pop(@commands); 590 } 591 } 592 close($fh); 593 594 return \@commands; 595 } 596 597 # if the path is relative, find its absolute location based 598 # on the location of the config file 599 sub open_file_rel_abs { 600 my ($filename, $config_file, $mode) = @_; 601 $mode //= "<"; 602 if (!file_name_is_absolute $filename) { 603 my $config_dir = dirname $config_file; 604 $filename = rel2abs($filename, $config_dir); 605 } 606 my $fh; 607 if (!open $fh, $mode, $filename) { 608 warn "Can't open file \"$filename\"!\n"; 609 return; 610 } 611 return $fh; 612 } 613 614 # Load a file of replacement words into a hash table 615 sub load_table { 616 my ($filename, $args, $config, $revert) = @_; 617 my $fh = open_file_rel_abs $filename, $args->{"config"}; 618 return if !$fh; 619 my %table; 620 while (my $line = <$fh>) { 621 chomp $line; 622 next if (!$line); 623 my @words = split(/\Q$config->{tablesep}\E/, $line); 624 if (@words != 2) { 625 warn "ERROR: Malformed line in file \"$filename\":\n$line\n"; 626 close $fh; 627 return; 628 } 629 my $word; 630 my $replacement; 631 if ($revert) { 632 $word = NFD $words[1]; 633 $replacement = $words[0]; 634 } else { 635 $word = NFD $words[0]; 636 $replacement = $words[1]; 637 } 638 my @word_choices = split /\Q$config->{choicesep}\E/, $word; 639 foreach my $word_choice (@word_choices) { 640 if (exists $table{$word_choice}) { 641 if ($args->{"checkduplicates"}) { 642 warn "WARNING: Duplicate word in file \"$filename\": " . 643 "\"$word_choice\", with replacement \"$replacement\", " . 644 "already exists with replacement \"$table{$word_choice}\".\n"; 645 } 646 $table{$word_choice} = get_unique_words( 647 $table{$word_choice} . 648 $config->{choicesep} . 649 $replacement, $config); 650 } else { 651 $table{$word_choice} = $replacement; 652 } 653 } 654 } 655 close $fh; 656 return \%table; 657 } 658 659 # Load table for words to ignore - only the keys matter, since there is no replacement 660 sub load_ignore_table { 661 my ($filename, $args) = @_; 662 my $fh = open_file_rel_abs $filename, $args->{"config"}; 663 return if !$fh; 664 my %table; 665 while (my $line = <$fh>) { 666 chomp $line; 667 $table{NFD($line)} = "" if $line; 668 } 669 close $fh; 670 return \%table; 671 } 672 673 # Generate all forms of a word by combining it with endings 674 # Returns: 675 # 0 - an error occurred 676 # 1 - everything's fine 677 sub expand_table { 678 my ($table, $forms, $noroot, $config) = @_; 679 my %new_table; 680 foreach my $word (keys %$table) { 681 foreach my $ending (keys %$forms) { 682 # Some words and/or endings have multiple options, separated by $config->{choicesep} 683 # These must be temporarily separated in order to properly generate the forms 684 my @word_options; 685 my @stem_options = split(/\Q$config->{choicesep}\E/, $table->{$word}); 686 my @ending_options = split(/\Q$config->{choicesep}\E/, $forms->{$ending}); 687 foreach my $stem_option (@stem_options) { 688 foreach my $ending_option (@ending_options) { 689 push(@word_options, $stem_option . $ending_option); 690 } 691 } 692 $new_table{$word . $ending} = join($config->{choicesep}, @word_options); 693 } 694 $new_table{$word} = $table->{$word} if !$noroot; 695 } 696 return \%new_table; 697 } 698 699 # Check if the number and types of arguments given to a config command are right 700 # Returns: 701 # undef - the arguments don't match 702 # 1 - the arguments match 703 sub check_args { 704 my ($args, $cmd) = @_; 705 my $cmd_name = $cmd->[0]->{"value"}; 706 if ($#$cmd - 1 < $#$args) { 707 my $err = "ERROR: not enough arguments for command \"$cmd_name\":"; 708 foreach my $arg (@{$cmd}[1..$#$cmd]) { 709 $err .= " " . $arg->{"value"} 710 } 711 warn "$err\n"; 712 return; 713 } 714 my $arg_num = 0; 715 while ($arg_num <= $#$args) { 716 if ($cmd->[$arg_num + 1]->{"type"} != $args->[$arg_num]) { 717 my $err = "ERROR: argument type mismatch for command \"$cmd_name\".\n"; 718 $err .= "Expected:"; 719 foreach my $arg_type (@$args) { 720 $err .= " ID" if ($arg_type == $ID); 721 $err .= " STRING" if ($arg_type == $STRING); 722 } 723 $err .= "\nReceived:"; 724 foreach my $arg (@{$cmd}[1..$#$cmd]) { 725 $err .= " ID" if ($arg->{"type"} == $ID); 726 $err .= " STRING" if ($arg->{"type"} == $STRING); 727 } 728 warn "$err\n"; 729 return; 730 } 731 $arg_num++; 732 } 733 return 1; 734 } 735 736 # Interpret the config file - load and expand tables, etc. 737 # $config_list - the list returned by parse_config 738 sub interpret_config { 739 my ($config_list, $args) = @_; 740 my %tables; 741 my %config; 742 # table_paths stores a list of all table and replacement ids that are 743 # impacted by the path so the replacement can be added on the fly when 744 # a new replacement is added from the GUI 745 # the "replacement id" is just the number of the replacement group, 746 # starting at 0 with the first group in the config 747 $config{"table_paths"} = {}; 748 # reverted_tables stores a hash of the paths of all tables that are 749 # reverted so that replacements added from the GUI are added in the 750 # right order 751 $config{"reverted_tables"} = {}; 752 # these are the paths of the tables that are displayed in the GUI 753 $config{"display_tables"} = {}; 754 # a mapping between the table ids and tables for all tables used as 755 # ending tables in expand statements - so expansions can be done 756 # on the fly when adding a replacement word from the GUI 757 $config{"ending_tables"} = {}; 758 # ignore is the path to the ignore file, ignore_words the actual table 759 $config{"ignore"} = ""; 760 $config{"ignore_words"} = {}; 761 $config{"split"} = "\\s+"; 762 $config{"beforeword"} = "\\s"; 763 $config{"afterword"} = "\\s"; 764 $config{"tablesep"} = "\t"; 765 $config{"choicesep"} = "\$"; 766 # a list of "replacement configs", which specify the type and any 767 # other arguments (this is given to replace_match, etc. 768 $config{"replacements"} = []; 769 # these are temporary mappings used while loading the config 770 my %path_to_table; 771 my %table_id_to_path; 772 my %mandatory_args = ( 773 "ignore" => [$STRING], 774 "table" => [$ID], 775 "expand" => [$ID, $ID], 776 "match" => [$STRING, $STRING], 777 "matchignore" => [$STRING], 778 "replace" => [$ID], 779 "split" => [$STRING], 780 "beforeword" => [$STRING], 781 "afterword" => [$STRING], 782 "tablesep" => [$STRING], 783 "choicesep" => [$STRING], 784 "comment" => [$STRING], 785 "group" => [], 786 "endgroup" => [], 787 "retrywithout" => [$STRING], 788 "targetdiacritics" => [$STRING] 789 ); 790 my $in_group = 0; 791 foreach my $cmd (@$config_list) { 792 # All load statements must be before expand statements 793 # All expand, beforeword, and afterword statements must be before replace statements 794 my $cmd_name = $cmd->[0]->{"value"}; 795 if ($cmd->[0]->{"type"} == $ID) { 796 if (!exists($mandatory_args{$cmd->[0]->{"value"}})) { 797 warn "ERROR: Unknown command \"" . $cmd->[0]->{"value"} . "\" in config\n"; 798 return; 799 } 800 return if !check_args($mandatory_args{$cmd_name}, $cmd); 801 if ($cmd_name eq "table") { 802 my $table_path = $cmd->[2]->{"value"}; 803 my %table_args; 804 foreach (3..$#$cmd) { 805 $table_args{$cmd->[$_]->{"value"}} = 1; 806 } 807 my $table; 808 # add to temporary path-to-table mapping so tables aren't 809 # loaded unnecessarily 810 if (exists $path_to_table{$table_path}) { 811 $table = $path_to_table{$table_path}; 812 } else { 813 $table = load_table $table_path, $args, \%config, $table_args{"revert"}; 814 return if !defined $table; 815 $path_to_table{$table_path} = $table; 816 } 817 if ($table_args{"revert"}) { 818 $config{"reverted_tables"}->{$table_path} = 1; 819 } 820 my $table_id = $cmd->[1]->{"value"}; 821 $tables{$table_id} = $table; 822 $table_id_to_path{$table_id} = $table_path; 823 # this is a hash to avoid duplicates if the same file 824 # is loaded multiple times 825 $config{"display_tables"}->{$table_path} = 1 if !exists $table_args{"nodisplay"}; 826 } elsif ($cmd_name eq "expand") { 827 my $orig_table_id = $cmd->[1]->{"value"}; 828 my $ending_table_id = $cmd->[2]->{"value"}; 829 my $noroot = 0; 830 if ($#$cmd >= 3 && $cmd->[3]->{"value"} eq "noroot") { 831 $noroot = 1; 832 } 833 if (!exists $tables{$orig_table_id}) { 834 warn "expand: table \"$orig_table_id\" doesn't exist\n"; 835 return; 836 } elsif (!exists $tables{$ending_table_id}) { 837 warn "expand: table \"$ending_table_id\" doesn't exist\n"; 838 return; 839 } 840 841 $config{"ending_tables"}->{$ending_table_id} = $tables{$ending_table_id}; 842 $config{"expands"}->{$orig_table_id} = [] if !exists $config{"expands"}->{$orig_table_id}; 843 push @{$config{"expands"}->{$orig_table_id}}, [$ending_table_id, $noroot]; 844 845 my $new_table = expand_table($tables{$orig_table_id}, $tables{$ending_table_id}, $noroot, \%config); 846 return if !$new_table; 847 $tables{$orig_table_id} = $new_table; 848 } elsif ($cmd_name eq "group") { 849 if ($in_group) { 850 warn "ERROR: New group started without ending last one in config\n"; 851 return; 852 } 853 push @{$config{"replacements"}}, { 854 "type" => "group", "tables" => [], 855 "words" => {}, "options" => {}}; 856 # add all options such as "endword" to the options hash 857 for (1..$#$cmd) { 858 $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1; 859 } 860 $in_group = 1; 861 } elsif ($cmd_name eq "endgroup") { 862 if (!$in_group) { 863 warn "ERROR: endgroup command called while not in group\n"; 864 return; 865 } 866 $in_group = 0; 867 } elsif ($cmd_name eq "match") { 868 if ($in_group) { 869 warn "ERROR: match command is invalid inside group\n"; 870 return; 871 } 872 push @{$config{"replacements"}}, { 873 "type" => "match", 874 "options" => {}, 875 "search" => NFD($cmd->[1]->{"value"}), 876 "replace" => $cmd->[2]->{"value"}}; 877 for (3..$#$cmd) { 878 # add optional arguments as keys in options hash 879 $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1; 880 } 881 } elsif ($cmd_name eq "matchignore") { 882 if ($in_group) { 883 warn "ERROR: matchignore command is invalid inside group\n"; 884 return; 885 } 886 push @{$config{"replacements"}}, { 887 "type" => "match", 888 "options" => {}, 889 "search" => NFD($cmd->[1]->{"value"})}; 890 for (2..$#$cmd) { 891 $config{"replacements"}->[-1]->{"options"}->{$cmd->[$_]->{"value"}} = 1; 892 } 893 } elsif ($cmd_name eq "replace") { 894 if (!$in_group) { 895 warn "ERROR: replace command called while not in group\n"; 896 return; 897 } 898 my $table = $cmd->[1]->{"value"}; 899 if (!exists($tables{$table})) { 900 warn "ERROR: nonexistent table \"$table\" in replace statement.\n"; 901 return; 902 } 903 904 # make a list of all replacements that are affected by this 905 # file so that they can be updated when a word is added 906 # through the gui 907 my $table_path = $table_id_to_path{$table}; 908 my $replacement_id = $#{$config{"replacements"}}; 909 $config{"table_paths"}->{$table_path} = [] if !exists $config{"table_paths"}->{$table_path}; 910 push @{$config{"table_paths"}->{$table_path}}, [$replacement_id, $table]; 911 912 # store list of tables for --debug 913 push @{$config{"replacements"}->[-1]->{"tables"}}, $table; 914 915 # Note: we don't need to check if $table{"choicesep"} was defined 916 # here since we can't ever get this far without first having 917 # loaded a table anyways 918 my $trie_root = $config{"replacements"}->[-1]->{"words"}; 919 my $override = $#$cmd >= 2 && $cmd->[2]->{"value"} eq "override"; 920 add_to_trie($table, $trie_root, $tables{$table}, $args, \%config, $override); 921 } elsif ($cmd_name eq "retrywithout") { 922 if (!exists $config{"retrywithout"}) { 923 $config{"retrywithout"} = []; 924 } 925 # first value is the display name 926 my @values = map {$_->{"value"}} @{$cmd}[1..$#$cmd]; 927 push @{$config{"retrywithout"}}, \@values; 928 } elsif ($cmd_name eq "targetdiacritics") { 929 if (!exists $config{$cmd_name}) { 930 $config{$cmd_name} = []; 931 } 932 foreach (1..$#$cmd) { 933 push @{$config{$cmd_name}}, $cmd->[$_]->{"value"}; 934 } 935 } elsif ($cmd_name eq "split" || $cmd_name eq "beforeword" || 936 $cmd_name eq "afterword" || $cmd_name eq "tablesep" || 937 $cmd_name eq "choicesep" || $cmd_name eq "comment") { 938 $config{$cmd_name} = $cmd->[1]->{"value"}; 939 } elsif ($cmd_name eq "ignore") { 940 $config{"ignore"} = $cmd->[1]->{"value"}; 941 my $table = load_ignore_table $cmd->[1]->{"value"}, $args; 942 return if !defined $table; 943 $config{"ignore_words"} = $table; 944 } else { 945 warn "ERROR: unknown command \"" . $cmd_name . "\" in config.\n"; 946 return; 947 } 948 } else { 949 my $err = "ERROR: line does not start with command:\n"; 950 foreach my $cmd_part (@$cmd) { 951 $err .= $cmd_part->{"value"}; 952 } 953 warn "$err\n"; 954 return; 955 } 956 } 957 if ($in_group) { 958 warn "ERROR: unclosed group in config\n"; 959 return; 960 } 961 if (!$config{"ignore"}) { 962 warn "ERROR: no file of words to ignore specified.\n"; 963 return; 964 } 965 if ($args->{"dumptables"}) { 966 foreach my $table_id (keys %tables) { 967 my $table_path = $table_id_to_path{$table_id}; 968 if ($config{"display_tables"}->{$table_path}) { 969 for my $word (keys %{$tables{$table_id}}) { 970 print NFC($word) . "\n"; 971 } 972 } 973 } 974 } 975 return \%config; 976 } 977 978 # load the config file 979 # Returns: 980 # the config hash or undef if an error occurred 981 sub load_config { 982 my $args = shift; 983 my $config_list = parse_config($args->{"config"}); 984 if (!$config_list) { 985 return; 986 } 987 return interpret_config $config_list, $args; 988 } 989 990 # Handle the action returned by `prompt_unknown_word` 991 # $config - the current program config 992 # $args - the command line arguments 993 # Currently accepted values for $action: 994 # ["ignore", "run", $word] - only ignore $word for the rest of this run 995 # ["ignore", "permanent", $word] - write $word to the permanent ignore file 996 # ["add", $word, $replace_word, $table_path] - add $word to the table 997 # corresponding to $table_path with $replace_word as its replacement. Note that 998 # only tables directly corresponding to paths work here - tables that only 999 # were created through "expand" in the config aren't ever shown separately 1000 # in `prompt_unknown_word` 1001 # ["reload"] - reload the configuration file 1002 # Returns: 1003 # 0 - nothing needs to be done 1004 # 1 - the current line needs to be re-transliterated with the new config 1005 # 2 - an error occurred while reloading the config 1006 # 3 - stop asking for unknown words on this line 1007 sub handle_unknown_word_action { 1008 my ($action, $config, $args) = @_; 1009 if ($action->[0] eq "ignore") { 1010 # yeah, this is a bit messy and inconsistent 1011 return 3 if $action->[1] eq "wholeline"; 1012 $config->{"ignore_words"}->{$action->[2]} = ""; 1013 if ($action->[1] eq "permanent") { 1014 my $fh = open_file_rel_abs $config->{"ignore"}, $args->{"config"}, ">>"; 1015 return 1 if !$fh; 1016 print($fh $action->[2] . "\n"); 1017 close($fh); 1018 } elsif ($action->[1] eq "run") { 1019 # Print to error file if ignore isn't permanent 1020 return 0 if ($args->{"errors"} eq ""); 1021 my $fh; 1022 if (!open($fh, ">>", $args->{"errors"})) { 1023 warn "ERROR: Can't open error file \"$args->{errors}\".\n"; 1024 return 0; 1025 } 1026 print($fh $action->[2] . "\n"); 1027 close($fh); 1028 } 1029 return 0; 1030 } elsif ($action->[0] eq "add") { 1031 my $table_path = $action->[3]; 1032 my $word = $action->[1]; 1033 my $replace_word = $action->[2]; 1034 # make sure to write the words in the correct order if the 1035 # tables were reverted while loading 1036 my $reverted = exists $config->{"reverted_tables"}->{$table_path}; 1037 my $word_abs = $reverted ? $action->[2] : $action->[1]; 1038 my $replace_word_abs = $reverted ? $action->[1] : $action->[2]; 1039 my $fh = open_file_rel_abs $table_path, $args->{"config"}, ">>"; 1040 return 1 if !$fh; 1041 print($fh $word_abs . $config->{tablesep} . $replace_word_abs . "\n"); 1042 close($fh); 1043 # loop over all table ids that are impacted by this file 1044 foreach my $replacement (@{$config->{"table_paths"}->{$table_path}}) { 1045 my $replacement_id = $replacement->[0]; 1046 my $table_id = $replacement->[1]; 1047 my $trie = $config->{"replacements"}->[$replacement_id]->{"words"}; 1048 my $final_table = {$word => $replace_word}; 1049 # handle expansions 1050 foreach my $expand (@{$config->{"expands"}->{$table_id}}) { 1051 my $ending_table_id = $expand->[0]; 1052 my $noroot = $expand->[1]; 1053 my $endings_table = $config->{"ending_tables"}->{$ending_table_id}; 1054 $final_table = expand_table $final_table, $endings_table, $noroot, $config; 1055 } 1056 add_to_trie($table_id, $trie, $final_table, $args, $config); 1057 } 1058 return 1; 1059 } elsif ($action->[0] eq "reload") { 1060 my $new_config = load_config $args; 1061 if ($new_config) { 1062 %$config = %$new_config; 1063 return 1; 1064 } else { 1065 return 2; 1066 } 1067 } 1068 } 1069 1070 # Split $substrings based on the "split" regex in $config. 1071 # $substrings can already be split at this point; only the 1072 # ones that haven't been transliterated yet are modified 1073 sub split_words { 1074 my ($config, $substrings) = @_; 1075 my $split_re = qr/($config->{"split"})/; 1076 my @substrings_new; 1077 foreach my $cur_substr (@$substrings) { 1078 if ($cur_substr->[0] == 1) { 1079 push(@substrings_new, $cur_substr); 1080 next; 1081 } 1082 1083 my @words = split(/$split_re/, $cur_substr->[1]); 1084 for my $i (0..$#words) { 1085 # Word is not delimiter 1086 # Split produces an empty field at the beginning if the string 1087 # starts with the delimiter 1088 if ($i % 2 == 0) { 1089 push(@substrings_new, [0, $words[$i], $words[$i]]) if ($words[$i] ne ''); 1090 } else { 1091 # Delimiters can count as already replaced 1092 push(@substrings_new, [1, $words[$i], $words[$i]]); 1093 } 1094 } 1095 } 1096 @$substrings = @substrings_new; 1097 } 1098 1099 # small helper function to add a untransliterated string to the last substring 1100 # if that is not transliterated yet, or push it onto @$substrings otherwise 1101 # -> used to keep all untransliterated text in one piece 1102 # since this is also used for the "nofinal" attribute on "match", it takes 1103 # an original and replaced string (since, when using "match" and "nofinal", 1104 # the original string was replaced, but is still marked as unknown) 1105 sub push_unknown { 1106 my ($substrings, $orig, $replaced) = @_; 1107 $replaced //= $orig; 1108 if (@$substrings && !$substrings->[-1]->[0]) { 1109 $substrings->[-1]->[1] .= $replaced; 1110 $substrings->[-1]->[2] .= $orig; 1111 } else { 1112 push(@$substrings, [0, $replaced, $orig]); 1113 } 1114 } 1115 1116 # Replace a word in $substrings based on $replace_config using regex 1117 # $replace_config->{"search"} is the word to replace 1118 # $replace_config->{"replace"} is the replacement word 1119 # if $replace_config->{"replace"} is undefined, just splits 1120 # $substrings at the the match and marks that the match has 1121 # been transliterated - currently used for "matchignore" 1122 # $replace_config->{"beginword"}, $replace_config->{"afterword"} - 1123 # specifies if the match is only valid when $config->{"beforeword"} 1124 # or $config->{"afterword"} occur before or after it, respectively 1125 sub replace_match { 1126 my ($config, $replace_config, $substrings, $debug_msg) = @_; 1127 my $beginword = exists $replace_config->{"options"}->{"beginword"}; 1128 my $endword = exists $replace_config->{"options"}->{"endword"}; 1129 my $fullword = $beginword && $endword; 1130 my $beforeword = $config->{"beforeword"}; 1131 my $afterword = $config->{"afterword"}; 1132 my $word = $replace_config->{"search"}; 1133 my $replace_word = $replace_config->{"replace"}; 1134 if ($fullword) { 1135 $word = qr/(\A|$beforeword)$word(\z|$afterword)/; 1136 } elsif ($beginword) { 1137 $word = qr/(\A|$beforeword)$word/; 1138 } elsif ($endword) { 1139 $word = qr/$word(\z|$afterword)/; 1140 } else { 1141 $word = qr/$word/; 1142 } 1143 1144 my @substrings_new; 1145 # only modify $substrings at all if the word was found 1146 my $found_word = 0; 1147 my $last_idx; 1148 # @substrings_new is only used if needed to improve efficiency 1149 foreach my $i (0..$#$substrings) { 1150 if ($substrings->[$i]->[0]) { 1151 # FIXME: is there a way to make it more efficient by keeping the old array? 1152 # This is a major bottleneck 1153 # Note: the above statement *may* be a bit exaggerated 1154 if ($found_word) { 1155 push(@substrings_new, $substrings->[$i]); 1156 } 1157 next; 1158 } 1159 $last_idx = 0; 1160 my $i0 = 0; 1161 my $i1 = 0; 1162 while ($substrings->[$i]->[1] =~ m/$word/g) { 1163 if (!$found_word) { 1164 print $debug_msg if $debug_msg; 1165 $found_word = 1; 1166 if ($i != 0) { 1167 push(@substrings_new, @{$substrings}[0..$i-1]); 1168 } 1169 } 1170 # This mess is needed to reliably match $beforeword and $afterword and put the captured 1171 # "splitting" characters back into the text. This would be much easier just using 1172 # a lookbehind and lookahead, but I couldn't find a way to also match beginning and 1173 # end of string that way. 1174 $i0 = $-[0]; 1175 $i1 = $+[0]; 1176 if ($fullword) { 1177 $i0 += length($1); 1178 $i1 -= length($2); 1179 # pos need to be decreased so that matches still work right next to each other 1180 pos($substrings->[$i]->[1]) -= length($2); 1181 } elsif ($beginword) { 1182 $i0 += length($1); 1183 } elsif ($endword) { 1184 $i1 -= length($1); 1185 pos($substrings->[$i]->[1]) -= length($1); 1186 } 1187 if ($last_idx != $i0) { 1188 my $unknown = substr($substrings->[$i]->[1], $last_idx, $i0-$last_idx); 1189 push_unknown \@substrings_new, $unknown; 1190 } 1191 my $orig_str = substr($substrings->[$i]->[1], $i0, $i1-$i0); 1192 my $replace_str = $replace_word // $orig_str; 1193 if ($replace_config->{"options"}->{"nofinal"}) { 1194 warn "Replaced (nofinal) \"$orig_str\" with \"$replace_str\"\n" if $debug_msg; 1195 push_unknown \@substrings_new, $orig_str, $replace_str; 1196 } else { 1197 warn "Replaced \"$orig_str\" with \"$replace_str\"\n" if $debug_msg; 1198 push(@substrings_new, [1, $replace_str, $orig_str]); 1199 } 1200 $last_idx = $i1; 1201 } 1202 if ($last_idx < length($substrings->[$i]->[1]) && $found_word) { 1203 my $unknown = substr($substrings->[$i]->[1], $last_idx); 1204 push_unknown \@substrings_new, $unknown; 1205 } 1206 } 1207 if ($found_word) { 1208 @$substrings = @substrings_new; 1209 } 1210 } 1211 1212 # Replace a group, i.e. replace all the words in a trie 1213 # $replace_config->{"words"} - the root node of the trie 1214 # $replace_config->{"beginword"}, $replace_config->{"endword"} - 1215 # same as in `replace_match` 1216 sub replace_group { 1217 my ($config, $replace_config, $substrings, $debug_msg) = @_; 1218 my @substrings_new; 1219 my $word_found = 0; 1220 # Recurse backwords towards the root node of the trie to find the first 1221 # node with a key "final" which satisfies the ending condition (if "endword" is set) 1222 # Returns the id *after* the last match and the node that was found 1223 # with the key "final" (or undef, if nothing matched) 1224 my $find_final = sub { 1225 my ($i, $tmp_cur_node, $s) = @_; 1226 do { 1227 my $after_ch = substr($s->[1], $i, 1); 1228 if (exists $tmp_cur_node->{"final"} && (!exists($replace_config->{"options"}->{"endword"}) || 1229 $after_ch eq "" || $after_ch =~ $config->{"afterword"})) { 1230 return ($i, $tmp_cur_node); 1231 } 1232 $i--; 1233 } while ($tmp_cur_node = $tmp_cur_node->{"parent"}); 1234 # none of the points were appropriate for breaking the word, so 1235 # $tmp_cur_node now points to the nonexistent parent node of the 1236 # root node 1237 return ($i, undef); 1238 }; 1239 foreach my $s (@$substrings) { 1240 if ($s->[0]) { 1241 push(@substrings_new, $s); 1242 next; 1243 } 1244 my $cur_node = $replace_config->{"words"}; 1245 my $start_i = 0; 1246 my $i = 0; 1247 # This deliberately goes off the end of the string! $cur_node is always "one behind" $i 1248 # since the node is only advanced in the iteration *after* $i has increased, meaning that 1249 # $i has to already be after the end of the string for the first if statement to definitely 1250 # fail, causing the elsif statement to handle that case 1251 while ($i <= length($s->[1])) { 1252 # This works even when $i is one index after the end of the string - it just returns "" then 1253 my $ch = substr($s->[1], $i, 1); 1254 if (exists $cur_node->{$ch}) { 1255 if ($cur_node == $replace_config->{"words"}) { 1256 my $before_ch = $i > 0 ? substr($s->[1], $i - 1, 1) : ""; 1257 if (exists($replace_config->{"options"}->{"beginword"}) && 1258 $before_ch ne "" && $before_ch !~ $config->{"beforeword"}) { 1259 push_unknown \@substrings_new, $ch; 1260 $i++; 1261 next; 1262 } 1263 $start_i = $i; 1264 } 1265 $cur_node = $cur_node->{$ch}; 1266 } elsif (exists $cur_node->{"final"} || $cur_node != $replace_config->{"words"} || $i == length($s->[1])-1) { 1267 my $tmp_cur_node = $cur_node; 1268 ($i, $tmp_cur_node) = $find_final->($i, $tmp_cur_node, $s); 1269 if (!defined($tmp_cur_node)) { 1270 push_unknown \@substrings_new, substr($s->[1], $i + 1, 1); 1271 $i += 2; 1272 } else { 1273 my $orig = substr($s->[1], $start_i, $i-$start_i); 1274 my $final = $tmp_cur_node->{"final"}; 1275 if ($debug_msg) { 1276 if (!$word_found) { 1277 warn $debug_msg; 1278 $word_found = 1; 1279 } 1280 warn "Replaced \"$orig\" with \"$final\"\n"; 1281 } 1282 push(@substrings_new, [1, $final, $orig]); 1283 } 1284 $cur_node = $replace_config->{"words"}; 1285 next; 1286 } else { 1287 push_unknown \@substrings_new, $ch; 1288 } 1289 $i++; 1290 } 1291 } 1292 @$substrings = @substrings_new; 1293 } 1294 1295 # Perform all replacements on $word, first removing all 1296 # characters specified in $chars 1297 sub replace_strip_chars { 1298 my ($config, $args, $chars, $word) = @_; 1299 foreach my $char (@$chars) { 1300 $word =~ s/\Q$char\E//g; 1301 } 1302 return replace_line($config, $args, $word); 1303 } 1304 1305 # Perform all replacements on $line based on $config 1306 # $substrings: array of arrays - each one has three elements: 1307 # first 0 or 1, indicating if the substring following it has already 1308 # been replaced or not (1 means it has been replaced), then the 1309 # transliterated string, and lastly the original string. 1310 # If the first element is 0, the second two elements are obviously same 1311 sub replace_line { 1312 my ($config, $args, $line) = @_; 1313 my $substrings = [[0, $line, $line]]; 1314 foreach my $replacement (@{$config->{"replacements"}}) { 1315 if ($replacement->{"type"} eq "match") { 1316 my $debug_msg; 1317 if ($args->{"debug"}) { 1318 my $options = join " ", keys(%{$replacement->{"options"}}); 1319 $debug_msg = "Match ($options): \"$replacement->{search}\""; 1320 if ($replacement->{"replace"}) { 1321 $debug_msg .= " \"$replacement->{replace}\"\n"; 1322 } else { 1323 $debug_msg .= " (ignore)\n"; 1324 } 1325 } 1326 replace_match($config, $replacement, $substrings, $debug_msg); 1327 } elsif ($replacement->{"type"} eq "group") { 1328 my $debug_msg; 1329 if ($args->{"debug"}) { 1330 my $options = join " ", keys(%{$replacement->{"options"}}); 1331 my $tables = '"' . join('" "', @{$replacement->{"tables"}}) . '"'; 1332 $debug_msg = "Group ($options): $tables\n"; 1333 } 1334 replace_group($config, $replacement, $substrings, $debug_msg); 1335 } 1336 } 1337 # splits all words at the end so that the splitting characters 1338 # aren't taken as unknown words and the unknown words are (hopefully) 1339 # in better chunks for prompting the user about them 1340 split_words($config, $substrings); 1341 1342 return $substrings; 1343 } 1344 1345 # Call the unknown word window with the given substrings and index 1346 # See `get_unknown_words` for explanation of other parameters 1347 # (should be obvious) 1348 # Returns: 1349 # 3, if the rest of the line should be skipped 1350 # 1, if the line needs to be re-transliterated 1351 # 0, if nothing needs to be done 1352 sub call_unknown_word_window { 1353 my ($substrings, $i, $config, $args, $cur_lineno) = @_; 1354 my $word = $substrings->[$i]; 1355 my $contextl = ""; 1356 my $contextl_orig = ""; 1357 foreach my $j (0..$i-1) { 1358 $contextl .= $substrings->[$j]->[1]; 1359 $contextl_orig .= $substrings->[$j]->[2]; 1360 } 1361 my $contextr = ""; 1362 my $contextr_orig = ""; 1363 foreach my $j ($i+1..$#$substrings) { 1364 $contextr .= $substrings->[$j]->[1]; 1365 $contextr_orig .= $substrings->[$j]->[2]; 1366 } 1367 my $action = prompt_unknown_word($contextl, $contextl_orig, 1368 $word->[1], $word->[2], $contextr, $contextr_orig, 1369 $config, "$cur_lineno", $args); 1370 # if $ret == 3, rest of line should be skipped 1371 # if $ret == 2, config could not be loaded 1372 # if $ret == 1, line must be redone with new config 1373 my $ret = handle_unknown_word_action($action, $config, $args); 1374 # keep retrying until the user chooses an action which 1375 # didn't throw an error 1376 while ($ret == 2) { 1377 $action = prompt_unknown_word($contextl, $contextl_orig, 1378 $word->[1], $word->[2], $contextr, $contextr_orig, 1379 $config, "$cur_lineno", $args, 1); 1380 $ret = handle_unknown_word_action($action, $config, $args); 1381 } 1382 return $ret; 1383 } 1384 1385 # NOTE: MUST ALWAYS ADD REPLACEMENT WORDS FIRST! 1386 # If an ignore word is added which is attached to a word that should have a replacement 1387 # added and just that word is selected to ignore, you never get a chance to add a 1388 # replacement for the other word that it is attached to 1389 1390 # NOTE: This is very ugly code. The GUI code is the worst, but this whole part 1391 # of the program is nasty. This is partially due to the fact that features kept 1392 # being added when their use was discovered. This problem might be fixed in the 1393 # future when I have time to rewrite all of this. 1394 1395 # Handle unknown words 1396 # $substrings - the current substrings with unknown words 1397 # $config - the program config 1398 # $args - the command line args 1399 # $cur_lineno - display string to show the user the current line number 1400 # Returns: 1401 # 1 - the line needs to be re-transliterated 1402 # 0 - all done 1403 sub get_unknown_words { 1404 my ($substrings, $config, $args, $cur_lineno) = @_; 1405 foreach my $i (0 .. $#$substrings) { 1406 my $word = $substrings->[$i]; 1407 if (!$word->[0] && !exists($config->{"ignore_words"}->{$word->[1]})) { 1408 my $ret = call_unknown_word_window $substrings, $i, $config, $args, $cur_lineno; 1409 # 3 means we ignore the line 1410 if ($ret == 3) { 1411 foreach my $s (@$substrings) { 1412 # revert all changes done on the line 1413 $s->[1] = $s->[2]; 1414 } 1415 return 0; 1416 } 1417 # 1 means the line needs to be re-transliterated 1418 return 1 if $ret == 1; 1419 } 1420 $i++; 1421 } 1422 return 0; 1423 } 1424 1425 # Main replacement function 1426 # Opens the input file ($args->{"input"}) and writes the transliterated text 1427 # to the file handle $outputfh, prompting the user for unknown words or 1428 # word choices (if those aren't disabled on the command line) 1429 sub replace { 1430 my ($config, $args, $total_lines, $inputfh, $outputfh) = @_; 1431 while (my $line = <$inputfh>) { 1432 next if $. < $args->{"start"}; 1433 my $comment; 1434 if (exists $config->{"comment"}) { 1435 $line =~ s/\Q$config->{comment}\E(.*)\z//s; 1436 $comment = $1; 1437 } 1438 my $nfd_line = NFD($line); 1439 my $substrings = replace_line($config, $args, $nfd_line); 1440 1441 if (!$args->{"nounknowns"}) { 1442 # re-transliterate the string if the config was reloaded 1443 while (get_unknown_words($substrings, $config, $args, "$./$total_lines")) { 1444 $substrings = replace_line($config, $args, $nfd_line); 1445 } 1446 } elsif ($args->{"debugspecial"}) { 1447 foreach my $s (@$substrings) { 1448 if (!$s->[0] && !exists($config->{"ignore_words"}->{$s->[1]})) { 1449 warn "Unknown word: \"$s->[1]\"\n"; 1450 } 1451 } 1452 } 1453 if (!$args->{"nochoices"}) { 1454 # this only loops more than once if the user presses the button 1455 # "Open in unknown word window" 1456 while (my $ret = prompt_choose_word($substrings, $config, $args, "$./$total_lines")) { 1457 if ($ret == 1) { 1458 $substrings = replace_line($config, $args, $nfd_line); 1459 } 1460 } 1461 } elsif ($args->{"debugspecial"}) { 1462 foreach my $s (@$substrings) { 1463 if ($s->[0] && $s->[1] =~ /\Q$config->{choicesep}\E/) { 1464 my $num_choices = split /\Q$config->{choicesep}\E/, $s->[1]; 1465 warn "Word \"$s->[1]\" with $num_choices word choices.\n"; 1466 } 1467 } 1468 } 1469 1470 foreach (@$substrings) { 1471 print $outputfh $_->[1]; 1472 } 1473 print $outputfh $comment if $comment; 1474 } 1475 } 1476 1477 my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => ""); 1478 GetOptions( 1479 \%args, "debug", "debugspecial", 1480 "nochoices", "nounknowns", 1481 "force", "start=i", 1482 "output=s", "config=s", 1483 "errors=s", "help", 1484 "checkduplicates", "dumptables") or pod2usage(1); 1485 1486 pod2usage(-exitval => 0, -verbose => 2) if $args{"help"}; 1487 pod2usage(-exitval => 1, -verbose => 1) if @ARGV > 1; 1488 1489 if (!-f $args{"config"}) { 1490 die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n"; 1491 } 1492 my $config = load_config \%args; 1493 if (!$config) { 1494 die "ERROR: Invalid config\n"; 1495 } 1496 exit 0 if ($args{"checkduplicates"} || $args{"dumptables"}); 1497 1498 my $inputfh; 1499 my $total_lines = "UNKNOWN"; 1500 if (@ARGV < 1) { 1501 warn "WARNING: no input file supplied; taking input from STDIN\n"; 1502 $inputfh = \*STDIN; 1503 } else { 1504 open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n"; 1505 # Is there *really* no more efficient way to get the total number of lines? 1506 $total_lines = 0; 1507 while (<$inputfh>) {$total_lines++}; 1508 close $inputfh; 1509 open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n"; 1510 } 1511 1512 if (-f $args{"errors"} && !$args{"force"}) { 1513 my $choice = ""; 1514 while ($choice !~ /^[yn]$/) { 1515 print STDERR "\"$args{errors}\" already exists. Do you want to overwrite it? "; 1516 $choice = <STDIN>; 1517 chomp $choice; 1518 } 1519 die "ERROR: \"$args{errors}\" already exists.\n" if $choice ne "y"; 1520 } 1521 1522 my $outputfh; 1523 if ($args{"output"} eq "") { 1524 warn "WARNING: no output file supplied; printing to STDOUT\n"; 1525 $outputfh = \*STDOUT; 1526 } elsif (-f $args{"output"} && !$args{"force"}) { 1527 my $choice = ""; 1528 while ($choice !~ /^[aoe]$/) { 1529 print STDERR "\"$args{output}\" already exists. (a)ppend, (o)verwrite, or (e)xit? "; 1530 $choice = <STDIN>; 1531 chomp $choice; 1532 } 1533 if ($choice eq "a") { 1534 open $outputfh, ">>", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n"; 1535 } elsif ($choice eq "e") { 1536 die "ERROR: \"$args{output}\" already exists.\n"; 1537 } else { 1538 open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n"; 1539 } 1540 } else { 1541 open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n"; 1542 } 1543 1544 replace($config, \%args, $total_lines, $inputfh, $outputfh); 1545 close $inputfh; 1546 close $outputfh; 1547 1548 __END__ 1549 1550 =head1 NAME 1551 1552 transliterate.pl - Transliterate text files 1553 1554 =head1 SYNOPSIS 1555 1556 transliterate.pl [options][input file] 1557 1558 Start the transliteration engine with the given file as input. 1559 The input file defaults to STDIN if no filename is given. 1560 1561 =head1 OPTIONS 1562 1563 =over 8 1564 1565 =item B<--output> <filename> 1566 1567 Sets the output file to print to. 1568 1569 If the file exists already and B<--force> is not set, the user is asked 1570 if the file should be overwritten or appended to. 1571 1572 B<Default:> C<STDOUT> (print to terminal) 1573 1574 =item B<--config> <filename> 1575 1576 Sets the configuration file to use. 1577 1578 B<Default:> C<config> 1579 1580 =item B<--checkduplicates> 1581 1582 Prints all duplicate words within single table files and across tables 1583 that are replaced within the same group, then exits the program. 1584 1585 Note that this simply prints B<all> duplicates, even ones that are 1586 legitimate. When duplicates are found during normal operation of 1587 the program, they are simply combined in exactly the same way as the 1588 regular word choices. 1589 1590 Also note that the words are still added as possible choices, which 1591 may be slightly confusing. If, for instance, a word "word" is stored 1592 in the tables "tablea", "tableb", and "tablec" with the replacements 1593 "a", "b", and "c", the first duplicate message will say that the 1594 first occurrence was in table "tablea" with the replacement "a", and 1595 the second duplicate message will say that the first occurrence was 1596 in table "tablea" with the replacement "a$b" (assuming $ is the 1597 value set as B<choicesep> in the config). This is just something to 1598 be aware of. 1599 1600 On that note, before duplicates are checked between tables in the 1601 same replacement group, duplicates inside the same file are already 1602 replaced, so that might be a bit confusing as well. 1603 1604 =item B<--dumptables> 1605 1606 Prints the words of all tables that don't have B<nodisplay> set. 1607 1608 This is mainly meant to be used for generating word lists in order to 1609 use them in a spell checker. Note that the words printed here are in 1610 UTF-8 NFC (Unicode Canonical Composition Form), so it may not be ideal 1611 when the spellchecked text is not in the same form. 1612 1613 =item B<--nochoices> 1614 1615 Disables prompting for the right word when multiple replacement words exist. 1616 1617 This can be used to "weed out" all the unknown words before 1618 commencing the laborious task of choosing the right word every time 1619 multiple options exist. 1620 1621 =item B<--nounknowns> 1622 1623 Disables prompting for the right word when a word is not found in the database. 1624 1625 This can be used together with B<--nochoices> to perform a quick test of how 1626 well the actual engine is working without having to click through all the 1627 prompts. 1628 1629 =item B<--debug> 1630 1631 Prints information helpful for debugging problems with the B<match> and B<group> 1632 statements. 1633 1634 For each B<match> or B<group> statement which replaces anything, the original 1635 statement is printed (the format is a bit different than in the config) and 1636 each actual word that's replaced is printed. 1637 1638 =item B<--debugspecial> 1639 1640 This option is only useful for automatic testing of the transliteration engine. 1641 1642 If B<--nochoices> is enabled, each word in the input with multiple choices will 1643 be output, along with the number of choices (can be used to test the proper 1644 functioning of B<choicesep> in the config file). 1645 1646 If B<--nounknowns> is enabled, each unknown word in the input is printed 1647 (can be used to test that the B<ignore> options are working correctly). 1648 1649 =item B<--force> 1650 1651 Always overwrites the output and error file without asking. 1652 1653 =item B<--start> <line number> 1654 1655 Starts at the given line number instead of the beginning of the file. 1656 1657 Note: when "Stop processing" is pressed, the current line number is 1658 printed out. This is the current line that was being processed, so it 1659 has not been printed to the output file yet and thus the program must 1660 be resumed at that line, not the one afterwards. 1661 1662 =item B<--errors> <filename> 1663 1664 Specifies a file to write errors in. Note that this does not refer to 1665 actual errors, but to any words that were temporarily ignored 1666 (i.e. words for which "Ignore: This run" was clicked). 1667 1668 If no file is specified, nothing is written. If a file is specified 1669 that already exists and B<--force> is not set, the user is prompted 1670 for action. 1671 1672 =item B<--help> 1673 1674 Displays the full documentation. 1675 1676 =back 1677 1678 =head1 DESCRIPTION 1679 1680 B<transliterate.pl> will read the given input file and transliterate it 1681 based on the given configuration file, prompting the user for action if 1682 a word has multiple replacement options or is not found in the database. 1683 1684 See L</"CONFIGURATION"> for details on what is possible. 1685 1686 Note that this is B<not> some sort of advanced transliteration engine 1687 which understands the grammar of the language and tries to guess words 1688 based on that. This is only a glorified find-and-replace program 1689 with some extra features to make it useful for transliterating text 1690 using large wordlists. 1691 1692 WARNING: All input data is assumed to be UTF-8! 1693 1694 =head1 WORD CHOICE WINDOW 1695 1696 The word choice window is opened any time one word has multiple replacement 1697 options and prompts the user to choose one. 1698 1699 For each word with multiple options, the user must choose the right option 1700 and then press "Accept changes" to finalize the transliteration of the 1701 current line. The button to accept changes is selected by default, so it 1702 is possible to just press enter instead of manually clicking it. Before the 1703 line is finalized, the user may press "Undo" to undo any changes on the 1704 current line. 1705 1706 "Skip word" just leaves it as is. This shouldn't be needed in most cases 1707 since B<choicesep> should always be set to a character that doesn't occur 1708 normally in the text anyways. 1709 1710 "Open in unknown word window" will open the 1711 L<unknown word window|/"UNKNOWN WORD WINDOW"> with the current word 1712 selected. This is meant as a helper if you notice that another word choice 1713 needs to be added. 1714 1715 Warning: This is very inconsistent and buggy! Since the unknown word window 1716 is just opened directly, it isn't modified to make more sense for this 1717 situation. Whenever "Add replacement" is pressed, the whole line is 1718 re-transliterated as usual, but the word choice window is opened again 1719 right afterwards. If you just want to go back to the word choice window, 1720 press the ignore button for "whole line" since that shouldn't break 1721 anything. There are weird inconsistencies, though - for instance, if you 1722 delete all words in the tables, then press "Reload config", the line will 1723 be re-transliterated and none of the words will actually be found, but it 1724 will still go on because control passes back to the word choice window no 1725 matter what. Also, none of the word choices that were already done on this 1726 line are saved since the line is restarted from the beginning. As I said, 1727 it's only there as a helper and is very buggy/inconsistent. Maybe I'll make 1728 everything work better in a future release. 1729 1730 "Stop processing" will exit the program and print the line number that was 1731 currently being processed. 1732 1733 =head1 UNKNOWN WORD WINDOW 1734 1735 The unknown word window is opened any time a word could not be replaced. 1736 1737 Both the context from the original script and the context from the 1738 transliterated version (so far) is shown. If a part of the text is 1739 selected in one of the text boxes and "Use selection as word" is 1740 pressed for the appropriate box, the selected text is used for the 1741 action that is taken subsequently. "Reset text" resets the text in 1742 the text box to its original state (except for the highlight because 1743 I'm too lazy to do that). 1744 1745 The possible actions are: 1746 1747 =over 8 1748 1749 =item Ignore 1750 1751 "This run" only ignores the word until the program exits, while 1752 "Permanently" saves the word in the ignore file specified in the 1753 configuration. "Whole line" stops asking for unknown words on 1754 this line and prints the line out as it originally was in the 1755 file. Note that any words in the original line that contain 1756 B<choicesep> will still cause the L<word choice window|/"WORD CHOICE WINDOW"> 1757 to appear due to the way it is implemented. Just press "Skip word" 1758 if that happens. 1759 1760 =item Retry without <display name> 1761 1762 Removes all characters specified in the corresponding B<retrywithout> 1763 statement in the L<config|/"CONFIGURATION"> 1764 from the currently selected word and re-transliterates just that 1765 word. The result is then pasted into the text box beside 1766 "Add replacement" so it can be added to a table. This is only a 1767 sort of helper for languages like Urdu in which words often can 1768 be written with or without diacritics. If the "base form" without 1769 diacritics is already in the tables, this button can be used to 1770 quickly find the transliteration instead of having to type it 1771 out again. Any part of the word that couldn't be transliterated 1772 is just pasted verbatim into the text box (but after the 1773 characters have been removed). 1774 1775 Note that the selection can still be modified after this, before 1776 pressing "Add to list". This could potentially be useful if a word 1777 is in a table that is expanded using "noroot" because for instance 1778 "Retry without diacritics" would only work with the full word (with 1779 the ending), but only the stem should be added to the list. If that 1780 is the case, "Retry without diacritics" could be pressed with the 1781 whole word selected, but the ending could be removed before actually 1782 pressing "Add to list". 1783 1784 A separate button is shown for every B<retrywithout> statement 1785 in the config. 1786 1787 =item Add to list 1788 1789 Adds the word typed in the text box beside "Add replacement" to the 1790 selected table file as the replacement for the word currently selected 1791 and re-runs the replacement on the current line. All table files that 1792 do not have B<nodisplay> set are shown as options, see L</"CONFIGURATION">. 1793 1794 Warning: This simply appends the word and its replacement to the end 1795 of the file, so it will cause an error if there was no newline 1796 ("\n") at the end of the file before. 1797 1798 Note that this always re-transliterates the entire line afterwards. 1799 This is to allow more flexibility. Consider, for instance, a compound 1800 word of which the first part is also a valid single word. If the 1801 entire line was not re-transliterated, it would be impossible to 1802 add a replacement for that entire compound word and have it take 1803 effect during the same run since the first part of the word would 1804 not even be available for transliteration anymore. 1805 1806 One problem is that the word is just written directly to the file 1807 and there is no undo. This is the way it currently is and will 1808 probably not change very soon. If a mistake is made, the word can 1809 always be removed again manually from the list and "Reload config" 1810 pressed. 1811 1812 =item Reload config 1813 1814 Reload the configuration file along with all tables an re-runs the 1815 replacement on the current line. Note that this can take a short 1816 while since the entire word database has to be reloaded. 1817 1818 =item Stop processing 1819 1820 Prints the current line number to the terminal and exits the program. 1821 1822 The program can always be started again at this line number using 1823 the B<--start> option if needed. 1824 1825 =back 1826 1827 =head1 INTERNALS/EXAMPLES 1828 1829 This section was added to explain to the user how the transliteration 1830 process works internally since that may be necessary to understand 1831 why certain words are replaced the way they are. 1832 1833 First off, the process works line-by-line, i.e. no B<match> or B<replace> 1834 statement will ever match anything that crosses the end of a line. 1835 1836 Each line is initially stored as one chunk which is marked as 1837 untransliterated. Then, all B<match>, B<matchignore>, and B<replace> 1838 (or, rather, B<group>) statements are executed in the order they 1839 appear in the config file. Whenever a word/match is replaced, it 1840 is split off into a separate chunk which is marked as transliterated. 1841 A chunk marked as transliterated I<is entirely ignored by any 1842 replacement statements that come afterwards>. Note that B<beginword> 1843 and B<endword> can always match at the boundary between an 1844 untransliterated and transliterated chunk. This is to facilitate 1845 automated replacement of certain grammatical constructions. For instance: 1846 1847 If the string "a-" could be attached as a prefix to any word and needed 1848 to be replaced as "b-" everywhere, it would be quite trivial to add 1849 a match statement C<'match "a-" "b-" beginword'>. If run on the text 1850 "a-word", where "word" is some word that should be transliterated 1851 as "word_replaced", and the group replace statement for the word comes 1852 after the match statement given above, the following would happen: 1853 First, the match statement would replace "a-" and split the text into 1854 the two chunks "b-" and "word", where "b-" is already marked as 1855 transliterated. Since "word" is now separate, it will be matched 1856 by the group replace statement later, even if it has B<beginword> set 1857 and would normally not match if "a-" came before it. Thus, the final 1858 output will be "b-word_replaced", allowing for the uniform replacement 1859 of the prefix instead of having to add each word twice, once with and 1860 once without the prefix. 1861 1862 In certain cases, this behavior may not be desired. Consider, for 1863 instance, a prefix "c-" which cannot be replaced uniformly as in the 1864 example above due to differences in the source and destination script. 1865 Since it cannot be replaced uniformly, two words "word1" and "word2" 1866 would both need to be specified separately with replacements for 1867 "c-word1" and "c-word2". If, however, the prefix "c-" has an 1868 alternate spelling "c " (without the hyphen), it would be very useful 1869 to be able to automatically recognize that as well. This is where the 1870 B<nofinal> attribute for the B<match> statements comes in. If there is 1871 a match statement C<'match "c " "c-" beginword nofinal'>, the replaced 1872 chunk is B<not> marked as transliterated, so after executing this 1873 statement on the text "c word1", there will still only be one chunk, 1874 "c-word1", allowing for the regular word replacements to function 1875 properly. 1876 1877 Once all the replacement statements have been processed, each chunk 1878 of text that is not marked as transliterated yet is split based on 1879 the B<split> pattern specified in the config and all actual characters 1880 matched by the B<split> pattern are marked as transliterated (this 1881 usually means all the spaces, newlines, quotation marks, etc.). Any 1882 remaining words/text chunks that are still marked as untransliterated are 1883 now processed by the unknown word window. If one of these remaining 1884 unknown chunks is present in the file specified by the B<ignore> 1885 statement in the config, it is simply ignored and later printed out 1886 as is. After all untransliterated words have either had a replacement 1887 added or been ignored, any words with multiple replacement choices are 1888 processed by the word choice window. Once this is all done, the final 1889 output is written to the output file and the process is repeated with 1890 the next line. Note that the entire process is started again each time 1891 a word is added to a table or the config is reloaded from the 1892 L<unknown word window|/"UNKNOWN WORD WINDOW">. 1893 1894 =head1 CONFIGURATION 1895 1896 These are the commands accepted in the configuration file. 1897 Any parameters in square brackets are optional. 1898 Comments are started with C<#>. Strings (filenames, regex strings, etc.) 1899 are enclosed in double quotes (""). 1900 1901 The B<match>, B<matchignore>, and B<replace> commands are executed in 1902 the order they are specified, except that all B<replace> commands within 1903 the same group are replaced together. 1904 1905 The B<match> and B<matchignore> statements accept any RegEx strings and 1906 are thus very powerful. The B<group> statements only work with the 1907 non-RegEx words from the tables, but are very efficient for large numbers 1908 of words and should thus be used for the main bulk of the words. 1909 1910 Any duplicate words found will cause the user to be prompted to choose 1911 one option every time the word is replaced in the input text. 1912 1913 Note that any regex strings specified in the config should B<not> 1914 contain capture groups, as that would break the B<endword> functionality 1915 since this is also implemented internally using capture groups. Capture 1916 groups are also entirely pointless in the config since they currently 1917 cannot be used as part of the replacement string in B<match> statements. 1918 Lookaheads and lookbehinds are fine, though, and could be useful in 1919 certain cases. 1920 1921 All tables must be loaded before they are used, or there will be an 1922 error that the table does not exist. 1923 1924 Warning: If a B<replace> statement is located before an B<expand> 1925 statement that would have impacted the table used, there will be no 1926 error but the expand statement won't have any impact. 1927 1928 Basic rule of thumb: Always put the B<table> statements before the 1929 B<expand> statements and the B<expand> statements before the B<replace> 1930 statements. 1931 1932 =over 8 1933 1934 =item B<split> <regex string> 1935 1936 Sets the RegEx string to be used for splitting words. This is only used 1937 for splitting the words which couldn't be replaced after all replacement 1938 has been done, before prompting the user for unknown words. 1939 1940 Note that B<split> should probably always contain at least C<\n>, since 1941 otherwise all of the newlines will be marked as unknown words. Usually, 1942 this will be included anyways through C<\s>. 1943 1944 Note also that B<split> should probably include the C<+> RegEx-quantifier 1945 since that allows the splitting function in the end to ignore several 1946 splitting characters right after each other (e.g. several spaces) in one 1947 go instead of splitting the string again for every single one of them. 1948 This shouldn't actually make any difference functionality-wise, though. 1949 1950 B<Default:> C<\s+> (all whitespace) 1951 1952 =item B<beforeword> <regex string> 1953 1954 Sets the RegEx string to be matched before a word if B<beginword> is set. 1955 1956 B<Default:> C<\s> 1957 1958 =item B<afterword> <regex string> 1959 1960 Sets the RegEx string to be matched after a word if B<endword> is set. 1961 1962 Note that B<afterword> should probably always contain at least C<\n>, 1963 since otherwise words with B<endword> set will not be matched at the 1964 end of a line. 1965 1966 B<beforeword> and B<afterword> will often be exactly the same, but 1967 they are left as separate options in case more fine-tuning is needed. 1968 1969 B<Default:> C<\s> 1970 1971 =item B<tablesep> <string> 1972 1973 Sets the separator used to split the lines in the table files into the 1974 original and replacement word. 1975 1976 B<Default:> C<Tab> 1977 1978 =item B<choicesep> <string> 1979 1980 Sets the separator used to split replacement words into multiple choices for 1981 prompting the user. 1982 1983 B<Default:> C<$> 1984 1985 =item B<comment> <string> 1986 1987 If enabled, anything after C<< <string> >> will be ignored on all lines in 1988 the input file. This will not be displayed in the 1989 L<unknown word window|/"UNKNOWN WORD WINDOW"> or L<word choice window|/"WORD CHOICE WINDOW"> 1990 but will still be printed in the end, with the comment character removed 1991 (that seems to be the most sensible thing to do). 1992 1993 Note that this is really just a "dumb replacement", so there's no way to 1994 prevent a line with the comment character from being ignored. Just try 1995 to always set this to a character that does not occur anywhere in the text 1996 (or don't use the option at all). 1997 1998 =item B<ignore> <filename> 1999 2000 Sets the file of words to ignore. 2001 2002 This has to be set even if the file is just empty because the user can 2003 add words to it from the unknown word window. 2004 2005 =item B<table> <table identifier> <filename> [nodisplay] [revert] 2006 2007 Load the table from C<< <filename> >>, making it available for later use in the 2008 B<expand> and B<replace> commands using the identifier C<< <table identifier> >>. 2009 2010 if B<nodisplay> is set, the filename for this table is not shown in the 2011 L<unknown word window|/"UNKNOWN WORD WINDOW">. If, however, the same filename 2012 is loaded again for another table that does not have B<nodisplay> set, it is 2013 still displayed. 2014 2015 If B<revert> is set, the original and replacement words are switched. This can 2016 be useful for creating a config for transliterating in the opposite direction 2017 with the same database. I don't know why I called it "revert" since it should 2018 actually be called "reverse". I guess I was a bit confused. 2019 2020 Note that if C<< <filename> >> is not an absolute path, it is taken to be relative 2021 to the location of the configuration file. 2022 2023 The table files simply consist of B<tablesep>-separated values, with the word in the 2024 original script first and the replacement word second. Both the original and 2025 replacement word can optionally have several parts separated by B<choicesep>. If the 2026 original word has multiple parts, it is separated and each of the parts is added 2027 to the table with the replacement. If the replacement has multiple parts, the user 2028 will be prompted to choose one of the options during the transliteration process. 2029 If the same word occurs multiple times in the same table with different replacements, 2030 the replacements are automatically added as choices that will be handled by the 2031 L<word choice window|/"WORD CHOICE WINDOW">. 2032 2033 If, for whatever reason, the same table is needed twice, but with different endings, 2034 the table can simply be loaded twice with different IDs. If the same path is loaded, 2035 the table that has already been loaded will be reused. Note that this feature was 2036 added before adding B<revert>, so the old table is used even if it had B<revert> 2037 set and the new one doesn't. This is technically a problem, but I don't know of 2038 any real-world case where it would be a problem, so I'm too lazy to change it. 2039 Tell me if it actually becomes a problem for you. 2040 2041 WARNING: Don't load the same table file both with and without B<revert> in the same 2042 config! When a replacement word is added through the GUI, the program has to know 2043 which way to write the words. Currently, whenever a table file is loaded with 2044 B<revert> anywhere in the config (even if it is loaded without B<revert> in a 2045 different place), words will automatically be written as if B<revert> was on. I 2046 cannot currently think of any reason why someone would want to load a file both 2047 with and without B<revert> in the same config, but I still wanted to add this 2048 warning just in case. 2049 2050 =item B<expand> <table identifier> <word ending table> [noroot] 2051 2052 Expand the table C<< <table identifier> >>, i.e. generate all the word forms using 2053 the word endings in C<< <word ending table> >>, saving the result as a table with the 2054 identifier C<< <new table identifier> >>. 2055 2056 Note: There used to be a C<< <new table identifier> >> argument to create a new 2057 table in case one table had to be expanded with different endings. This has been 2058 removed because it was a bit ugly, especially since there wasn't a proper mapping 2059 from table IDs to filenames anymore. If this functionality is needed, the same table 2060 file can simply be loaded multiple times. See the B<table> section above. 2061 2062 If B<noroot> is set, the root forms of the words are not kept. 2063 2064 If the replacement for a word ending contains B<choicesep>, it is split and each part 2065 is combined with the root form separately and the user is prompted to choose one of 2066 the options later. it is thus possible to allow multiple choices for the ending if 2067 there is a distinction in the replacement script but not in the source script. 2068 Note that each of the root words is also split into its choices (if necessary) 2069 during the expanding, so it is possible to use B<choicesep> in both the endings 2070 and root words. 2071 2072 =item B<match> <regex string> <replacement string> [beginword] [endword] [nofinal] 2073 2074 Perform a RegEx match using the given C<< <regex string> >>, replacing it with 2075 C<< <replacement string> >>. Note that the replacement cannot contain any RegEx 2076 (e.g. groups) in it. B<beginword> and B<endword> specify whether the match must 2077 be at the beginning or ending of a word, respectively, using the RegEx specified 2078 in B<beforeword> and B<afterword>. If B<nofinal> is set, the string is not marked 2079 as transliterated after the replacement, allowing it to be modified by subsequent 2080 B<match> or B<replace> commands. 2081 2082 =item B<matchignore> <regex string> [beginword] [endword] 2083 2084 Performs a RegEx match in the same manner as B<match>, except that the original 2085 match is used as the replacement instead of specifying a replacement string, i.e. 2086 whatever is matched is just marked as transliterated without changing it. 2087 2088 =item B<group> [beginword] [endword] 2089 2090 Begins a replacement group. All B<replace> commands must occur between B<group> 2091 and B<endgroup>, since they are then grouped together and replaced in one go. 2092 B<beginword> and B<endword> act in the same way as specified for B<match> and 2093 apply to all B<replace> statements in this group. 2094 2095 =item B<replace> <table identifier> [override] 2096 2097 Replace all words in the table with the identifier C<< <table identifier> >>, 2098 using the B<beginword> and B<endword> settings specified by the current group. 2099 2100 Unless B<override> is set on the latter table, if the same word occurs in two 2101 tables with different replacements, both are automatically added as choices. 2102 See L</"WORD CHOICE WINDOW">. 2103 2104 B<override> can be useful if the same database is used for both directions and 2105 one direction maps multiple words to one word, but in the other direction this 2106 word should always default to one of the choices. In that case, a small table 2107 with these special cases can be created and put at the end of the main B<group> 2108 statement with B<override> set. This is technically redundant since you could 2109 just add a special group with only the override table in it earlier in the 2110 config, but it somehow seems cleaner this way. 2111 2112 Note that a table must have been loaded before being used in a B<replace> statement. 2113 2114 =item B<endgroup> 2115 2116 End a replacement group. 2117 2118 =item B<retrywithout> <display name> [character] [...] 2119 2120 Adds a button to the L<unknown word window|/"UNKNOWN WORD WINDOW"> to retry the 2121 replacements on the selected word, first removing the given characters. 2122 The button is named "<display name>" and located after the "Retry without" label. 2123 Whatever is found with the replacements is pasted into the regular text box for 2124 the "Add replacement" functionality. 2125 2126 This can be used as an aid when, for instance, words can be written with or without 2127 certain diacritics. If the actual word without diacritics is already in the 2128 database and there is a B<retrywithout> statement for all the diacritics, the 2129 button can be used to quickly find the replacement for the word instead of having 2130 to type it out manually. The same goes for compound words that can be written 2131 with or without a space. 2132 2133 It is also possible to specify B<retrywithout> without any characters, which just 2134 adds a button that takes whatever word is selected and retries the replacements 2135 on it. This can be useful if you want to manually edit words and quickly see if 2136 they are found with the edits in place. 2137 2138 Note that all input text is first normalized to the unicode canonical decomposition 2139 form so that diacritics can be removed individually. 2140 2141 Also note that all buttons are currently just dumped in the GUI without any 2142 sort of wrapping, so they'll run off the screen if there are too many. 2143 Tell me if this becomes a problem. I'm just too lazy to change it right now. 2144 2145 Small warning: This only removes the given characters from the word selected in 2146 the GUI, not from the tables. Thus, this only works if the version of the word 2147 without any of the characters is already present in the tables. It would be useful 2148 when handling diacritics if the program could simply make a comparison while 2149 completely ignoring diacritics, but I haven't figured out a nice way to implement 2150 that yet. 2151 2152 Historical note: This was called B<diacritics> in a previous version and only 2153 allowed removal of diacritics. This is exactly the same functionality, just 2154 generalized to allow removal of any characters with different buttons. 2155 2156 =item B<targetdiacritics> <diacritic> [...] 2157 2158 This was only added to simplify transliteration from Hindi to Urdu with the 2159 same database. When this is set, the choices in the 2160 L<word choice window|/"WORD CHOICE WINDOW"> are sorted in descending order 2161 based on the number of diacritics from this list that are matched in each 2162 choice. This is so that when transliterating from Hindi to Urdu, the choice 2163 with the most diacritics is always at the top. 2164 2165 Additionally, if there are I<exactly> two choices for a word and one of 2166 them contains diacritics but the other one doesn't, the one containing 2167 diacritics is automatically taken without ever prompting the user. This 2168 is, admittedly, a very language-specific feature, but I couldn't think of 2169 a simple way of adding it without building it directly into the actual program. 2170 2171 Note that due to the way this is implemented, it will not take any effect 2172 if B<--nochoices> is enabled. 2173 2174 The attentive reader will notice at this point that most of the features 2175 in this program were added specifically for dealing with Urdu and Hindi, 2176 which does appear to make sense, considering that this program was written 2177 specifically for transliterating Urdu to Hindi and vice versa (although 2178 not quite as much vice versa). 2179 2180 =back 2181 2182 =head1 BUGS 2183 2184 Although it may not seem like it, one of the ugliest parts of the program is the 2185 GUI functionality that allows the user to add a replacement word. The problem is 2186 that all information about the B<expand> and B<replace> statements has to be kept 2187 in order to properly handle adding a word to one of the files and simultaneously 2188 adding it to the currently loaded tables I<without reloading the entire config>. 2189 The way it currently works, the replacement word is directly written to the file, 2190 then all B<expand> statements that would have impacted the words from this file 2191 are redone (just for the newly added word) and the resulting words are added to 2192 the appropriate tables (or, technically, the appropriate 'trie'). Since a file 2193 can be mapped to multiple table IDs and a table ID can occur in multiple replace 2194 statements, this is more complicated than it sounds, and thus it is very likely 2195 that there are bugs lurking here somewhere. Do note that "Reload config" will 2196 B<always> reload the entire configuration, so that's safe to do even if the 2197 on-the-fly replacing doesn't work. 2198 2199 In general, I have tested the GUI code much less than the rest since you can't 2200 really test it automatically very well. 2201 2202 The code is generally quite nasty, especially the parts belonging to the GUI. 2203 Don't look at it. 2204 2205 Tell me if you find any bugs. 2206 2207 =head1 SEE ALSO 2208 2209 perlre, perlretut 2210 2211 =head1 LICENSE 2212 2213 Copyright (c) 2019, 2020, 2021 lumidify <nobody[at]lumidify.org> 2214 2215 Permission to use, copy, modify, and/or distribute this software for any 2216 purpose with or without fee is hereby granted, provided that the above 2217 copyright notice and this permission notice appear in all copies. 2218 2219 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 2220 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 2221 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 2222 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 2223 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 2224 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 2225 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 2226 2227 =cut