transliterate

Transliteration engine
git clone git://lumidify.org/transliterate.git
Log | Files | Refs | README | LICENSE

commit 9bece82fb8385f9b72a49e7c2b98a9b4f2006182
parent 32b57ea48795b5d6a406fcf15e3543d47618b666
Author: lumidify <nobody@lumidify.org>
Date:   Wed,  8 Apr 2020 08:00:04 +0200

Add comment option for text files; default to STDIN for input file

Diffstat:
M tests/test6/config | 1 +
M tests/test6/input.txt | 1 +
M transliterate.pl | 51 ++++++++++++++++++++++++++++++++++-----------------
3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/tests/test6/config b/tests/test6/config @@ -1,6 +1,7 @@ split "[ \n]+" beforeword " " afterword "[ \n]" +comment "#" ignore "../data/ignore.txt" table words "../data/words.txt" diff --git a/tests/test6/input.txt b/tests/test6/input.txt @@ -1,3 +1,4 @@ +word1#sfsafafasfs#sdfdsfsfs word1 word2 123word1 word9 123 word4 word20 word01231 word0 diff --git a/transliterate.pl b/transliterate.pl @@ -469,7 +469,7 @@ sub parse_config { $state = 0; push(@commands, []); foreach my $char (split(//, $line)) { - if ($char eq "#") { + if ($char eq "#" && !($state & $IN_STR)) { last; } elsif ($char eq '"') { if ($state & $IN_STR) { @@ -698,6 +698,7 @@ sub interpret_config { "afterword" => [$STRING], "tablesep" => [$STRING], "choicesep" => [$STRING], + "comment" => [$STRING], "group" => [], "endgroup" => [], "retrywithout" => [$STRING, $STRING], @@ -837,7 +838,7 @@ sub interpret_config { } } elsif ($cmd_name eq "split" || $cmd_name eq "beforeword" || $cmd_name eq "afterword" || $cmd_name eq "tablesep" || - $cmd_name eq "choicesep") { + $cmd_name eq "choicesep" || $cmd_name eq "comment") { $config{$cmd_name} = $cmd->[1]->{"value"}; } elsif ($cmd_name eq "ignore") { $config{"ignore"} = $cmd->[1]->{"value"}; @@ -1272,15 +1273,12 @@ sub get_unknown_words { # to the file handle $outputfh, prompting the user for unknown words or # word choices (if those aren't disabled on the command line) sub replace { - my ($config, $args, $outputfh) = @_; - # Is there *really* no more efficient way to get the total number of lines? - open my $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n"; - my $total_lines = 0; - while (<$fh>) {$total_lines++}; - close $fh; - open $fh, "<", $args->{"input"} or die "ERROR: Cannot open input file \"$args->{input}\" for reading.\n"; - while (my $line = <$fh>) { + my ($config, $args, $total_lines, $inputfh, $outputfh) = @_; + while (my $line = <$inputfh>) { next if $. 
< $args->{"start"}; + if (exists $config->{"comment"}) { + $line =~ s/\Q$config->{comment}\E.*$//; + } my $nfd_line = NFD($line); my $substrings = replace_line($config, $nfd_line); @@ -1311,7 +1309,6 @@ sub replace { print $outputfh $_->[1]; } } - close $fh; } my %args = ("config" => "config", "start" => 1, "errors" => "", "output" => ""); @@ -1324,7 +1321,7 @@ GetOptions( "checkduplicates") or pod2usage(1); pod2usage(-exitval => 0, -verbose => 2) if $args{"help"}; -pod2usage(1) if $#ARGV != 0 && !$args{"checkduplicates"}; +pod2usage(-exitval => 1, -verbose => 1) if @ARGV > 1; if (!-f $args{"config"}) { die "ERROR: config file \"$args{config}\" does not exist or is not a file.\n"; @@ -1335,11 +1332,19 @@ if (!$config) { } exit 0 if ($args{"checkduplicates"}); -my $input = $ARGV[0]; -if (!-f $input) { - die "ERROR: input file \"$input\" does not exist or is not a file.\n"; +my $inputfh; +my $total_lines = "UNKNOWN"; +if (@ARGV < 1) { + warn "WARNING: no input file supplied; taking input from STDIN\n"; + $inputfh = \*STDIN; +} else { + open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n"; + # Is there *really* no more efficient way to get the total number of lines? + $total_lines = 0; + while (<$inputfh>) {$total_lines++}; + close $inputfh; + open $inputfh, "<", $ARGV[0] or die "ERROR: Cannot open input file \"$ARGV[0]\" for reading.\n"; } -$args{"input"} = $input; if (-f $args{"errors"} && !$args{"force"}) { my $choice = ""; @@ -1373,7 +1378,8 @@ if ($args{"output"} eq "") { open $outputfh, ">", $args{"output"} or die "ERROR: cannot open \"$args{output}\" for writing.\n"; } -replace($config, \%args, $outputfh); +replace($config, \%args, $total_lines, $inputfh, $outputfh); +close $inputfh; close $outputfh; __END__ @@ -1387,6 +1393,7 @@ transliterate.pl - Transliterate text files transliterate.pl [options][input file] Start the transliteration engine with the given file as input. 
+The input file defaults to STDIN if no filename is given. =head1 OPTIONS @@ -1762,6 +1769,16 @@ prompting the user. B<Default:> C<$> +=item B<comment> <string> + +If enabled, anything after C<< <string> >> will be ignored on all lines in +the input file. + +Note that this is really just a "dumb replacement", so there's no way to +prevent a line with the comment character from being replaced. Just try +to always set this to a character that does not occur anywhere in the text +(or don't use the option at all). + =item B<ignore> <filename> Sets the file of words to ignore.