commit 4fbf400a3cf553ee8119f3d27072c235657102c0
parent f35b80dd2c875acfa769b7f1e69346c43e8cdf27
Author: lumidify <nobody@lumidify.org>
Date: Tue, 24 Mar 2020 15:24:16 +0100
Change option parsing; add more documentation
Diffstat:
D | TODO | | | 3 | --- |
M | lumia.pl | | | 394 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- |
2 files changed, 328 insertions(+), 69 deletions(-)
diff --git a/TODO b/TODO
@@ -1,3 +0,0 @@
-ALLOW FORCE ON MV AND CP (don't prompt for overwrite)
-Allow to run command on multiple files but keep checksum (e.g. convmv)
-update command - if file was edited
diff --git a/lumia.pl b/lumia.pl
@@ -1,13 +1,10 @@
#!/usr/bin/env perl
-# FIXME: some way to avoid writing .lumidify* in dirs but still index them? e.g. Code/CMSG
-# FIXME: cksum don't create malformed line if permission denied
-# FIXME: ignore all except for a certain file/folder
-# FIXME: store modified date and checksum filed with changed date
-# FIXME: allow different hash types
-# FIXME: don't write anything if cksum fails (will create malformed line)
-# FIXME: add option to just check dir structure or maybe check if everything exists
-# FIXME: add option to compare cksums of two dirs
+# TODO: some way to avoid writing .lumidify* in dirs but still index them?
+# TODO: store modified date and checksum files with changed date
+# TODO: add option to just check dir structure or maybe check if everything exists
+# TODO: add option to compare cksums of two dirs
+# TODO: exit status!
use strict;
use warnings;
@@ -16,7 +13,7 @@ use File::Basename qw(basename dirname);
use File::Path qw(remove_tree);
use String::ShellQuote;
use Pod::Usage;
-use Getopt::Std;
+use Getopt::Long;
# the file used to store checksums for files
my $CKSUM_FILE = ".lumidify_archive_cksums";
@@ -27,7 +24,13 @@ my $IGNORE_FILE = ".lumidify_archive_ignore";
# the file containing checksums of $CKSUM_FILE and $DIR_FILE
my $DOUBLE_CKSUM_FILE = ".lumidify_archive_cksums.cksum";
+# uncomment this instead of the lines below to use
+# sha256 instead of cksum as the hash algorithm
+#my $CKSUM_CMD = 'sha256 -q';
+#my $CKSUM_NUMFIELDS = 1;
my $CKSUM_CMD = 'cksum -q';
+my $CKSUM_NUMFIELDS = 2;
+
my %SPECIAL_FILES = (
$CKSUM_FILE => 1,
$DIR_FILE => 1,
@@ -105,13 +108,13 @@ sub make_lumia_iter {
# remove all special lumia files from the given directory
sub clean_files {
- my $dir = shift;
+ my ($dir, $args) = @_;
my $iter = make_file_iter_basic sub {exists $SPECIAL_FILES{basename $_[0]};}, $dir;
while (my $file = $iter->()) {
if (!unlink $file) {
warn "WARNING: Unable to remove file \"$file\"!\n";
} else {
- print "Deleted \"$file\"\n";
+ print "Deleted \"$file\"\n" if !$args->{"q"};
}
}
}
@@ -174,13 +177,13 @@ sub read_cksum_file {
my ($file, $cksums) = @_;
return read_file $file, $cksums, sub {
my $line = shift;
- my @fields = split(/ /, $line, 3);
- if ($#fields != 2) {
+ my @fields = split(/ /, $line, $CKSUM_NUMFIELDS+1);
+ if (@fields != $CKSUM_NUMFIELDS+1) {
warn "WARNING: Malformed line \"$line\" in file \"$file\"\n";
return;
}
- my $cur_cksum = join(" ", @fields[0,1]);
- my $cur_str = $fields[2];
+ my $cur_cksum = join(" ", @fields[0..$CKSUM_NUMFIELDS-1]);
+ my $cur_str = $fields[$CKSUM_NUMFIELDS];
return ($cur_cksum, $cur_str);
};
}
@@ -343,7 +346,7 @@ sub check_new_files {
# add all new files in $top_dir to the checksum files
sub check_add_new_files {
- my $top_dir = shift;
+ my ($top_dir, $args) = @_;
my $changed_dirs = 0;
my $changed_files = 0;
check_new_files $top_dir, sub {
@@ -372,7 +375,7 @@ sub check_add_new_files {
close $fh;
$changed_files = 1;
}
- print "Added \"$fullpath\"\n";
+ print "Added \"$fullpath\"\n" if !$args->{"q"};
}, sub {
if (-f "$_[0]/$DOUBLE_CKSUM_FILE") {
if (!check_cksums $_[0], $DOUBLE_CKSUM_FILE, 1) {
@@ -455,6 +458,8 @@ sub write_cksums {
print $dirs_fh '"' . escape_filename($key) . '"' . "\n";
}
}
+ close $files_fh if defined $files_fh;
+ close $dirs_fh if defined $dirs_fh;
if (@special_files) {
write_special_cksums $dir, @special_files;
}
@@ -480,7 +485,7 @@ sub check_old_files {
# clean up the lumia checksum files, removing any files that aren't present
# on the filesystem anymore
sub remove_old_files {
- my $top_dir = shift;
+ my ($top_dir, $args) = @_;
my $iter = make_lumia_iter $top_dir;
while (my $dir = $iter->()) {
if (!-e $dir) {
@@ -490,7 +495,7 @@ sub remove_old_files {
if (exists $lumia_dirs->{$child}) {
delete $lumia_dirs->{$child};
write_file "$parent/$DIR_FILE", $lumia_dirs;
- print "Removed \"$dir\" from \"$parent/$DIR_FILE\"\n";
+ print "Removed \"$dir\" from \"$parent/$DIR_FILE\"\n" if !$args->{"q"};
write_special_cksums $parent, $DIR_FILE;
}
} else {
@@ -499,7 +504,7 @@ sub remove_old_files {
foreach my $file (keys %$cksums) {
if (!-e "$dir/$file") {
delete $cksums->{$file};
- print "Removed \"$dir/$file\" from \"$dir/$CKSUM_FILE\"\n";
+ print "Removed \"$dir/$file\" from \"$dir/$CKSUM_FILE\"\n" if !$args->{"q"};
$found = 1;
}
}
@@ -552,7 +557,6 @@ sub prompt_overwrite {
return 0;
}
-# FIXME: handle different cases like move_files
# copies the $src files to $dst and updates the checksums in $dst
# $src: list of source paths
# $dst: destination directory or file (in latter case only one src is allowed)
@@ -761,7 +765,7 @@ sub move_files {
# remove a file or directory from the filesystem
sub remove_file_dir {
my ($path, $args) = @_;
- my $options = $args->{"f"} ? "-rf" : "-f";
+ my $options = $args->{"f"} ? "-rf" : "-r";
if (system("rm", $options, "--", $path)) {
return 1;
}
@@ -851,8 +855,9 @@ sub make_dirs {
# extract all special lumia files from $src_dir to $dst_dir, recreating the
# entire directory structure in the process
sub extract {
- my ($src_dir, $dst_dir) = @_;
+ my ($src_dir, $dst_dir, $args) = @_;
my $iter = make_lumia_iter $src_dir;
+ my $options = $args->{"v"} ? "-av" : "-a";
while (my $dir = $iter->()) {
my $final_dir = abs2rel $dir, $src_dir;
my $fulldir = catfile $dst_dir, $final_dir;
@@ -860,7 +865,7 @@ sub extract {
foreach my $file (keys %SPECIAL_FILES) {
my $filepath = catfile $dir, $file;
if (-e $filepath) {
- system("cp", "-aiv", "--", $filepath, catfile($fulldir, $file));
+ system("cp", $options, "--", $filepath, catfile($fulldir, $file));
}
}
}
@@ -896,9 +901,11 @@ sub update {
}
my %args;
-getopts("fqh", \%args);
+Getopt::Long::Configure("bundling");
+GetOptions(\%args, "f|force", "q|quiet", "v|verbose", "h|help");
-pod2usage(-verbose => 1) if @ARGV < 1 || $args{"h"};
+pod2usage(-exitval => 0, -verbose => 2) if $args{"h"};
+pod2usage(-exitval => 1, -verbose => 1) if @ARGV < 1;
my $cmd = shift;
@@ -916,7 +923,7 @@ if ($cmd eq "mv") {
if (@ARGV >= 1) {
$dir = $ARGV[0];
}
- check_add_new_files $dir;
+ check_add_new_files $dir, \%args;
} elsif ($cmd eq "checknew") {
my $dir = ".";
if (@ARGV >= 1) {
@@ -934,7 +941,7 @@ if ($cmd eq "mv") {
if (@ARGV >= 1) {
$dir = $ARGV[0];
}
- remove_old_files $dir;
+ remove_old_files $dir, \%args;
} elsif ($cmd eq "check") {
if (@ARGV < 1) {
check_files \%args, ".";
@@ -946,7 +953,7 @@ if ($cmd eq "mv") {
if (@ARGV >= 1) {
$dir = $ARGV[0];
}
- clean_files $dir;
+ clean_files $dir, \%args;
} elsif ($cmd eq "extract") {
my $src_dir = ".";
my $dst_dir;
@@ -981,8 +988,6 @@ if ($cmd eq "mv") {
die "update requires at least one argument\n";
}
update @ARGV;
-} elsif ($cmd eq "help") {
- pod2usage(-exitval => 0, -verbose => 2);
}
__END__
@@ -993,17 +998,55 @@ lumia.pl - Manage checksums on a filesystem
=head1 SYNOPSIS
-B<lumia.pl> [-qfh] command arguments
+B<lumia.pl> command [-hqfv] arguments
=head1 OPTIONS
=over 8
-=item B<addnew> [directory]
+=item B<-h>, B<--help>
+
+Show the full documentation.
+
+=item B<-q>, B<--quiet>
+
+Only output errors.
+
+=item B<-f>, B<--force>
+
+Overwrite files without prompting for confirmation.
+
+=item B<-v>, B<--verbose>
+
+Print each file that is processed by the command.
+
+=back
+
+See the full documentation for details on which commands support which options
+and what they do.
+
+It does not matter if the options are written before or after the command.
+
+If C<--> is written anywhere on the command line, option parsing is stopped,
+so that files starting with a hyphen can still be specified.
+
+Note that C<-q> and C<-f> are in no way opposites and are, in fact, never
+supported on the same command.
+
+Note further that this is very inconsistent, like the rest of the program, but
+the author has made too many bad decisions to rectify that problem at the moment.
+
+=head1 COMMANDS
+
+=over 8
+
+=item B<addnew> [-q] [directory]
Walks through B<directory>, adding all new files to the checksum database.
B<directory> defaults to the current directory.
+C<-q> suppresses the printing of each file or directory as it is added.
+
=item B<checknew> [directory]
Walks through B<directory>, printing all files that aren't part of the checksum
@@ -1014,26 +1057,53 @@ database. B<directory> defaults to the current directory.
Prints all files in the checksum database that do not exist on the filesystem anymore.
B<directory> defaults to the current directory.
-=item B<rmold> [directory]
+=item B<rmold> [-q] [directory]
Removes all files found by B<checkold> from the checksum database; nothing is deleted from the filesystem. B<directory> defaults to the current directory.
-=item B<check> [directory]
+C<-q> suppresses the printing of each file as it is removed.
+
+=item B<check> [-q] [file/directory ...]
+
+Verifies the checksums of all files given, recursing through any directories. If no
+files or directories are given, the current directory is used.
+
+Note that the checksum database in the corresponding directory will be read again for
+every file given on the command line, even if 1000 files in the same directory are given.
+This problem does not occur when recursing through directories, so it is best to only
+give files directly when checking a few. This problem wouldn't be too difficult to
+fix, but, frankly, I'm too lazy, especially since I only added the feature to check
+files individually as a convenience when I want to quickly check a single file in a
+large directory.
+
+To explain why it is this way: The directory recursion is done using an iterator, which
+has the directories pushed onto its queue in the beginning. The iterator only returns
+directories, which are then checked all in one go, but this means that files given on
+the command line need to be handled specially.
-Recurses through B<directory>, checking all checksums in the database against the new
-checksums of the files on the filesystem. B<directory> defaults to the current directory.
+C<-q> suppresses the printing of all good checksums but still allows a message to
+be printed when a checksum failed.
-=item B<clean> [directory]
+=item B<clean> [-q] [directory]
Removes all lumia special files used to store the checksum database from B<directory>
recursively. B<directory> defaults to the current directory.
-=item B<extract> [source] destination
+Note that this recurses through the entire directory tree, not just the part that is
+actually linked together by the checksum database.
+
+C<-q> suppresses the printing of each file as it is deleted.
+
+=item B<extract> [-v] [source] destination
Recreates the entire directory structure from B<source> in B<destination>, but only
copies the special files used to store the checksum database. B<source> defaults to
the current directory.
+C<-v> prints each file as it is copied.
+
+Note that this overwrites files in the destination directory without confirmation.
+
=item B<mkdir> directory ...
Creates the given directories, initializing them with empty checksum database files.
@@ -1042,15 +1112,29 @@ Creates the given directories, initializing them with empty checksum database fi
Recalculates the checksums for the given files and replaces them in the database.
-Note: Directories are ignored.
+Note: Directories given as arguments are ignored.
+
+This is mainly meant to quickly "touch" a file after it was modified (e.g. a
+notes file that is occasionally updated).
+
+=item B<rm> [-f] file ...
+
+Removes the given files and directories recursively from the filesystem and
+checksum database. The following caveats apply:
-=item B<rm> file ...
+If any actual errors occur while deleting the file/directory (i.e. the system
+command C<rm> returns a non-zero exit value), the checksum or directory B<is
+left in the database>. If the system C<rm> does not return a non-zero exit value,
+but the file/directory still exists afterwards (e.g. there was a permission
+error and the user answered "n" when prompted), a warning message is printed,
+but the files B<are removed from the database> (if the database can be
+written to).
-Removes the given files and directories from the filesystem and checksum database.
+C<-f> is passed through to the system C<rm> command.
-=item B<cp> source target
+=item B<cp> [-vf] source target
-=item B<cp> source ... directory
+=item B<cp> [-vf] source ... directory
Copies the given source files, updating the checksum database in the process.
@@ -1059,9 +1143,25 @@ which is then copied to the target.
If the last argument is a directory, all source arguments are copied into it.
-=item B<mv> source target
+B<cp> will issue a warning and skip to the next argument if it is asked to
+merge a directory with an already existing directory. For instance, attempting
+to run C<cp dir1 dir2>, where C<dir2> already contains a directory named
+C<dir1>, will result in an error. This may change in the future, when the
+program is modified to recursively copy the files manually, instead of simply
+calling the system C<cp> on each of the arguments. If this were supported in
+the current version, none of the checksums inside that directory would be
+updated, so it wouldn't be very useful.
+
+C<-v> is passed through to the system C<cp> command.
+
+C<-f> silently overwrites files without prompting the user, much like the
+C<-f> option in the system C<cp> command. This is handled manually by the
+program, though, in order to actually determine what the user chose. See
+also the caveat mentioned above.
-=item B<mv> source ... directory
+=item B<mv> [-f] source target
+
+=item B<mv> [-f] source ... directory
Moves the given source files, updating the checksum database in the process.
@@ -1071,31 +1171,193 @@ argument, which is renamed to the target name.
If the last argument is an existing directory, all source arguments are moved
into it.
+B<mv> behaves the same as B<rm> with regard to checking whether the source
+file is still present after the operation, and with regard to other error handling.
+
+C<-f> is handled in the same manner as with B<cp>.
+
=back
-=head1 CAVEATS
+=head1 DESCRIPTION
-B<rm> automatically deletes the files recursively. For each of the arguments,
-the following caveats apply:
-If any actual errors occur while deleting the file/directory (i.e. the system
-command C<rm> returns a non-zero exit value), the checksum or directory B<is
-left in the database>. If the system C<rm> does not return a non-zero exit value,
-but the file/directory still exists afterwards (e.g. there was a permission
-error and the user answered "n" when prompted), a warning message is printed,
-but the files B<are removed from the database> (if the database can be
-written to).
+lumia.pl is meant for managing checksums of files in order to prevent bitrot.
+It does this by storing several special files in each directory to keep track
+of the checksums:
-B<mv> behaves the same as B<rm> with regards to checking if the source file
-is still present after the operation.
+=over 8
-B<cp> will issue a warning and skip to the next argument if it is asked to
-merge a directory with an already existing directory. For instance, attempting
-to run C<cp dir1 dir2>, where C<dir2> already contains a directory named
-C<dir1>, will result in an error. This may change in the future, when the
-program is modified to recursively copy the files manually, instead of simply
-calling the system C<cp> on each of the arguments. If this was supported in
-the current version, none of the checksums inside that directory would be
-updated, so it wouldn't be very useful.
+=item B<.lumidify_archive_cksums>
+
+Contains the checksums of all files in the directory.
+
+=item B<.lumidify_archive_dirs>
+
+Contains a list of all directories in the directory.
+
+=item B<.lumidify_archive_cksums.cksum>
+
+Contains the checksums of B<.lumidify_archive_cksums> and B<.lumidify_archive_dirs>
+in order to provide two-layer protection against bitrot.
+
+=item B<.lumidify_archive_ignore>
+
+Contains a list of files and directories that should be ignored by lumia.pl.
+Note that this is only read and never written to, unless the command B<clean>
+is used. It is, however, still copied over by the B<extract> command.
+
+=back
+
+When the documentation for the commands talks about the "checksum database",
+it simply means these files.
+
+All file/directory names are enclosed in quotes, with any backslashes or quotes
+inside the name escaped with another backslash. The names are allowed to have
+newlines in them.
+
+The list files only contain a list of filenames, with a newline between the
+closing quote of one name and the opening quote of the next one.
+
+The checksum files additionally contain the output of the checksum program
+used and a space before the starting quote of the filename.
+
+=head1 MOTIVATION
+
+There are already several programs that can be used to check for bitrot,
+as listed in L</"SEE ALSO">. However, all programs I tried either were
+much too complicated for my taste or just did everything behind my back.
+I wanted a simple tool that did exactly what I told it to and also allowed
+me to keep the old checksums when reorganizing files, in order to avoid
+regenerating the checksums from corrupt files. Since I couldn't find those
+features in any program I tried, I wrote my own.
+
+=head1 DESIGN DECISIONS
+
+It may strike some readers as a peculiar idea to save the checksum files in
+I<every single directory>, but this choice was made after much deliberation.
+The other option I could think of was to have one big database, but that
+would have made all commands much more difficult to implement and additionally
+necessitated opening the entire database for every operation. With individual
+files in each directory, operations like B<cp> become quite trivial (ignoring
+all the edge cases) since only the toplevel checksums need to be copied to
+the new destination, and any subdirectories already contain the checksums.
+
+This method is not without its drawbacks, however. The most glaring problem
+I have found is that there is no way to store the checksums of read-only
+directories or any special directories that cannot be littered with the
+checksum files because that would clash with other software. Despite these
+drawbacks, however, I decided to stick with it because it works for almost
+all cases and doesn't have any of the serious drawbacks that other options
+would have had.
+
+The names of the special files were chosen to be ".lumidify_archive*" not
+out of vanity, but mainly because I couldn't think of any regular files
+with those names, making them a good choice to avoid clashes.
+
+The name of the program, C<lumia.pl> (for "lumidify archive"), was similarly
+chosen because it did not clash with any programs installed on my system and
+thus allowed for easy tab-completion.
+
+=head1 HASH ALGORITHMS
+
+By default, the simple cksum algorithm is used to get the checksums. This
+is not very secure, but the main purpose of the program is to prevent
+bitrot, for which cksum should be sufficient, especially since it is much
+faster than other algorithms.
+
+There is currently no convenient way to change the algorithm other than
+changing the $CKSUM_CMD and $CKSUM_NUMFIELDS variables at the top of
+lumia.pl. $CKSUM_CMD must be the command that returns only the checksum
+when it is given a file, and $CKSUM_NUMFIELDS specifies the number of
+space-separated fields the checksum consists of. This has to be specified
+because cksum returns two numbers while sha256, for instance, only outputs
+one long checksum.
+
+This could be improved a lot, especially since there really isn't any
+reason why the file-reading function can't just take whatever comes before
+the first opening quote as the checksum, making $CKSUM_NUMFIELDS redundant.
+I'm too lazy to change that right now, though.
+
+=head1 USAGE SCENARIOS
+
+=over 8
+
+=item B<Security auditing>
+
+This program is B<NOT> designed to provide any security auditing, as should
+be clear from the fact that the checksums are stored right in the same
+directory as the files. See mtree(8) for that.
+
+If you want to, however, you could set $CKSUM_CMD to a secure hash (not cksum)
+and B<extract> the checksums to a separate directory, which you keep in a
+safe place. You could then use the regular C<cp> command to simply replace
+all the checksums with the ones from your backup, in case an attacker modified
+the checksum database in the directory with the actual files you're trying to
+protect. I don't know if there would be any point in doing that, though.
+
+=item B<Managing archives>
+
+This is the purpose I wrote the program for.
+
+You can simply initialize your archive directory with the B<addnew> command.
+Whenever you add new files, just run B<addnew> again. If you want to reorganize
+the archive, you can use the limited commands available.
+
+I usually just use rsync(1) to copy the entire archive directory over to other
+backup drives and then use the B<check> command again on the new drive.
+
+I also have checksums for the main data directory on my computer (except for
+things like git repositories, which I don't want littered with the database
+files). Here, I use the B<update> command for files that I edit more often
+and occasionally run B<check> on the entire directory.
+
+Since the database files are written in each directory, you can run the
+B<addnew> command in any subdirectory when you've added new files there.
+
+=back
+
+=head1 PERFORMANCE
+
+Due to the extensive use of iterators and the author's bad life decisions,
+some functions, such as B<addnew> and B<check>, run more slowly than they
+would if they were programmed more efficiently. Too bad.
+
+=head1 PORTABILITY
+
+This program was written on OpenBSD. It will probably work on most other
+reasonably POSIX-compliant systems, although I cannot guarantee anything.
+$CKSUM_CMD may need to be modified at the top of the file. The file
+operation commands are called directly with system(), so those need to
+be available.
+
+It will most certainly not work on Windows, but that shouldn't be a
+problem for anyone important.
+
+=head1 BUGS
+
+All system commands (unless I forgot some) are called with "--" before
+listing the actual files, so files beginning with hyphens should be
+supported. I have tested the commands with filenames starting with spaces
+and hyphens and also containing newlines, but there may very well be issues
+still. Please notify me if you find any filenames that do not work. Handling
+filenames properly is difficult.
+
+There are probably many other edge cases, especially in the B<mv>, B<cp>,
+and B<rm> commands. Please notify me if you find an issue.
+
+=head1 EXIT STATUS
+
+Always 0, unless the arguments given were invalid. We don't do errors around here.
+
+On a more serious note - I should probably change that at some point.
+For the time being, if you want to run B<check> in a script, you can test
+the output printed when the C<-q> option is used, since this won't output
+anything if there are no errors. Do note, though, that actual errors (file not
+found, etc.) are printed to STDERR, while incorrect checksums are printed
+to STDOUT.
+
+=head1 SEE ALSO
+
+par2(1), mtree(8), aide(1), bitrot(no man page)
=head1 LICENSE