lumia

Archive checksum manager
git clone git://lumidify.org/git/lumia.git
Log | Files | Refs

commit 4fbf400a3cf553ee8119f3d27072c235657102c0
parent f35b80dd2c875acfa769b7f1e69346c43e8cdf27
Author: lumidify <nobody@lumidify.org>
Date:   Tue, 24 Mar 2020 15:24:16 +0100

Change option parsing; add more documentation

Diffstat:
D TODO | 3 ---
M lumia.pl | 394 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
2 files changed, 328 insertions(+), 69 deletions(-)

diff --git a/TODO b/TODO @@ -1,3 +0,0 @@ -ALLOW FORCE ON MV AND CP (don't prompt for overwrite) -Allow to run command on multiple files but keep checksum (e.g. convmv) -update command - if file was edited diff --git a/lumia.pl b/lumia.pl @@ -1,13 +1,10 @@ #!/usr/bin/env perl -# FIXME: some way to avoid writing .lumidify* in dirs but still index them? e.g. Code/CMSG -# FIXME: cksum don't create malformed line if permission denied -# FIXME: ignore all except for a certain file/folder -# FIXME: store modified date and checksum filed with changed date -# FIXME: allow different hash types -# FIXME: don't write anything if cksum fails (will create malformed line) -# FIXME: add option to just check dir structure or maybe check if everything exists -# FIXME: add option to compare cksums of two dirs +# TODO: some way to avoid writing .lumidify* in dirs but still index them? +# TODO: store modified date and checksum filed with changed date +# TODO: add option to just check dir structure or maybe check if everything exists +# TODO: add option to compare cksums of two dirs +# TODO: exit status! 
use strict; use warnings; @@ -16,7 +13,7 @@ use File::Basename qw(basename dirname); use File::Path qw(remove_tree); use String::ShellQuote; use Pod::Usage; -use Getopt::Std; +use Getopt::Long; # the file used to store checksums for files my $CKSUM_FILE = ".lumidify_archive_cksums"; @@ -27,7 +24,13 @@ my $IGNORE_FILE = ".lumidify_archive_ignore"; # the file containing checksums of $CKSUM_FILE and $DIR_FILE my $DOUBLE_CKSUM_FILE = ".lumidify_archive_cksums.cksum"; +# uncomment this instead of the lines below to use +# sha256 instead of cksum as the hash algorithm +#my $CKSUM_CMD = 'sha256 -q'; +#my $CKSUM_NUMFIELDS = 1; my $CKSUM_CMD = 'cksum -q'; +my $CKSUM_NUMFIELDS = 2; + my %SPECIAL_FILES = ( $CKSUM_FILE => 1, $DIR_FILE => 1, @@ -105,13 +108,13 @@ sub make_lumia_iter { # remove all special lumia files from the given directory sub clean_files { - my $dir = shift; + my ($dir, $args) = @_; my $iter = make_file_iter_basic sub {exists $SPECIAL_FILES{basename $_[0]};}, $dir; while (my $file = $iter->()) { if (!unlink $file) { warn "WARNING: Unable to remove file \"$file\"!\n"; } else { - print "Deleted \"$file\"\n"; + print "Deleted \"$file\"\n" if !$args->{"q"}; } } } @@ -174,13 +177,13 @@ sub read_cksum_file { my ($file, $cksums) = @_; return read_file $file, $cksums, sub { my $line = shift; - my @fields = split(/ /, $line, 3); - if ($#fields != 2) { + my @fields = split(/ /, $line, $CKSUM_NUMFIELDS+1); + if (@fields != $CKSUM_NUMFIELDS+1) { warn "WARNING: Malformed line \"$line\" in file \"$file\"\n"; return; } - my $cur_cksum = join(" ", @fields[0,1]); - my $cur_str = $fields[2]; + my $cur_cksum = join(" ", @fields[0..$CKSUM_NUMFIELDS-1]); + my $cur_str = $fields[$CKSUM_NUMFIELDS]; return ($cur_cksum, $cur_str); }; } @@ -343,7 +346,7 @@ sub check_new_files { # add all new files in $top_dir to the checksum files sub check_add_new_files { - my $top_dir = shift; + my ($top_dir, $args) = @_; my $changed_dirs = 0; my $changed_files = 0; check_new_files $top_dir, sub { 
@@ -372,7 +375,7 @@ sub check_add_new_files { close $fh; $changed_files = 1; } - print "Added \"$fullpath\"\n"; + print "Added \"$fullpath\"\n" if !$args->{"q"}; }, sub { if (-f "$_[0]/$DOUBLE_CKSUM_FILE") { if (!check_cksums $_[0], $DOUBLE_CKSUM_FILE, 1) { @@ -455,6 +458,8 @@ sub write_cksums { print $dirs_fh '"' . escape_filename($key) . '"' . "\n"; } } + close $files_fh if defined $files_fh; + close $dirs_fh if defined $dirs_fh; if (@special_files) { write_special_cksums $dir, @special_files; } @@ -480,7 +485,7 @@ sub check_old_files { # clean up the lumia checksum files, removing any files that aren't present # on the filesystem anymore sub remove_old_files { - my $top_dir = shift; + my ($top_dir, $args) = @_; my $iter = make_lumia_iter $top_dir; while (my $dir = $iter->()) { if (!-e $dir) { @@ -490,7 +495,7 @@ sub remove_old_files { if (exists $lumia_dirs->{$child}) { delete $lumia_dirs->{$child}; write_file "$parent/$DIR_FILE", $lumia_dirs; - print "Removed \"$dir\" from \"$parent/$DIR_FILE\"\n"; + print "Removed \"$dir\" from \"$parent/$DIR_FILE\"\n" if !$args->{"q"}; write_special_cksums $parent, $DIR_FILE; } } else { @@ -499,7 +504,7 @@ sub remove_old_files { foreach my $file (keys %$cksums) { if (!-e "$dir/$file") { delete $cksums->{$file}; - print "Removed \"$dir/$file\" from \"$dir/$CKSUM_FILE\"\n"; + print "Removed \"$dir/$file\" from \"$dir/$CKSUM_FILE\"\n" if !$args->{"q"}; $found = 1; } } @@ -552,7 +557,6 @@ sub prompt_overwrite { return 0; } -# FIXME: handle different cases like move_files # copies the $src files to $dst and updates the checksums in $dst # $src: list of source paths # $dst: destination directory or file (in latter case only one src is allowed) @@ -761,7 +765,7 @@ sub move_files { # remove a file or directory from the filesystem sub remove_file_dir { my ($path, $args) = @_; - my $options = $args->{"f"} ? "-rf" : "-f"; + my $options = $args->{"f"} ? 
"-rf" : "-r"; if (system("rm", $options, "--", $path)) { return 1; } @@ -851,8 +855,9 @@ sub make_dirs { # extract all special lumia files from $src_dir to $dst_dir, recreating the # entire directory structure in the process sub extract { - my ($src_dir, $dst_dir) = @_; + my ($src_dir, $dst_dir, $args) = @_; my $iter = make_lumia_iter $src_dir; + my $options = $args->{"v"} ? "-av" : "-a"; while (my $dir = $iter->()) { my $final_dir = abs2rel $dir, $src_dir; my $fulldir = catfile $dst_dir, $final_dir; @@ -860,7 +865,7 @@ sub extract { foreach my $file (keys %SPECIAL_FILES) { my $filepath = catfile $dir, $file; if (-e $filepath) { - system("cp", "-aiv", "--", $filepath, catfile($fulldir, $file)); + system("cp", $options, "--", $filepath, catfile($fulldir, $file)); } } } @@ -896,9 +901,11 @@ sub update { } my %args; -getopts("fqh", \%args); +Getopt::Long::Configure("bundling"); +GetOptions(\%args, "f|force", "q|quiet", "v|verbose", "h|help"); -pod2usage(-verbose => 1) if @ARGV < 1 || $args{"h"}; +pod2usage(-exitval => 0, -verbose => 2) if $args{"h"}; +pod2usage(-exitval => 1, -verbose => 1) if @ARGV < 1; my $cmd = shift; @@ -916,7 +923,7 @@ if ($cmd eq "mv") { if (@ARGV >= 1) { $dir = $ARGV[0]; } - check_add_new_files $dir; + check_add_new_files $dir, \%args; } elsif ($cmd eq "checknew") { my $dir = "."; if (@ARGV >= 1) { @@ -934,7 +941,7 @@ if ($cmd eq "mv") { if (@ARGV >= 1) { $dir = $ARGV[0]; } - remove_old_files $dir; + remove_old_files $dir, \%args; } elsif ($cmd eq "check") { if (@ARGV < 1) { check_files \%args, "."; @@ -946,7 +953,7 @@ if ($cmd eq "mv") { if (@ARGV >= 1) { $dir = $ARGV[0]; } - clean_files $dir; + clean_files $dir, \%args; } elsif ($cmd eq "extract") { my $src_dir = "."; my $dst_dir; @@ -981,8 +988,6 @@ if ($cmd eq "mv") { die "update requires at least one argument\n"; } update @ARGV; -} elsif ($cmd eq "help") { - pod2usage(-exitval => 0, -verbose => 2); } __END__ @@ -993,17 +998,55 @@ lumia.pl - Manage checksums on a filesystem =head1 SYNOPSIS 
-B<lumia.pl> [-qfh] command arguments +B<lumia.pl> command [-hqfv] arguments =head1 OPTIONS =over 8 -=item B<addnew> [directory] +=item B<-h>, B<--help> + +Show the full documentation. + +=item B<-q>, B<--quiet> + +Only output errors. + +=item B<-f>, B<--force> + +Overwrite files without prompting for confirmation. + +=item B<-v>, B<--verbose> + +Print each file that is processed by the command. + +=back + +See the full documentation for details on which commands support which options +and what they do. + +It does not matter if the options are written before or after the command. + +If C<--> is written anywhere on the command line, option parsing is stopped, +so that files starting with a hyphen can still be specified. + +Note that C<-q> and C<-f> are in no way opposites and are, in fact, never +supported on the same command. + +Note further that this is very inconsistent, like the rest of the program, but +the author has made too many bad decisions to rectify that problem at the moment. + +=head1 COMMANDS + +=over 8 + +=item B<addnew> [-q] [directory] Walks through B<directory>, adding all new files to the checksum database. B<directory> defaults to the current directory. +C<-q> suppresses the printing of each file or directory as it is added. + =item B<checknew> [directory] Walks through B<directory>, printing all files that aren't part of the checksum @@ -1014,26 +1057,53 @@ database. B<directory> defaults to the current directory. Prints all files in the checksum database that do not exist on the filesystem anymore. B<directory> defaults to the current directory. -=item B<rmold> [directory] +=item B<rmold> [-q] [directory] Deletes all files found by B<checkold>. B<directory> defaults to the current directory. -=item B<check> [directory] +C<-q> suppresses the printing of each file as it is removed. + +=item B<check> [-q] file/directory ... + +Verifies the checksums of all files given, recursing through any directories. 
If no +files or directories are given, the current directory is used. + +Note that the checksum database in the corresponding directory will be read again for +every file given on the command line, even if 1000 files in the same directory are given. +This problem does not occur when recursing through directories, so it is best to only +give files directly when checking a few. This problem wouldn't be too difficult to +fix, but, frankly, I'm too lazy, especially since I only added the feature to check +files individually as a convenience when I want to quickly check a single file in a +large directory. + +To explain why it is this way: The directory recursion is done using an iterator, which +has the directories pushed onto its queue in the beginning. The iterator only returns +directories, which are then checked all in one go, but this means that files given on +the command line need to be handled specially. -Recurses through B<directory>, checking all checksums in the database against the new -checksums of the files on the filesystem. B<directory> defaults to the current directory. +C<-q> suppresses the printing of all good checksums but still allows a message to +be printed when a checksum failed. -=item B<clean> [directory] +=item B<clean> [-q] [directory] Removes all lumia special files used to store the checksum database from B<directory> recursively. B<directory> defaults to the current directory. -=item B<extract> [source] destination +Note that this recurses through the entire directory tree, not just the part that is +actually linked together by the checksum database. + +C<-q> suppresses the printing of each file as it is deleted. + +=item B<extract> [-v] [source] destination Recreates the entire directory structure from B<source> in B<destination>, but only copies the special files used to store the checksum database. B<source> defaults to the current directory. +C<-v> prints each file as it is copied. 
+ +Note that this overwrites files in the destination directory without confirmation. + =item B<mkdir> directory ... Creates the given directories, initializing them with empty checksum database files. @@ -1042,15 +1112,29 @@ Creates the given directories, initializing them with empty checksum database fi Recalculates the checksums for the given files and replaces them in the database. -Note: Directories are ignored. +Note: Directories given as arguments are ignored. + +This is mainly meant to quickly "touch" a file after it was modified (e.g. a +notes file that is occasionally updated). + +=item B<rm> [-f] file ... + +Removes the given files and directories recursively from the filesystem and +checksum database. The following caveats apply: -=item B<rm> file ... +If any actual errors occur while deleting the file/directory (i.e. the system +command C<rm> returns a non-zero exit value), the checksum or directory B<is +left in the database>. If the system C<rm> does not return a non-zero exit value, +but the file/directory still exists afterwards (e.g. there was a permission +error and the user answered "n" when prompted), a warning message is printed, +but the files B<are removed from the database> (if the database can be +written to). -Removes the given files and directories from the filesystem and checksum database. +C<-f> is passed through to the system C<rm> command. -=item B<cp> source target +=item B<cp> [-vf] source target -=item B<cp> source ... directory +=item B<cp> [-vf] source ... directory Copies the given source files, updating the checksum database in the process. @@ -1059,9 +1143,25 @@ which is then copied to the target. If the last argument is a directory, all source arguments are copied into it. -=item B<mv> source target +B<cp> will issue a warning and skip to the next argument if it is asked to +merge a directory with an already existing directory. 
For instance, attempting +to run C<cp dir1 dir2>, where C<dir2> already contains a directory named +C<dir1>, will result in an error. This may change in the future, when the +program is modified to recursively copy the files manually, instead of simply +calling the system C<cp> on each of the arguments. If this was supported in +the current version, none of the checksums inside that directory would be +updated, so it wouldn't be very useful. + +C<-v> is passed through to the system C<cp> command. + +C<-f> silently overwrites files without prompting the user, much like the +C<-f> option in the system C<cp> command. This is handled manually by the +program, though, in order to actually determine what the user chose. See +also the caveat mentioned above. -=item B<mv> source ... directory +=item B<mv> [-f] source target + +=item B<mv> [-f] source ... directory Moves the given source files, updating the checksum database in the process. @@ -1071,31 +1171,193 @@ argument, which is renamed to the target name. If the last argument is an existing directory, all source arguments are moved into it. +B<mv> behaves the same as B<rm> with regards to checking if the source file +is still present after the operation and other error handling. + +C<-f> is handled in the same manner as with B<cp>. + =back -=head1 CAVEATS +=head1 DESCRIPTION -B<rm> automatically deletes the files recursively. For each of the arguments, -the following caveats apply: -If any actual errors occur while deleting the file/directory (i.e. the system -command C<rm> returns a non-zero exit value), the checksum or directory B<is -left in the database>. If the system C<rm> does not return a non-zero exit value, -but the file/directory still exists afterwards (e.g. there was a permission -error and the user answered "n" when prompted), a warning message is printed, -but the files B<are removed from the database> (if the database can be -written to). 
+lumia.pl is meant for managing checksums of files in order to prevent bitrot. +It does this by storing several special files in each directory to keep track +of the checksums: -B<mv> behaves the same as B<rm> with regards to checking if the source file -is still present after the operation. +=over 8 -B<cp> will issue a warning and skip to the next argument if it is asked to -merge a directory with an already existing directory. For instance, attempting -to run C<cp dir1 dir2>, where C<dir2> already contains a directory named -C<dir1>, will result in an error. This may change in the future, when the -program is modified to recursively copy the files manually, instead of simply -calling the system C<cp> on each of the arguments. If this was supported in -the current version, none of the checksums inside that directory would be -updated, so it wouldn't be very useful. +=item B<.lumidify_archive_cksums> + +Contains the checksums of all files in the directory. + +=item B<.lumidify_archive_dirs> + +Contains a list of all directories in the directory. + +=item B<.lumidify_archive_cksums.cksum> + +Contains the checksums of B<.lumidify_archive_cksums> and B<.lumidify_archive_dirs> +in order to provide two-layer protection against bitrot. + +=item B<.lumidify_archive_ignore> + +Contains a list of files and directories that should be ignored by lumia.pl. +Note that this is only read and never written to, unless the command B<clean> +is used. It is, however, still copied over by the B<extract> command. + +=back + +When the documentation for the commands talks about the "checksum database", +it simply means these files. + +All file/directory names are enclosed in quotes, with any backslashes or quotes +inside the name escaped with another backslash. The names are allowed to have +newlines in them. + +The list files only contain a list of filenames, with a newline between the +closing quote of one name and the opening quote of the next one. 
+ +The checksum files additionally contain the output of the checksum program +used and a space before the starting quote of the filename. + +=head1 MOTIVATION + +There are already several programs that can be used to check for bitrot, +as listed in L</"SEE ALSO">. However, all programs I tried either were +much too complicated for my taste or just did everything behind my back. +I wanted a simple tool that did exactly what I told it to and also allowed +me to keep the old checksums when reorganizing files, in order to avoid +regenerating the checksums from corrupt files. Since I couldn't find those +features in any program I tried, I wrote my own. + +=head1 DESIGN DECISIONS + +It may strike some readers as a peculiar idea to save the checksum files in +I<every single directory>, but this choice was made after much deliberation. +The other option I could think of was to have one big database, but that +would have made all commands much more difficult to implement and additionally +necessitated opening the entire database for every operation. With individual +files in each directory, operations like B<cp> become quite trivial (ignoring +all the edge cases) since only the toplevel checksums need to be copied to +the new destination, and any subdirectories already contain the checksums. + +This method is not without its drawbacks, however. The most glaring problem +I have found is that there is no way to store the checksums of read-only +directories or any special directories that cannot be littered with the +checksum files because that would clash with other software. Despite these +drawbacks, however, I decided to stick with it because it works for almost +all cases and doesn't have any of the serious drawbacks that other options +would have had. + +The names of the special files were chosen to be ".lumidify_archive*" not +out of vanity, but mainly because I couldn't think of any regular files +with those names, making them a good choice to avoid clashes. 
+ +The name of the program, C<lumia.pl> (for "lumidify archive"), was similarly +chosen because it did not clash with any programs installed on my system and +thus allowed for easy tab-completion. + +=head1 HASH ALGORITHMS + +By default, the simple cksum algorithm is used to get the checksums. This +is not very secure, but the main purpose of the program is to prevent +bitrot, for which cksum should be sufficient, especially since it is much +faster than other algorithms. + +There is currently no convenient way to change the algorithm other than +changing the $CKSUM_CMD and $CKSUM_NUMFIELDS variables at the top of +lumia.pl. $CKSUM_CMD must be the command that returns only the checksum +when it is given a file, and $CKSUM_NUMFIELDS specifies the number of +space-separated fields the checksum consists of. This has to be specified +because cksum returns two numbers while sha256, for instance, only outputs +one long checksum. + +This could be improved a lot, especially since there really isn't any +reason why the file-reading function can't just take whatever comes before +the first opening quote as the checksum, making $CKSUM_NUMFIELDS redundant. +I'm too lazy to change that right now, though. + +=head1 USAGE SCENARIOS + +=over 8 + +=item B<Security auditing> + +This program is B<NOT> designed to provide any security auditing, as should +be clear from the fact that the checksums are stored right in the same +directory as the files. See mtree(8) for that. + +If you want to, however, you could set $CKSUM_CMD to a secure hash (not cksum) +and B<extract> the checksums to a separate directory, which you keep in a +safe place. You could then use the regular C<cp> command to simply replace +all the checksums with the ones from your backup, in case an attacker modified +the checksum database in the directory with the actual files you're trying to +protect. I don't know if there would be any point in doing that, though. 
+ +=item B<Managing archives> + +This is the purpose I wrote the program for. + +You can simply initialize your archive directory with the B<addnew> command. +Whenever you add new files, just run B<addnew> again. If you want to reorganize +the archive, you can use the limited commands available. + +I usually just use rsync(1) to copy the entire archive directory over to other +backup drives and then use the B<check> command again on the new drive. + +I also have checksums for the main data directory on my computer (except for +things like git repositories, which I don't want littered with the database +files). Here, I use the B<update> command for files that I edit more often +and occasionally run B<check> on the entire directory. + +Since the database files are written in each directory, you can run the +B<addnew> command in any subdirectory when you've added new files there. + +=back + +=head1 PERFORMANCE + +Due to the extensive use of iterators and the author's bad life decisions, +some functions, such as B<addnew> and B<check>, run more slowly than they +would if they were programmed more efficiently. Too bad. + +=head1 PORTABILITY + +This program was written on OpenBSD. It will probably work on most other +reasonably POSIX-Compliant systems, although I cannot guarantee anything. +$CKSUM_CMD may need to be modified at the top of the file. The file +operation commands are called directly with system(), so those need to +be available. + +It will most certainly not work on Windows, but that shouldn't be a +problem for anyone important. + +=head1 BUGS + +All system commands (unless I forgot some) are called with "--" before +listing the actual files, so files beginning with hyphens should be +supported. I have tested the commands with filenames starting with spaces +and hyphens and also containing newlines, but there may very well be issues +still. Please notify me if you find any filenames that do not work. Handling +filenames properly is difficult. 
+ +There are probably many other edge cases, especially in the B<mv>, B<cp>, +and B<rm> commands. Please notify me if you find an issue. + +=head1 EXIT STATUS + +Always 0, unless the arguments given were invalid. We don't do errors around here. + +On a more serious note - I should probably change that at some point. +For the time being, if you want to run B<check> in a script, you can test +the output printed when the C<-q> option is used, since this won't output +anything if there are no errors. Do note, though, that actual errors (file not +found, etc.) are printed to STDERR, while incorrect checksums are printed +to STDOUT. + +=head1 SEE ALSO + +par2(1), mtree(8), aide(1), bitrot(no man page) =head1 LICENSE