--- /dev/null
+#!/usr/bin/perl
+
+# Copyright (C) 2020 Jacob Bachmeyer
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+repo-import-loose-files - Import loose files into revision control
+
+=head1 SYNOPSIS
+
+ repo-import-loose-files.pl [options] <root of file collection>...
+
+ Options:
+ -h --help print brief help message
+ -v --verbose print additional information while running
+ -n --dry-run print commands but do not execute
+ --author=AUTHOR specify author for generated commits
+ --repo-base=BASE repository base directory (if not tarball root)
+ --tar-listing=FILE load timestamps from a file listing
+
+=head1 OPTIONS
+
+=over
+
+=item B<--help> B<-h>
+
+Print brief help message and exit.
+
+=item B<--verbose> B<-v>
+
+Print additional information while running. Repeating this option further
+increases verbosity.
+
+=item B<--dry-run> B<-n>
+
+Print the commands that would be executed but do not actually execute them.
+
+=item B<--author=>I<AUTHOR>
+
+Specify notional author of the imported commits.
+
+=item B<--repo-base=>I<BASE>
+
+Relative file name from current directory to repository root directory. If
+not given, the repository is assumed to be in the current directory, and
+the root of the tarball is assumed to be the root of the repository.
+
+=item B<--tar-listing=>I<FILE>
+
+Read the provided file (as produced by the -vt option to GNU tar) instead
+of searching the directory tree. The tarball should have been unpacked in
+the directory given as the root of the file tree.
+
+=back
+
+=head1 DESCRIPTION
+
+This tool imports a collection of loose files into a Git repository,
+assuming that any subsets of the files whose names share a common prefix
+are individual files that have been tracked over time by accumulating
+editor backup files instead of using revision control software.
+
+Any suffix is acceptable, provided that the suffixed files are
+distinguishable from the primary files. The actual import ordering is
+determined by the files' timestamps.
+
+=head1 CAVEATS
+
+This program assumes that the "logical" files are the shortest prefixes
+that are both shared across multiple files and present as files under their
+own names.
+
+=cut
+
+use strict;
+use warnings;
+
+use File::Find;
+use File::Spec;
+use Date::Parse;
+use Date::Format;
+
+use Getopt::Long;
+use Pod::Usage;
+
+my %OPT;
+my $DryRun = 0;
+my $Verbose = 0;
+
+# Table mapping each command-line option to the variable that receives it.
+# --verbose is a counter and --tar-listing may be given several times.
+my @option_spec = (
+    'help|h'         => \$OPT{help},
+    'dry-run|n'      => \$DryRun,
+    'verbose|v+'     => \$Verbose,
+    'author=s'       => \$OPT{author},
+    'repo-base=s'    => \$OPT{repobase},
+    'tar-listing=s@' => \$OPT{listing},
+);
+
+# Unknown or malformed options produce the usage message.
+GetOptions(@option_spec) or pod2usage(2);
+pod2usage(1) if $OPT{help};
+
+# At least one root directory of a file collection must remain in @ARGV.
+if (!@ARGV) {
+    warn "No directory given.\n";
+    pod2usage(2);
+}
+
+# Each file record used in this program is a Perl hashref containing:
+# name relative name of file (as from tarball)
+# logical relative name of file (base file to be versioned)
+# size size of file in bytes
+# timestamp timestamp of file
+# link_to if set, file is symlink to this destination
+# source actual name of source file in filesystem
+
+# Run an external command, or only print it when in dry-run mode.
+#
+# Arguments: the program name followed by its arguments.  The list is passed
+# to exec directly, so no shell is involved.  The command runs in a forked
+# child which first changes to the --repo-base directory if one was given.
+#
+# Dies if the child process cannot be created.  A command that cannot be
+# started or that finishes with a non-zero wait status only produces a
+# warning, so the import carries on with the remaining revisions.
+sub cmd { # run a command if not in dry-run mode
+    my @command = @_;
+    # Quote arguments containing whitespace; this is for display only.
+    my @show = map { m/[[:space:]]/ ? qq["$_"] : $_ } @command;
+
+    if ($DryRun) {
+        print join(' ', @show),"\n";
+        return;
+    }
+
+    my $pid = fork;
+    # An undefined result means no child exists; treating it as "in child"
+    # would exec the command in place of this program.
+    die "fork: $!" unless defined $pid;
+
+    if ($pid) { # in parent
+        print join(' ', @show),"\n" if $Verbose;
+        waitpid $pid, 0;
+        warn "command failed (wait status $?): @show\n" if $?;
+        return;
+    }
+
+    # in child
+    if ($OPT{repobase} && !chdir $OPT{repobase}) {
+        warn "chdir $OPT{repobase}: $!\n";
+    } else {
+        # exec returns only if the program could not be started.
+        exec {$command[0]} @command
+            or warn "exec $command[0]: $!\n";
+    }
+
+    # The child must never return into the importer.  _exit leaves without
+    # flushing output buffers inherited from the parent.
+    require POSIX;
+    POSIX::_exit(127);
+}
+
+###
+### Revision import routines
+###
+
+# Import one file record as a new commit of its logical file.
+#
+# Argument: a file record hashref; the keys read here are timestamp, source,
+# logical and link_to.
+#
+# The source file is copied over the logical file, the result is staged and
+# then committed with the record's timestamp as the commit date.  The
+# --author option, if given, is passed on to git.  All commands go through
+# cmd, so dry-run mode only prints them.
+sub store_revision_in_git {
+    my $file = shift;
+
+    my $date = time2str('%Y-%m-%dT%H:%M:%S', $file->{timestamp});
+    my @commit_opts = ('--date='.$date);
+
+    push @commit_opts, '--author='.$OPT{author} if $OPT{author};
+
+    my $source = $file->{source};
+    my $target = $file->{logical};
+
+    # cmd runs the commands from inside the repository base directory, so
+    # both names are rewritten relative to it.
+    if ($OPT{repobase}) {
+        $source = File::Spec->abs2rel($source, $OPT{repobase});
+        $target = File::Spec->abs2rel($target, $OPT{repobase});
+    }
+
+    my $message = 'Import ';
+    $message .= $file->{link_to} ? 'symlink' : 'version';
+    $message .= ' as of '.time2str('%Y-%m-%d', $file->{timestamp});
+    $message .= ' for '.$target;
+    push @commit_opts, '-m', $message;
+
+    cmd(qw/cp -fp/, ($Verbose ? '-v' : ()), $source, $target);
+    # "git commit -- PATH" refuses paths that Git does not know yet, so the
+    # first revision of each logical file has to be staged explicitly.
+    cmd(qw/git add --/, $target);
+    cmd(qw/git commit/, @commit_opts, '--', $target);
+}
+
+###
+### File collection and processing
+###
+
+# All file records found under the directories given on the command line.
+my @LooseFiles = ();
+my %LooseFiles = (); # same objects as @LooseFiles, indexed by relative name
+# Set as soon as any entry found in the tree is a symlink.
+my $Have_Symlinks = 0;
+
+# Collect the files present in the import areas.
+print "Collecting files...\n" if $Verbose;
+foreach my $dir (@ARGV) {
+ # no_chdir: File::Find leaves the working directory alone and passes the
+ # path of each entry, including $dir, in $_.
+ find({no_chdir => 1,
+ wanted => sub {
+ print "Found $_\n" if $Verbose > 2;
+ # -l without an argument tests $_; the later tests on "_" reuse the
+ # stat buffer filled by this test instead of asking the system again.
+ $Have_Symlinks = 1 if -l;
+ # Name relative to the collection root; this is the index into
+ # %LooseFiles.
+ my $keyname = File::Spec->abs2rel($_, $dir);
+ # Directories get no record; everything else does.
+ unless (-d _) {
+ # stat fields 9 and 7 are the modification time and size in bytes.
+ my $item = {name => $keyname,
+ timestamp => (stat(_))[9],
+ size => (stat(_))[7],
+ -l _ ? (link_to => readlink) : (),
+ source => $_};
+ push @LooseFiles, $item;
+ # The same relative name can turn up again when several roots are given.
+ die "duplicate relative name $keyname"
+ if exists $LooseFiles{$keyname};
+ $LooseFiles{$keyname} = $item;
+ }
+ },
+ }, $dir);
+}
+
+# Refuse to continue with symlinks unless a tar listing supplies their
+# timestamps.
+die "Symlink timestamps are not reliably restored on the filesystem;\n"
+ ." use the --tar-listing option to fix them.\n"
+ if $Have_Symlinks && !$OPT{listing};
+
+# Correct timestamps by reading a tarball listing if given.
+#
+# Every listing line must carry a type character and nine more mode
+# characters, an owner field, a size, a YYYY-mm-dd HH:MM:SS timestamp and a
+# name.  Regular files are checked against the collected size, symlinks
+# against the collected link target, and directories are skipped.  Any entry
+# that cannot be matched to a collected file aborts the run.
+foreach my $listing (@{$OPT{listing} || []}) {
+    print "Fixing timestamps from $listing...\n" if $Verbose;
+    my $ts_re = qr/\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}/;
+    open my $listing_fh, '<', $listing
+        or die "read file listing: open $listing: $!";
+
+    while (my $line = <$listing_fh>) {
+        chomp $line;
+        # The owner field is matched as any run of non-blanks, so user and
+        # group names containing "-", "_" or "." are accepted as well.
+        die "read file listing $listing: failed parsing line:\n $line" unless
+            $line =~ m<(.).{9}\s+\S+\s+(\d+)\s+($ts_re)\s+(.*)$>;
+        # $1 -> $type -- type: - => file, d => directory, l => symlink
+        # $2 -> $size -- size in bytes
+        # $3 -> $ts -- timestamp in YYYY-mm-dd HH:MM:SS
+        # $4 -> $name -- relative name
+        my ($type, $size, $ts, $name) = ($1, $2, $3, $4);
+
+        if ($type eq '-') { # process file
+            my $file = $LooseFiles{$name}
+                or die "file $name from listing $listing not found in collection";
+            die "file size mismatch between file and listing"
+                unless $file->{size} == $size;
+            my $timestamp = str2time($ts);
+            die "read file listing $listing: bad timestamp $ts"
+                unless defined $timestamp;
+            unless ($file->{timestamp} == $timestamp) {
+                # Keep the old value so the message shows old -> new.
+                my $old = $file->{timestamp};
+                $file->{timestamp} = $timestamp;
+                print 'Fixed file timestamp ',$old,
+                    ' -> ',$timestamp,' on ',$name,"\n";
+            }
+        } elsif ($type eq 'l') { # process symlink
+            my ($lname, $target) = ($name =~ m/^(.*)\s+->\s+(.*)$/)
+                or die "read file listing $listing: failed parsing symlink:\n $line";
+            my $file = $LooseFiles{$lname}
+                or die "symlink $lname from listing $listing not found in collection";
+            # tar reports symlink sizes as 0, but check target
+            die "link target mismatch for $lname"
+                unless defined $file->{link_to} && $file->{link_to} eq $target;
+            my $timestamp = str2time($ts);
+            die "read file listing $listing: bad timestamp $ts"
+                unless defined $timestamp;
+            unless ($file->{timestamp} == $timestamp) {
+                # Keep the old value so the message shows old -> new.
+                my $old = $file->{timestamp};
+                $file->{timestamp} = $timestamp;
+                print 'Fixed symlink timestamp ',$old,
+                    ' -> ',$timestamp,"\n on $name\n";
+            }
+        } elsif ($type eq 'd') { # skip directory
+            next;
+        } else { die "item type $type unknown in $line" }
+    }
+
+    close $listing_fh or die "read file listing: close $listing: $!";
+}
+
+# Sort the file list by name; this places each "primary" file ahead of its
+# old versions.
+@LooseFiles = sort { $a->{name} cmp $b->{name} } @LooseFiles;
+
+# Assign logical names to the collected files.
+#
+# After the sort, every file whose name starts with the name of a primary
+# file follows that primary file directly, so one pass is enough: a file
+# either continues the current group or starts a new one.  The prefix is
+# compared as a literal string, so characters such as "." or "+" in file
+# names have no special meaning.
+{
+    print "Collecting files into groups...\n" if $Verbose;
+
+    my $logical; # name of the primary file of the current group
+    foreach my $file (@LooseFiles) {
+        unless (defined $logical && index($file->{name}, $logical) == 0) {
+            # Not a continuation of the current group; this file is the
+            # primary file of a new one.
+            $logical = $file->{name};
+            print "Collecting revisions for $logical\n" if $Verbose > 1;
+        }
+        print " - $file->{name}\n" if $Verbose > 1;
+        $file->{logical} = $logical;
+    }
+}
+
+# Put the records into chronological order, oldest first; this is the order
+# in which the revisions are committed.
+@LooseFiles = sort { $a->{timestamp} <=> $b->{timestamp} } @LooseFiles;
+
+# Commit every record in turn.
+foreach my $record (@LooseFiles) {
+    store_revision_in_git($record);
+}
+
+__END__