From b7c0ad145f3601aed431c7f262ce39deeae3a535 Mon Sep 17 00:00:00 2001 From: Jacob Bachmeyer Date: Thu, 30 Jul 2020 21:43:52 -0500 Subject: [PATCH] Mark imported files This commit exists as a place to hang the tag for the result of the import work and to carry a copy of the tools used for future reference. The import was performed using a tarball provided by Ian Kelling and the files therein. Most files imported had only one latest version, but the upload-ftp-*.pl tools had numerous versions recorded with various suffixes denoting them as old backup copies. This import replaced a previous import effort that seems to have gotten the history order wrong. The two histories have almost the same set of blobs; the differences were due to slightly different files in the tarball used as the source of this import and the sources used for the previous import and a deliberate revision to the maintainers.bypkg.example file to indicate that it contains only public information and to change an email address as preferred by that person. --- git-check-import-branch.pl | 198 ++++++++++++++++++++++++++ repo-import-loose-files.pl | 282 +++++++++++++++++++++++++++++++++++++ 2 files changed, 480 insertions(+) create mode 100755 git-check-import-branch.pl create mode 100755 repo-import-loose-files.pl diff --git a/git-check-import-branch.pl b/git-check-import-branch.pl new file mode 100755 index 0000000..ba4d947 --- /dev/null +++ b/git-check-import-branch.pl @@ -0,0 +1,198 @@ +#!/usr/bin/perl + +# Copyright (C) 2020 Jacob Bachmeyer +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+git-check-import-branch.pl - Verify that a reimport loaded the same blobs
+
+=head1 SYNOPSIS
+
+  git-check-import-branch.pl [options] <branch-A> <branch-B>
+
+  Options:
+    -h --help      print brief help message
+    -v --verbose   print additional information while running
+
+=head1 OPTIONS
+
+=over
+
+=item B<--help> B<-h>
+
+Print brief help message and exit.
+
+=item B<--verbose> B<-v>
+
+Print additional information while running.  Repeating this option further
+increases verbosity.
+
+=back
+
+=head1 DESCRIPTION
+
+This tool searches the histories of two branches in a Git repository and
+reports any differences in the set of blobs referenced between them.
+
+=cut
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+use Pod::Usage;
+
+my %OPT = ();
+my $Verbose = 0;
+
+GetOptions('help|h' => \$OPT{help},
+           'verbose|v+' => \$Verbose,
+    ) or pod2usage(2);
+pod2usage(1) if $OPT{help};
+
+# Exactly two revisions (the two history tips to compare) are required.
+pod2usage(2) unless @ARGV == 2;
+
+###
+### Git reader routines
+###
+
+# read a commit; return hashref with keys:
+#  id       Git commit id (as printed by "git rev-parse")
+#  tree     tree object id for this commit
+#  parents  arrayref of parent commit ids (empty for a root commit)
+# Dies if the revision cannot be resolved, if git reports failure, or if
+# the commit header names no tree or more than one tree.
+sub read_commit {
+    my $commit_id = shift;
+
+    # List-form pipe open: the revision name comes from the command line
+    # and must never be interpreted by a shell.
+    open my $git_rev, '-|', qw/git rev-parse/, $commit_id
+        or die "git rev-parse $commit_id: $!";
+    my ($id) = <$git_rev>;      # first output line; the rest is drained
+    close $git_rev;             # sets $? to the exit status of git
+    die "failed to find commit id $commit_id"
+        unless $? == 0 && defined $id;
+    chomp $id;
+
+    my $commit = { id => $id, parents => [] };
+
+    open my $git_cat, '-|', qw/git cat-file commit/, $commit_id
+        or die "git cat-file commit $commit_id: $!";
+    print " read commit $commit->{id}\n" if $Verbose > 1;
+    my $in_header = 1;
+    while (my $line = <$git_cat>) {
+        # Keep reading after the header instead of leaving the loop, so
+        # that git is never killed by SIGPIPE and the status checked at
+        # close below stays meaningful.
+        next unless $in_header;
+        chomp $line;
+        if ($line eq '') {      # blank line ends the header
+            $in_header = 0;
+        } elsif ($line =~ m/^parent\s+([[:xdigit:]]{40})$/) {
+            push @{$commit->{parents}}, $1;
+        } elsif ($line =~ m/^tree\s+([[:xdigit:]]{40})$/) {
+            die "multiple trees in commit $commit_id" if $commit->{tree};
+            $commit->{tree} = $1;
+        }
+    }
+    close $git_cat
+        or die "git cat-file commit $commit_id failed (wait status $?)";
+    die "no tree found in commit $commit_id" unless $commit->{tree};
+
+    return $commit;
+}
+
+# read a tree; return arrayref of blob hex SHA1 values and arrayref of
+# subtree ids as hex SHA1 values
+# Entries of type "commit" (submodule links) are skipped: they name a
+# commit, not a blob or tree to be listed here.
+sub read_tree {
+    my $tree_id = shift;
+
+    my @blobs = ();
+    my @trees = ();
+
+    open my $git_cat, '-|', qw/git cat-file -p/, $tree_id
+        or die "git cat-file -p $tree_id: $!";
+    print " read tree $tree_id\n" if $Verbose > 1;
+    while (my $line = <$git_cat>) {
+        chomp $line;
+        # <6-digit octal mode> <object type> <object id> <name>
+        $line =~ m/^[0-7]{6}\s+(blob|tree|commit)\s+([[:xdigit:]]{40})\s+.*$/
+            or die "failed to parse tree listing line: $line";
+        my ($obtype, $obid) = ($1, $2);
+        print " $line\n" if $Verbose > 2;
+        if ($obtype eq 'blob') {
+            push @blobs, $obid;
+        } elsif ($obtype eq 'tree') {
+            push @trees, $obid;
+        }
+    }
+    # The listing was read to end-of-file, so a false return here means
+    # that git itself reported failure.
+    close $git_cat
+        or die "git cat-file -p $tree_id failed (wait status $?)";
+
+    return \@blobs, \@trees;
+}
+
+# read a tree recursively; collecting all blob hex SHA1 values
+# The optional second argument is a hashref of tree ids that have already
+# been read; such trees are not read again and every tree read here is
+# recorded in it.  Without it, every tree below $top_id is read once and
+# the complete blob list for $top_id is returned.
+sub read_subtree {
+    my $top_id = shift;
+    my $seen_trees = shift || {};
+
+    my %blobs = ();
+
+    return () if $seen_trees->{$top_id}++;
+
+    my @queue = ($top_id);
+    while (defined(my $tree_id = shift @queue)) {
+        my ($blobs, $trees) = read_tree($tree_id);
+        $blobs{$_}++ for @$blobs;
+        # Mark subtrees when they are queued so that no tree is ever
+        # queued twice.
+        push @queue, grep { !$seen_trees->{$_}++ } @$trees;
+    }
+
+    return keys %blobs;
+}
+
+# read history; return sorted list of blob hex SHA1 values
+# Walks every commit reachable from $tip through its parents.
+sub read_history {
+    my $tip = shift;
+
+    my %blobs = ();
+    my %commits = ();   # commit ids already queued
+    my %trees = ();     # tree ids already read, shared across all commits
+
+    print "Reading history for $tip...\n" if $Verbose;
+    my $tip_commit = read_commit($tip);
+    $commits{$tip_commit->{id}}++;
+    my @queue = ($tip_commit);
+    while (my $commit = shift @queue) {
+        # Trees already read for an earlier commit contributed their blobs
+        # then, so skipping them does not change the resulting set.
+        $blobs{$_}++ for read_subtree($commit->{tree}, \%trees);
+        foreach my $parent (@{$commit->{parents}}) {
+            # Mark on enqueue: a commit reachable through several children
+            # is read and processed only once.
+            next if $commits{$parent}++;
+            push @queue, read_commit($parent);
+        }
+    }
+
+    return sort keys %blobs;
+}
+
+###
+### Collection and list diff
+###
+
+my @A_Blobs = read_history($ARGV[0]);
+my @B_Blobs = read_history($ARGV[1]);
+
+print "Analyzing..." if $Verbose;
+
+my %A_Blobs = map { $_ => 1 } @A_Blobs;
+my %B_Blobs = map { $_ => 1 } @B_Blobs;
+
+my @Only_A = grep !$B_Blobs{$_}, @A_Blobs;
+my @Only_B = grep !$A_Blobs{$_}, @B_Blobs;
+
+print "done\n\n" if $Verbose;
+
+if (@Only_A) {
+    print "Blobs found only in $ARGV[0]:\n";
+    print " $_\n" for @Only_A;
+}
+if (@Only_B) {
+    print "Blobs found only in $ARGV[1]:\n";
+    print " $_\n" for @Only_B;
+}
+
+if (!@Only_A && !@Only_B) {
+    my $count = scalar @A_Blobs;
+    print "Congratulations! All $count blobs are in both histories.\n";
+}
+
+__END__
diff --git a/repo-import-loose-files.pl b/repo-import-loose-files.pl
new file mode 100755
index 0000000..3ba38ce
--- /dev/null
+++ b/repo-import-loose-files.pl
@@ -0,0 +1,282 @@
+#!/usr/bin/perl
+
+# Copyright (C) 2020 Jacob Bachmeyer
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+repo-import-loose-files - Import loose files into revision control
+
+=head1 SYNOPSIS
+
+  repo-import-loose-files.pl [options] <directory>...
+
+  Options:
+    -h --help            print brief help message
+    -v --verbose         print additional information while running
+    -n --dry-run         print commands but do not execute
+    --author=AUTHOR      specify author for generated commits
+    --repo-base=BASE     repository base directory (if not tarball root)
+    --tar-listing=FILE   load timestamps from a file listing
+
+=head1 OPTIONS
+
+=over
+
+=item B<--help> B<-h>
+
+Print brief help message and exit.
+
+=item B<--verbose> B<-v>
+
+Print additional information while running.  Repeating this option further
+increases verbosity.
+
+=item B<--dry-run> B<-n>
+
+Print the commands that would be executed but do not actually execute them.
+
+=item B<--author=>I<AUTHOR>
+
+Specify notional author of the imported commits.
+
+=item B<--repo-base=>I<BASE>
+
+Relative file name from current directory to repository root directory.  If
+not given, the repository is assumed to be in the current directory, and
+the root of the tarball is assumed to be the root of the repository.
+
+=item B<--tar-listing=>I<FILE>
+
+Read the provided file (as produced by the -vt option to GNU tar) instead
+of searching the directory tree.  The tarball should have been unpacked in
+the directory given as the root of the file tree.
+
+=back
+
+=head1 DESCRIPTION
+
+This tool imports a collection of loose files into a Git repository,
+assuming that any subsets of the files whose names share a common prefix
+are individual files that have been tracked over time by accumulating
+editor backup files instead of using revision control software.
+
+Any suffix is acceptable, provided that the suffixed files are
+distinguishable from the primary files.  The actual import ordering is
+determined by the files' timestamps.
+
+=head1 CAVEATS
+
+This program assumes that the "logical" files are the shortest prefixes
+that are both shared across multiple files and present as files under their
+own names.
+
+=cut
+
+use strict;
+use warnings;
+
+use File::Find;
+use File::Spec;
+use Date::Parse;
+use Date::Format;
+
+use Getopt::Long;
+use Pod::Usage;
+
+my %OPT = ();
+my $DryRun = 0;
+my $Verbose = 0;
+
+GetOptions('help|h' => \$OPT{help},
+           'dry-run|n' => \$DryRun,
+           'verbose|v+' => \$Verbose,
+           'author=s' => \$OPT{author},
+           'repo-base=s' => \$OPT{repobase},
+           'tar-listing=s@' => \$OPT{listing},
+    ) or pod2usage(2);
+pod2usage(1) if $OPT{help};
+
+unless (@ARGV) {
+    warn "No directory given.\n";
+    pod2usage(2);
+}
+
+# Each file record used in this program is a Perl hashref containing:
+#  name       relative name of file (as from tarball)
+#  logical    relative name of file (base file to be versioned)
+#  size       size of file in bytes
+#  timestamp  timestamp of file
+#  link_to    if set, file is symlink to this destination
+#  source     actual name of source file in filesystem
+
+# Run a command (given as a list: program, then arguments) unless in
+# dry-run mode.  In dry-run mode the command is only printed; otherwise it
+# is printed when verbose and run in a child process, after changing to
+# the --repo-base directory if one was given.  The parent waits for the
+# child and warns if it did not exit successfully.  Dies if fork fails.
+sub cmd {
+    # Quote arguments containing whitespace, for display only.
+    my @show = map { m/[[:space:]]/ ? qq["$_"] : $_ } @_;
+    my $shown = join(' ', @show);
+
+    if ($DryRun) {
+        print $shown,"\n";
+        return;
+    }
+
+    my $pid = fork;
+    # fork returns undef on failure; without this check the parent would
+    # take the child branch below and replace itself with the command.
+    die "fork for $shown: $!" unless defined $pid;
+
+    if ($pid) {                 # in parent
+        print $shown,"\n" if $Verbose;
+        waitpid $pid, 0;
+        # Only warn: an unsuccessful command (for example a commit with
+        # nothing to commit) did not stop the import before either.
+        warn "command failed (wait status $?): $shown\n" if $? != 0;
+        return;
+    }
+
+    # in child -- must never return into the caller, or a second copy of
+    # the importer would carry on with the remaining files
+    if ($OPT{repobase} && !chdir($OPT{repobase})) {
+        warn "chdir $OPT{repobase}: $!\n";
+        exit 127;
+    }
+    exec {$_[0]} @_ or do {
+        warn "exec $_[0]: $!\n";
+        exit 127;
+    };
+}
+
+###
+### Revision import routines
+###
+
+# Copy one file record's source over its logical file and commit the
+# result, using the record's timestamp as the commit date and, if given,
+# --author as the commit author.  All commands are run through cmd above.
+sub store_revision_in_git {
+    my $file = shift;
+
+    my $date = time2str('%Y-%m-%dT%H:%M:%S', $file->{timestamp});
+    my @commit_opts = ('--date='.$date);
+
+    push @commit_opts, '--author='.$OPT{author} if $OPT{author};
+
+    my $source = $file->{source};
+    my $target = $file->{logical};
+
+    # cmd changes to the --repo-base directory before running a command,
+    # so both names are rewritten relative to that directory.
+    if ($OPT{repobase}) {
+        $source = File::Spec->abs2rel($source, $OPT{repobase});
+        $target = File::Spec->abs2rel($target, $OPT{repobase});
+    }
+
+    my $message = 'Import ';
+    $message .= $file->{link_to} ? 'symlink' : 'version';
+    $message .= ' as of '.time2str('%Y-%m-%d', $file->{timestamp});
+    $message .= ' for '.$target;
+    push @commit_opts, '-m', $message;
+
+    cmd qw/cp -fp/, ($Verbose ? '-v' : ()), $source, $target;
+    cmd qw/git add --/, $target;
+    cmd qw/git commit/, @commit_opts, '--', $target;
+}
+
+###
+### File collection and processing
+###
+
+my @LooseFiles = ();
+my %LooseFiles = ();    # same objects as @LooseFiles, indexed by relative name
+my $Have_Symlinks = 0;
+
+# Collect the files present in the import areas.
+print "Collecting files...\n" if $Verbose;
+foreach my $dir (@ARGV) {
+    find({no_chdir => 1,
+          wanted => sub {
+              # With no_chdir set, $_ is the full name of the item found.
+              print "Found $_\n" if $Verbose > 2;
+              # -l performs an lstat; the "_" tests and stat(_) calls
+              # below reuse its result, so symlinks are not followed.
+              $Have_Symlinks = 1 if -l;
+              my $keyname = File::Spec->abs2rel($_, $dir);
+              unless (-d _) {
+                  my $item = {name => $keyname,
+                              timestamp => (stat(_))[9],
+                              size => (stat(_))[7],
+                              -l _ ? (link_to => readlink) : (),
+                              source => $_};
+                  push @LooseFiles, $item;
+                  die "duplicate relative name $keyname"
+                      if exists $LooseFiles{$keyname};
+                  $LooseFiles{$keyname} = $item;
+              }
+          },
+         }, $dir);
+}
+
+die "Symlink timestamps are not reliably restored on the filesystem;\n"
+    ." use the --tar-listing option to fix them.\n"
+    if $Have_Symlinks && !$OPT{listing};
+
+# Correct timestamps by reading a tarball listing if given.
+# Each listing line is expected to look like the verbose output of tar:
+#   <type and permissions> <owner/group> <size> <YYYY-mm-dd HH:MM:SS> <name>
+# Every file and symlink named in a listing must already have a record in
+# %LooseFiles; its timestamp is replaced by the one from the listing.
+foreach my $listing (@{$OPT{listing}}) {
+    print "Fixing timestamps from $listing...\n" if $Verbose;
+    my $ts_re = qr/\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}/;
+    open my $listing_fh, '<', $listing
+        or die "read file listing: open $listing: $!";
+
+    while (my $line = <$listing_fh>) {
+        chomp $line;
+        die "read file listing $listing: failed parsing line:\n $line" unless
+            $line =~ m<(.).{9}\s+[/[:alnum:]]+\s+(\d+)\s+($ts_re)\s+(.*)$>;
+        # $1 -> $type -- type: - => file, d => directory, l => symlink
+        # $2 -> $size -- size in bytes
+        # $3 -> $ts   -- timestamp in YYYY-mm-dd HH:MM:SS
+        # $4 -> $name -- relative name
+        my ($type, $size, $ts, $name) = ($1, $2, $3, $4);
+
+        if ($type eq '-') {     # process file
+            # Look the record up without autovivifying an empty one for a
+            # name that was never collected.
+            my $file = $LooseFiles{$name}
+                or die "file $name from listing $listing was not collected";
+            die "file size mismatch between file and listing for $name"
+                unless $file->{size} == $size;
+            my $timestamp = str2time($ts);
+            unless ($file->{timestamp} == $timestamp) {
+                # Remember the old value so the message shows old -> new.
+                my $previous = $file->{timestamp};
+                $file->{timestamp} = $timestamp;
+                print 'Fixed file timestamp ',$previous,
+                    ' -> ',$timestamp,' on ',$name,"\n";
+            }
+        } elsif ($type eq 'l') { # process symlink
+            # The name field of a symlink entry reads "<name> -> <target>".
+            my ($lname, $target) = ($name =~ m/^(.*)\s+->\s+(.*)$/)
+                or die "read file listing $listing: no link target in:\n $line";
+            # tar reports symlink sizes as 0, but check target
+            my $link = $LooseFiles{$lname};
+            die "link target mismatch for $lname"
+                unless defined $link && defined $link->{link_to}
+                    && $link->{link_to} eq $target;
+            my $timestamp = str2time($ts);
+            unless ($link->{timestamp} == $timestamp) {
+                my $previous = $link->{timestamp};
+                $link->{timestamp} = $timestamp;
+                print 'Fixed symlink timestamp ',$previous,
+                    ' -> ',$timestamp,"\n on $name\n";
+            }
+        } elsif ($type eq 'd') { # skip directory
+            next;
+        } else { die "item type $type unknown in $line" }
+    }
+
+    close $listing_fh or die "read file listing: close $listing: $!";
+}
+
+# Sort the file list by name; this places each "primary" file ahead of its
+# old versions.
+@LooseFiles = sort { $a->{name} cmp $b->{name} } @LooseFiles;
+
+# Assign logical names to the collected files.
+{
+    # Group the files by their logical names.  @LooseFiles is sorted by
+    # name at this point, so every file whose name starts with a given
+    # name directly follows the file bearing exactly that name.  The first
+    # file of each such run supplies the logical name for the whole run.
+    print "Collecting files into groups...\n" if $Verbose;
+    my $i = 0;
+    while ($i < @LooseFiles) {
+        my $lname = $LooseFiles[$i]->{name};
+        print "Collecting revisions for $lname\n" if $Verbose > 1;
+        # \Q...\E: the file name is matched literally; characters such as
+        # "." or "+" in a name must not act as regex metacharacters.
+        my $lnre = qr/^\Q$lname\E/;
+        # The first file always matches its own name, so $i advances by at
+        # least one on every pass of the outer loop.
+        while ($i < @LooseFiles && $LooseFiles[$i]->{name} =~ $lnre) {
+            print " - $LooseFiles[$i]->{name}\n" if $Verbose > 1;
+            $LooseFiles[$i]->{logical} = $lname;
+            $i++;
+        }
+    }
+}
+
+# Sort the file list by timestamp; this places the records into order for
+# the import process.
+@LooseFiles = sort { $a->{timestamp} <=> $b->{timestamp} } @LooseFiles;
+
+store_revision_in_git $_ for @LooseFiles;
+
+__END__
-- 
2.25.1