--- /dev/null
+#!/usr/bin/perl
+
+# Copyright (C) 2020 Jacob Bachmeyer
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+=head1 NAME
+
+git-check-import-branch.pl - Verify that a reimport loaded the same blobs
+
+=head1 SYNOPSIS
+
+ git-check-import-branch.pl <old import tip> <new import tip>
+
+ Options:
+ -h --help print brief help message
+ -v --verbose print additional information while running
+
+=head1 OPTIONS
+
+=over
+
+=item B<--help> B<-h>
+
+Print brief help message and exit.
+
+=item B<--verbose> B<-v>
+
+Print additional information while running. Repeating this option further
+increases verbosity.
+
+=back
+
+=head1 DESCRIPTION
+
+This tool searches the histories of two branches in a Git repository and
+reports any differences in the set of blobs referenced between them.
+
+=cut
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+use Pod::Usage;
+
+my %OPT = ();
+my $Verbose = 0;
+
+GetOptions('help|h' => \$OPT{help},
+ 'verbose|v+' => \$Verbose,
+ ) or pod2usage(2);
+pod2usage(1) if $OPT{help};
+
+pod2usage(2) unless @ARGV == 2;
+
+###
+### Git reader routines
+###
+
+# read a commit; return hashref with keys:
+# id Git commit id
+# tree tree object id for this commit
+# parents array of parents of this commit
+sub read_commit {
+ my $commit_id = shift;
+
+ my $commit = { id => qx/git rev-parse $commit_id/ };
+ chomp $commit->{id};
+ die "failed to find commit id $commit_id" unless $? == 0;
+
+ open my $git_cat, '-|', qw/git cat-file commit/, $commit_id
+ or die "git cat-file commit $commit_id: $!";
+ print " read commit $commit->{id}\n" if $Verbose > 1;
+ while (<$git_cat>) {
+ chomp;
+ last if m/^$/;
+ if (m/^parent\s+([[:xdigit:]]{40})$/) {
+ push @{$commit->{parents}}, $1;
+ } elsif (m/^tree\s+([[:xdigit:]]{40})$/) {
+ die "multiple trees in commit $commit_id" if $commit->{tree};
+ $commit->{tree} = $1;
+ }
+ }
+ close $git_cat;
+
+ return $commit;
+}
+
+# read a tree; return arrayref of blob hex SHA1 values and arrayref of
+# subtree ids as hex SHA1 values
+sub read_tree {
+ my $tree_id = shift;
+
+ my @blobs = ();
+ my @trees = ();
+
+ open my $git_cat, '-|', qw/git cat-file -p/, $tree_id
+ or die "git cat-file -p $tree_id: $!";
+ print " read tree $tree_id\n" if $Verbose > 1;
+ while (<$git_cat>) {
+ chomp;
+ m/^([0-7]{2})([0-7]{4})\s+(blob|tree)\s+([[:xdigit:]]{40})\s+(.*)$/
+ or die "failed to parse tree listing line: $_";
+ my $ftype = $1; my $mode = $2; my $obtype = $3; my $obid = $4;
+ my $name = $5;
+ print " $_\n" if $Verbose > 2;
+ if ($obtype eq 'blob') {
+ push @blobs, $obid;
+ } elsif ($obtype eq 'tree') {
+ push @trees, $obid;
+ } else { die "unknown Git object type in tree: $obtype" }
+ }
+ close $git_cat;
+
+ return \@blobs, \@trees;
+}
+
+# read a tree recursively; collecting all blob hex SHA1 values
+sub read_subtree {
+ my $top_id = shift;
+
+ my %blobs = ();
+ my %trees = ();
+
+ my @queue = ($top_id);
+ my $blobs; my $trees;
+ while (@queue) {
+ $trees{$queue[0]}++;
+ ($blobs, $trees) = read_tree shift @queue;
+ $blobs{$_}++ for @$blobs;
+ push @queue, grep !$trees{$_}, @$trees;
+ }
+
+ return keys %blobs;
+}
+
+# read history; return sorted list of blob hex SHA1 values
+sub read_history {
+ my $tip = shift;
+
+ my %blobs = ();
+ my %commits = ();
+
+ print "Reading history for $tip...\n" if $Verbose;
+ my @queue = read_commit $tip;
+ while (my $commit = shift @queue) {
+ $commits{$commit->{id}}++;
+ $blobs{$_}++ for read_subtree $commit->{tree};
+ push @queue, map {read_commit $_} grep !$commits{$_}, @{$commit->{parents}};
+ }
+
+ return sort keys %blobs;
+}
+
+###
+### Collection and list diff
+###
+
+my @A_Blobs = read_history $ARGV[0];
+my @B_Blobs = read_history $ARGV[1];
+
+print "Analyzing..." if $Verbose;
+
+my %A_Blobs = map { $_ => 1 } @A_Blobs;
+my %B_Blobs = map { $_ => 1 } @B_Blobs;
+
+my @Only_A = grep !$B_Blobs{$_}, @A_Blobs;
+my @Only_B = grep !$A_Blobs{$_}, @B_Blobs;
+
+print "done\n\n" if $Verbose;
+
+if (@Only_A) {
+ print "Blobs found only in $ARGV[0]:\n";
+ print " $_\n" for @Only_A;
+}
+if (@Only_B) {
+ print "Blobs found only in $ARGV[1]:\n";
+ print " $_\n" for @Only_B;
+}
+
+if (!@Only_A && !@Only_B) {
+ my $count = scalar @A_Blobs;
+ print "Congratulations! All $count blobs are in both histories.\n";
+}
+
+__END__