#!/usr/bin/perl

# IBID - Incremental Backups to Infinite Disk
# Copyright 2006-2010 Bob Glickstein <bobg@emphatic.com>
# Distributed under the terms of the GNU General Public License, version 2

package main;

use strict;
use warnings;

use Data::Dumper;
use File::Basename;
use File::Copy;
use File::Path;
use File::stat;
use Getopt::Long qw(:config bundling);
use IO::Dir;
use IO::File;
use Pod::Usage;
use POSIX ();

# Extract the numeric minor revision from the RCS keyword,
# e.g. '$Revision: 1.52 $' -> '52'.
my $VERSION = '$Revision: 1.52 $';
$VERSION =~ s/^\D*\d+\D+(\d+).*/$1/;

# Command-line option state; see the GetOptions call below for flag names.
our $verbose = 0;               # -v; repeatable for extra detail
my $ibid_dir = "$ENV{HOME}/.ibid"; # -d; where fileset config and session records live
my $not_really;                 # -n; dry run: report, but do not copy or store
my $preserve_mode = 1;          # -m; chmod archive copies to match the source
my $preserve_owner = 1;         # -o; chown archive copies to match the source
my $restore_atime = 1;          # -a; put back the source atime after copying
my $preserve_time;              # -t; set atime/mtime on the archive copies
my $recheck_mtime = 1;          # re-lstat after restoring atime; die if mtime moved
my $ensure;                     # -e; verify earlier archive copies still exist with the right size
my $session_size_limit;         # -S; end the session after this many bytes copied
my $do_version;                 # -V; print version and exit
my $session_files_limit;        # -F; end the session after this many files copied
my $prune_sessions;             # keep only this many newest local session records
my $single_file_size_limit;     # -s; skip any single file larger than this
my $trim_report;                # list archive copies eligible for deletion, post-run
my $check_names;                # fall back to by-name lookup when inode lookup fails

my $dump;                       # -D; dump an existing session record and exit

# Parse command-line options.  The return value of GetOptions was
# previously ignored, so a mistyped option produced only a warning and
# the run continued with defaults; now bail out with usage instead.
GetOptions('version|V!'                 => \$do_version,
           'verbose|v+'                 => \$verbose,
           'dump|D!'                    => \$dump,
           'no|n!'                      => \$not_really,
           'dir|d=s'                    => \$ibid_dir,
           'check-names!'               => \$check_names,
           'preserve-mode|m!'           => \$preserve_mode,
           'preserve-owner|o!'          => \$preserve_owner,
           'restore-atime|a!'           => \$restore_atime,
           'preserve-time|t!'           => \$preserve_time,
           'recheck-mtime!'             => \$recheck_mtime,
           'ensure|e!'                  => \$ensure,
           'session-size-limit|S=s'     => \$session_size_limit,
           'session-files-limit|F=i'    => \$session_files_limit,
           'single-file-size-limit|s=s' => \$single_file_size_limit,
           'prune-sessions=i'           => \$prune_sessions,
           'trim-report!'               => \$trim_report)
    or pod2usage(2);

if ($do_version) {
  print "Ibid version $VERSION\nCopyright (c) 2006-2010 Bob Glickstein\n";
  exit(0);
}

# First positional argument: the fileset name (must be non-blank).
my $fileset = shift;
pod2usage(1)
    unless (defined($fileset) && ($fileset =~ /\S/));

# Per-fileset state lives under $ibid_dir/<fileset>; session records go
# in its "sessions" subdirectory, created here if absent.
my $fileset_dir = "$ibid_dir/$fileset";
my $sessions_dir = "$fileset_dir/sessions";
mkpath($sessions_dir);

# Maximum path length for the filesystem holding the sessions dir.
# NOTE(review): $path_max is a package global and is not referenced
# again in this portion of the file; presumably used further down.
our $path_max = POSIX::pathconf($sessions_dir, POSIX::_PC_PATH_MAX());

# Scan the sessions directory for the highest-numbered session record of
# this fileset, remembering its path and compression suffix (if any).
my($session_file, $prev_session, $compression);
do {
  my $dh = new IO::Dir($sessions_dir)
      or die "Cannot opendir \"$sessions_dir\" ($!)\n";
  while (defined(my $entry = $dh->read())) {
    # Session records are named <fileset>.<number>[.gz|.bz2|.xz]
    if ($entry =~ /^\Q$fileset\E\.(\d+)(\.(gz|bz2|xz))?$/) {
      my $num = int($1);        # force it to be a number
      my $type = $3;
      if (!defined($prev_session) || ($num > $prev_session)) {
        $prev_session = $num;
        $session_file = "$sessions_dir/$entry";
        $compression = $type;
      }
    }
  }
  $dh->close();
};

# Load the most recent session record (if any).  $record is the Record
# object; $session is the number that the session being created now
# will receive.
my($record, $session);
if (defined($session_file)) {
  my $fh;
  if ($compression) {
    # Decompress through a filter program.  Use the list form of open
    # ('-|' with separate arguments) so the session-file name is never
    # parsed by a shell -- the old "$cmd |" form was vulnerable to
    # metacharacter injection through odd file names.
    my @cmd;
    if ($compression eq 'gz') {
      @cmd = ('gzip', '-dc', $session_file);
    } elsif ($compression eq 'bz2') {
      @cmd = ('bzip2', '-dc', $session_file);
    } elsif ($compression eq 'xz') {
      @cmd = ('xz', '-dc', $session_file);
    } else {
      # Unreachable given the suffix regex above, but kept as a guard.
      die "Unknown compression type \"$compression\" for \"$session_file\"\n";
    }
    open($fh, '-|', @cmd)
        or die "Cannot read from \"@cmd\" ($!)\n";
  } else {
    $fh = IO::File->new("<$session_file")
        or die "Cannot open \"$session_file\" ($!)\n";
  }

  $record = Record->load($fh)
      or die "Cannot retrieve session from $session_file ($!)\n";

  $fh->close();

  if ($dump) {
    # --dump: print the record's contents (optionally filtered by the
    # regexes remaining in @ARGV) and exit without starting a session.
    my @regexes = @ARGV;

    printf("Fileset: %s\n", $record->fileset());
    printf("Session: %d\n", $record->session());
    if ($record->version() >= 2) {
      printf("Started: %s\n", scalar(localtime($record->session_start_time())));
      printf("Finished: %s\n", scalar(localtime($record->session_end_time())));
    }
    print "\n";
    $record->foreach_name(sub {
      my $name = shift;

      # When regexes were given, show only names matching at least one.
      if (@regexes) {
        my $found;
        foreach my $regex (@regexes) {
          if ($name =~ /$regex/i) {
            $found = 1;
            last;
          }
        }
        return unless $found;
      }
      print "$name\n";

      # One callback per history-entry type; each prints one indented
      # line beneath the file name.
      my $file_cb = sub {
        my($name_index, $session, $ensured, $dev, $ino, $size, $mtime) = @_;

        printf("  %d: %s", $session, scalar(localtime($mtime)));
        printf(" (%d)", $size) if defined($size);
        if ($verbose && $dev && $ino) {
          printf(" <%d:%d>", $dev, $ino);
        }
        print " E" if $ensured;
        print "\n";
      };

      my $empty_cb = sub {
        my($name_index, $session, $dev, $ino, $mtime) = @_;

        printf("  %d: Z\n", $session);
      };

      my $update_cb = sub {
        my($name_index, $session, $dev, $ino, $hist_index) = @_;

        if ($verbose) {
          printf("  %d: U <%d:%d>\n", $session, $dev, $ino);
        }
      };

      my $symlink_cb = sub {
        my($name_index, $session, $target) = @_;
        printf("  %d: S-> %s\n", $session, $target);
      };

      my $hardlink_cb = sub {
        my($name_index, $session,
           $hname_index, $hsession,
           $hensured, $hdev, $hino, $hsize, $hmtime) = @_;
        printf("  %d: H-> %s\n",
               $session, $record->index_to_name($hname_index));
        printf("    [%d: %s", $hsession, scalar(localtime($hmtime)));
        printf(" (%d)", $hsize) if defined($hsize);
        print " E" if $hensured;
        print "]\n";
      };

      $record->foreach_name_history($name, $file_cb, $hardlink_cb, $symlink_cb, $empty_cb, $update_cb);
    });

    exit(0);
  }

  $session = $prev_session + 1;

  warn "Retrieved session $prev_session of $fileset\n";
} elsif ($dump) {
  die "No sessions of fileset \"$fileset\" to dump\n";
} else {
  # No prior sessions: start a brand-new record at session 1.
  $session = 1;
  $record = Record->new($fileset);
  warn "New fileset\n";
}

# Second positional argument: the root of the archive tree.
my $archive_root = shift;
pod2usage(1)
    unless (defined($archive_root) && ($archive_root =~ /\S/));

# Size limits may carry k/m/g suffixes; normalize them to byte counts.
$session_size_limit = interpret_size($session_size_limit)
    if defined($session_size_limit);

$single_file_size_limit = interpret_size($single_file_size_limit)
    if defined($single_file_size_limit);

# The INCLUDE file (required) lists the top-level paths to back up,
# one per line.
my @inclusions;
{
  my $include_path = "$fileset_dir/INCLUDE";
  my $include_fh = new IO::File("<$include_path")
      or die "Cannot open \"$include_path\" ($!)\n";
  while (defined(my $line = $include_fh->getline())) {
    chomp($line);
    push(@inclusions, $line);
  }
  $include_fh->close();
}

# The EXCLUDE file (optional) lists regexes for paths to skip.
my @exclusions;
{
  my $exclude_path = "$fileset_dir/EXCLUDE";
  if (-f $exclude_path) {
    my $exclude_fh = new IO::File("<$exclude_path")
        or die "Cannot open \"$exclude_path\" ($!)\n";
    while (defined(my $line = $exclude_fh->getline())) {
      chomp($line);
      push(@exclusions, $line);
    }
    $exclude_fh->close();
  }
}

# The optional .devmap file lists groups of device numbers to treat as
# equivalent; each number on a line maps to all the other numbers on
# the same line.
my %devmap;
my $devmap_file = "$ibid_dir/.devmap";
if (-f $devmap_file) {
  my $devmap_fh = new IO::File("<$devmap_file")
      or die "Cannot open \"$devmap_file\" ($!)\n";
  while (defined(my $line = $devmap_fh->getline())) {
    my @devs = ($line =~ /(\d+)/g);
    foreach my $i (0 .. $#devs) {
      $devmap{$devs[$i]} = [ @devs[ grep { $_ != $i } 0 .. $#devs ] ];
    }
  }
  $devmap_fh->close();
}

# Directory in the archive where this session's copies will land.
my $session_root = session_root($session);

# Running totals for this session.  NOTE(review): $changes is a package
# global ("our") and is only initialized here; presumably it is
# incremented elsewhere (e.g. by Record methods) -- not in this part of
# the file.
our $changes = 0;
my $bytes = 0;
my $files = 0;

warn "Beginning session $session\n";

my $start_time = time;

# Paths already visited this run; process() skips repeats.
my %processed;

INCLUSION:
foreach my $top (@inclusions) {
  process($top);
  if (defined($session_size_limit) && ($bytes >= $session_size_limit)) {
    warn "Reached session size limit\n";
    last INCLUSION;
  }
  if (defined($session_files_limit) && ($files >= $session_files_limit)) {
    warn "Reached session files limit\n";
    last INCLUSION;
  }
}

if ($changes) {
  # Something changed this session: finalize the record, write it to
  # the local sessions directory, optionally compress it, copy it into
  # the archive, and optionally prune old local session records.
  $record->set_fileset($fileset);
  $record->set_session_times($session, $start_time, time);
  my $new_session_file = "$sessions_dir/$fileset.$session";

  warn "Storing session to \"$new_session_file\"\n"
      if $verbose;

  unless ($not_really) {
    $record->store($new_session_file)
        or die "Could not store session to \"$new_session_file\" ($!)\n";

    # Compress the new record with the same program used for the
    # previous session's record (if that one was compressed).
    if ($compression) {
      my @cmd;
      if ($compression eq 'gz') {
        @cmd = ('gzip');
      } elsif ($compression eq 'bz2') {
        @cmd = ('bzip2');
      } elsif ($compression eq 'xz') {
        @cmd = ('xz');
      }
      if (@cmd) {
        if ($verbose) {
          push(@cmd, '-v');
        }
        # system() returns 0 on success; on failure keep the
        # uncompressed file under its original name.
        if (system(@cmd, $new_session_file)) {
          warn "Could not compress \"$new_session_file\" with \"@cmd\" ($!)\n";
        } else {
          $new_session_file .= ".$compression";
        }
      }
    }

    # Keep a copy of the session record in the archive itself.
    mkpath($session_root);
    if (copy($new_session_file, $session_root)) {
      # --prune-sessions=N: keep only the N newest local records.
      if (defined($prune_sessions) && ($prune_sessions >= 1)) {
        if (defined(my $dh = new IO::Dir($sessions_dir))) {
          my @session_files;
          while (defined(my $entry = $dh->read())) {
            if ($entry =~ /^\Q$fileset\E\.(\d+)(\.(gz|bz2|xz))?$/) {
              push(@session_files, [$1, $entry]);
            }
          }
          $dh->close();
          if (@session_files > $prune_sessions) {
            # Sort by session number, drop the newest N from the list,
            # and delete what remains.
            @session_files = sort { $a->[0] <=> $b->[0] } @session_files;
            splice(@session_files, -$prune_sessions);
            @session_files = map { $_->[1] } @session_files;
            warn sprintf("Pruning these session files:\n  %s\n",
                         join(' ', @session_files));
            @session_files = map { "$sessions_dir/$_" } @session_files;
            unlink @session_files;
          }
        } else {
          warn "Could not opendir \"$sessions_dir\" for prune-sessions ($!)\n";
        }
      }
    } else {
      warn "Could not copy session file \"$new_session_file\" to \"$session_root\" ($!)\n";
    }

  }
  warn "Finished, copied $bytes byte(s)\n";
} else {
  warn "No changes\n";
}

if ($trim_report) {
  # --trim-report: list archived copies that appear redundant, either
  # because the name now matches an EXCLUDE rule, or because the same
  # inode/size/mtime was copied again after a devmap-equivalent device
  # number change.
  print "### Trim report\n";

  $record->foreach_name(sub {
    my $name = shift;

    # Names matching an exclusion: every archived copy is reported.
    foreach my $exclusion (@exclusions) {
      if ($name =~ /$exclusion/) {
        my $excluded_file_cb = sub {
          my($name_index, $session, $ensured, $dev, $ino, $size, $mtime) = @_;

          print "## Excluded by rule $exclusion:\n";
          print session_file($session, $name), "\n";
        };
        $record->foreach_name_history($name, $excluded_file_cb);
        return;
      }
    }

    # Otherwise look for two copies of this name that differ only in a
    # devmap-equivalent device number: the later one duplicates the
    # earlier one.
    my @copies;

    my $devmap_file_cb = sub {
      my($name_index, $session, $ensured, $dev, $ino, $size, $mtime) = @_;

      foreach my $copy (@copies) {
        my($old_session, $old_dev, $old_ino, $old_size, $old_mtime) = @$copy;

        if (($dev != $old_dev)
            && equivalent_devs($dev, $old_dev)
            && ($ino == $old_ino)
            && ($size == $old_size)
            && ($mtime == $old_mtime)) {
          printf("## Copied as %s (dev %d, inode %d) before device number change to %d:\n",
                 session_file($old_session, $name),
                 $old_dev, $old_ino, $dev);
          print session_file($session, $name), "\n";
          return;
        }
      }
      # Not a duplicate of anything seen so far; remember it.
      push(@copies, [$session, $dev, $ino, $size, $mtime]);
    };

    $record->foreach_name_history($name, $devmap_file_cb);
  });
}

################################################################

# Visit one path and record/copy whatever changed since the last session.
#
# $full is the absolute path to examine; $device, when defined, is the
# device number of the parent directory -- entries on a different
# device are skipped, so the walk never crosses filesystem boundaries.
#
# Side effects: copies changed files under $session_root, appends
# entries to the global $record, and updates the globals $bytes,
# $files, and %processed.
sub process {
  my($full, $device) = @_;

  # Each path is handled at most once per run.
  if ($processed{$full}) {
    warn "Already visited \"$full\"\n" if ($verbose > 1);
    return;
  }

  $processed{$full} = 1;

  my $is_link = (-l $full);
  my $is_dir = (!$is_link && (-d $full));

  # Exclusion rules ending in "/" apply only to directories (matched,
  # end-anchored, against the path with a trailing slash); all other
  # rules match anywhere in the path.
  foreach my $exclusion (@exclusions) {
    if ($exclusion =~ m|/$|) {
      if ($is_dir && ("$full/" =~ /$exclusion$/)) {
        warn "X $full\n" if $verbose > 1;
        return;
      }
    } elsif ($full =~ /$exclusion/) {
      warn "x $full\n" if $verbose > 1;
      return;
    }
  }

  my $st = lstat($full);
  if (!defined($st)) {
    warn "Could not lstat \"$full\" ($!)\n";
    return;
  }

  if (defined($device) && ($st->dev() != $device)) {
    return;                     # do not cross device boundaries
  }

  if ($is_link) {
    # Symlinks: record the target, but only when new or changed.
    my $target = readlink($full);
    if (!defined($target)) {
      warn "Could not readlink \"$full\" ($!)\n";
      return;
    }
    my $prev_target = $record->lookup_symlink($full);
    if (!defined($prev_target) || ($prev_target ne $target)) {
      warn "S $full -> $target\n" if $verbose;
      $record->new_symlink($full, $target);
    }
  } elsif ($is_dir) {
    # Directories: collect the entries first so the directory handle is
    # closed before recursing.
    my $dh = new IO::Dir($full);
    if (!defined($dh)) {
      warn "Could not opendir \"$full\" ($!)\n";
      return;
    }
    my @entries;
    while (defined(my $entry = $dh->read())) {
      next if ($entry eq '.' || $entry eq '..');
      push(@entries, $entry);
    }
    $dh->close();
    foreach my $entry (@entries) {
      process("$full/$entry", $st->dev());
      # Stop descending as soon as a session limit is reached.
      return if (defined($session_size_limit) && ($bytes >= $session_size_limit));
      return if (defined($session_files_limit) && ($files >= $session_files_limit));
    }
  } elsif (-f $full) {
    my $dev = $st->dev();
    my $ino = $st->ino();
    my $mtime = $st->mtime();
    my $size = $st->size();

    # Device numbers declared equivalent in .devmap are searched too.
    my @other_devs;
    if (exists($devmap{$dev})) {
      @other_devs = @{$devmap{$dev}};
    }

    # Find the latest history entry for this inode; with --check-names,
    # fall back to a by-name lookup (catches files whose dev/ino
    # changed while name and content did not).
    my($hist_index, $hist_entry) = $record->latest_for_inode($dev, $ino, @other_devs);
    my $was_name_checked;
    if (!$hist_entry && $check_names) {
      ($hist_index, $hist_entry) = $record->latest_for_name($full, 1);
      $was_name_checked = defined($hist_entry);
    }

    # "last" info (last time the file was recorded, NOT as an UPDATE)
    my($ltype, $lname_index, $lsession,
       $ensured, $ldev, $lino, $lsize, $lmtime);

    # "base" info (after following UPDATEs back to a FILE or an EMPTY)
    my($bsession, $bmtime, $bsize);

    # which history entry to "ensure"
    my $ehist_index;

    if ($hist_entry) {
      # Unpack the latest entry.  UPDATE and HARDLINK entries are
      # chased back through their referenced history indexes until a
      # FILE or EMPTY entry supplies the base size/mtime.
      my @lrest;
      ($ltype, $lname_index, $lsession, @lrest) = @$hist_entry;
      $bsession = $lsession;
      if ($ltype == Record::FILE()) {
        ($ensured, $ldev, $lino, $lsize, $lmtime) = @lrest;
        $ehist_index = $hist_index;
      } elsif ($ltype == Record::EMPTY()) {
        ($ldev, $lino, $lmtime) = @lrest;
        # Empty files need no archive copy, so they count as ensured.
        ($ensured, $lsize) = (1, 0);
      } elsif (($ltype == Record::UPDATE())
               || ($ltype == Record::HARDLINK())) {
        my $base_hist_index;
        if ($ltype == Record::UPDATE()) {
          ($ldev, $lino, $base_hist_index) = @lrest;
        } else {
          ($base_hist_index) = @lrest;
        }
        my $base_hist_entry = $record->history_entry_at_index($base_hist_index);
        while (1) {
          my($btype, $bname_index, @brest);
          ($btype, $bname_index, $bsession, @brest) = @$base_hist_entry;
          if ($btype == Record::UPDATE()) {
            my($bdev, $bino, $bhist_index) = @brest;
            $base_hist_entry = $record->history_entry_at_index($bhist_index);
            next;
          }
          if ($btype == Record::HARDLINK()) {
            my($bhist_index) = @brest;
            $base_hist_entry = $record->history_entry_at_index($bhist_index);
            next;
          }
          if ($btype == Record::EMPTY()) {
            ($ensured, $lsize) = (1, 0);
          } elsif ($btype == Record::FILE()) {
            my($bdev, $bino);
            ($ensured, $bdev, $bino, $bsize, $bmtime) = @brest;
            $ehist_index = $base_hist_index;
          }
          last;
        }
      }

      $bmtime = $lmtime unless defined($bmtime);
      $bsize = $lsize unless defined($bsize);
    }

    # Copy when never recorded, modified since, or size mismatch.
    my $do_copy = (!defined($bmtime) || ($bmtime < $mtime));
    $do_copy ||= (defined($bsize) && ($size != $bsize));

    # --single-file-size-limit: note oversized files, do not copy them.
    my $too_big;
    if ($do_copy
        && defined($single_file_size_limit)
        && ($size > $single_file_size_limit)) {
      undef $do_copy;
      $too_big = 1;
    }

    # --ensure: verify that the previously archived copy still exists
    # with the expected size; force a fresh copy if not.
    if (!$do_copy && $ensure && $lname_index && !$ensured && $ehist_index) {
      my $lname = $record->index_to_name($lname_index);
      $lname = session_file($bsession, $lname);

      if (-f $lname) {
        if (defined($bsize)) {
          my $asize = (-s $lname);
          if ($asize != $bsize) {
            $do_copy = 1;
            warn "Ensure: \"$full\" has wrong size in session $bsession ($asize bytes, should be $bsize)\n";
          }
        }
      } else {
        $do_copy = 1;
        warn "Ensure: \"$full\" missing from session $bsession\n";
      }
      if (!$do_copy) {
        # Archived copy checked out; mark it so it isn't re-checked.
        $record->set_ensured($ehist_index);
      }
    }

    if ($do_copy && ($size > 0)) {
      # Copy the file into this session's archive tree.
      if ($verbose > 1) {
        warn sprintf("C %s <%d:%d> (%d)\n", $full, $dev, $ino, $st->size());
      } elsif ($verbose) {
        warn sprintf("C %s (%d)\n", $full, $st->size());
      }
      my $target = "$session_root$full";
      my $target_dir = dirname($target);
      my $result;
      my $atime = $st->atime();
      if ($not_really) {
        $result = 1;
      } else {
        mkpath($target_dir);
        $result = copy($full, $target);
        if ($restore_atime) {
          # Put back the atime the copy disturbed; then (unless
          # disabled) re-lstat and die if the mtime moved -- presumably
          # to catch a writer racing the copy.  TODO confirm intent.
          if (utime($atime, $mtime, $full)) {
            if ($recheck_mtime) {
              if (defined(my $new_st = lstat($full))) {
                if ($new_st->mtime() != $mtime) {
                  die sprintf("Old mtime %d of \"%s\" does not match new mtime %d (%+d)\n",
                              $mtime, $full, $new_st->mtime(),
                              $new_st->mtime() - $mtime);
                }
              } else {
                warn "Could not recheck mtime of \"$full\" ($!)\n";
              }
            }
          } else {
            warn "Could not restore atime of \"$full\" to $atime ($!)\n";
          }
        }
      }
      if ($result) {
        unless ($not_really) {
          # Carry source metadata over to the archive copy as requested.
          if ($preserve_mode) {
            chmod($st->mode(), $target)
                or warn sprintf("Could not chmod \"%s\" to 0%o (%s)\n",
                                $target, $st->mode(), "$!");
          }
          if ($preserve_owner) {
            chown($st->uid(), $st->gid(), $target)
                or warn sprintf("Could not chown \"%s\" to %d/%d (%s)\n",
                                $target, $st->uid(), $st->gid(), "$!");
          }
          if ($preserve_time) {
            utime($atime, $mtime, $target)
                or warn sprintf("Could not reset atime/mtime of \"%s\" to %d/%d (%s)\n",
                                $target, $atime, $mtime, "$!");
          }
        }
        $record->new_file($full, $st);
        $bytes += $st->size();
        ++$files;
      } else {
        warn "Could not copy \"$full\" to \"$target\" ($!)\n";
      }
    } elsif (!$do_copy && $was_name_checked) {
      # Found by name with new dev/ino but unchanged content: record an
      # UPDATE placeholder instead of copying.
      if ($verbose > 1) {
        warn sprintf("U %s <%d:%d> -> <%d:%d>\n",
                     $full, $ldev, $lino, $dev, $ino);
      } else {
        warn sprintf("U %s\n", $full);
      }
      $record->new_update($full, $st, $hist_index);
    } elsif ($too_big) {
      warn "B $full ($size)\n" if $verbose;
    } elsif ($hist_index && $record->is_new_name_for_inode($full, $dev, $ino, @other_devs)) {
      # Known inode under a new name: record a hardlink, no copy needed.
      warn "H $full\n" if $verbose;
      $record->new_hardlink($full, $hist_index);
    } elsif ($do_copy && ($size == 0)) {
      # Empty files get a placeholder entry rather than an archive copy.
      warn "Z $full\n" if $verbose;
      $record->new_empty_file($full, $st);
    } elsif (defined($hist_entry)) {
      # Unchanged; at high verbosity show which session last recorded it.
      warn sprintf("%d %s\n", $bsession, $full) if ($verbose > 1);
    }
  }                             # else special file, ignore
}

# Return the archive directory for session number $session_num.
#
# Sessions fan out into nested "decade" directories so no single
# directory accumulates too many entries: e.g. session 12345 lives
# under <root>/10000/2000/300/12345.
#
# $root is optional and defaults to "$archive_root/$fileset", which is
# what all existing callers get; passing it explicitly makes the
# layout logic reusable and testable on its own.
sub session_root {
  my($session_num, $root) = @_;

  my $result = defined($root) ? $root : "$archive_root/$fileset";
  my $n = $session_num;
  while ($n >= 100) {
    $result .= '/';

    # the largest power of ten <= $n
    my $p = '1' . ('0' x (length($n) - 1));

    # the largest multiple of $p <= $n
    my $d = $p * int($n/$p);

    $result .= $d;

    $n %= $p;
  }
  return "$result/$session_num";
}

# Map (session number, absolute path) to the path of that file's copy
# in the archive: the session's root with the absolute path appended.
sub session_file {
  my($session_num, $fullpath) = @_;
  my $root = session_root($session_num);
  return "$root$fullpath";
}

# Turn a size string with an optional k/m/g suffix (case-insensitive,
# optional whitespace before the suffix, e.g. "10", "2k", "3 M") into a
# byte count.  Dies on anything non-numeric without a known suffix.
sub interpret_size {
  my $str = shift;

  my %scale = (g => 1024 ** 3, m => 1024 ** 2, k => 1024);
  foreach my $suffix (qw(g m k)) {
    if ($str =~ /^(\d+)(\s*$suffix)/i) {
      return $1 * $scale{$suffix};
    }
  }

  die "Non-numeric string: $str\n" if $str =~ /\D/;

  return $str;
}

# True if every device number in @other_devs is listed as an equivalent
# of $dev1 in the .devmap table (%devmap); false otherwise.
#
# Fix: when $dev1 has no %devmap entry at all, the old code dereferenced
# undef (@$dev1_equivs), which dies under this file's "use strict"
# ("Can't use an undefined value as an ARRAY reference").  Treat a
# missing entry as "no equivalents" instead.
sub equivalent_devs {
  my($dev1, @other_devs) = @_;

  my $dev1_equivs = $devmap{$dev1} || [];

  foreach my $other_dev (@other_devs) {
    return undef unless grep { $_ == $other_dev } @$dev1_equivs;
  }
  return 1;
}

################################################################

package Record;

use strict;
use warnings;
no warnings qw(numeric);

# Version 4 record format:
#
# { version => 4,
#   fileset => <name>,
#   session => <number>,
#   session_start_time => <unix timestamp>,
#   session_end_time => <unix timestamp>,
#   names => { PATHNAME => [NAME_HISTORY_ITEM, ...], ... }
#   mtimes => { DEV => { INO => [INODE_HISTORY_ITEM, ...], ... }, ... }
# }
#
# NAME_HISTORY_ITEM:
#  [DEV, INO, SESSION] (file or hardlink)
#  [TARGET, SESSION] (symlink)
#
# INODE_HISTORY_ITEM:
#  [MTIME, SESSION, SIZE, NAME, ENSURED]

# Version 5 record format:
# { version => 5,
#   fileset => <name>,
#   sessions => [undef, [START_TIME, END_TIME], ...],
#   name_table => [undef, [PATH_TAIL, PARENT_INDEX, HISTORY_INDEX, ...], ...],
#   history => [undef, HISTORY_ITEM, ...],
#   name_indexes => { PATH_ELT => [NAME_INDEX, { PATH_ELT => ... }], ... },
#   inodes => { DEV => { INO => [HISTORY_INDEX, ...] } } }
#
# name_indexes and inodes are computed at run-time and not saved
#
# HISTORY_ITEM:
#  [FILE, NAME_INDEX, SESSION, ENSURED, DEV, INO, SIZE, MTIME] (file)
#  [HARDLINK, NAME_INDEX, SESSION, HISTORY_INDEX] (hardlink)
#  [SYMLINK, NAME_INDEX, SESSION, TARGET_NAME] (symlink)

# Beginning with version 6, the record format on disk differs from
# the data structure at runtime.  (Previously, the runtime structure
# was simply serialized to disk with Storable.)
# 
# The disk file looks like this
# (the hash at the beginning is Storable output for backward compatibility,
# the rest is human-readable)
# 
# {version => 6}FILESET
# NUM_SESSIONS
# SESSION1_START SESSION1_END
# SESSION2_START SESSION2_END
# ...
# NUM_NAMES
# NAME_TAIL1
# PARENT_INDEX1 HIST_INDEX1_1 HIST_INDEX1_2 ...
# NAME_TAIL2
# PARENT_INDEX2 HIST_INDEX2_1 HIST_INDEX2_2 ...
# ...
# NUM_HISTORIES
# TYPE1 NAME_INDEX1 SESSION1 REST1
# TYPE2 NAME_INDEX2 SESSION2 REST2
# ...
# 
# The RESTs are:
#  ENSURED DEV INO SIZE MTIME for files
#  HIST_INDEX for hardlinks
#  TARGET_NAME for symlinks
# 
# The runtime format looks like this
# { version => 6,
#   fileset => <name>,
#   sessions => [undef, [START_TIME, END_TIME], ...],
#   names => 't...\0ppppVt...\0ppppV...',
#   name_offsets => '....11112222....',
#   history => 'tnnnnssssVtnnnnssssV...',
#   history_offsets => '....11112222....',
#   name_indexes => { PATH_ELT => [NAME_INDEX, { PATH_ELT => ... }], ... },
#   inodes => { "DEV-INO" => '11112222....', ... },
#   additional_name_history => { NAME_INDEX => [HIST_INDEX, ...], ... } }
# 
# "names" and "history" are strings of "pack"ed data.
# 
# Each "t" in names is a path tail, terminated by \0
# This is followed by a 32-bit parent-name index,
# then a variable number of history indexes.  Each history
# index begins with a 0x1 byte and is followed by the 32-bit index.
# The end of the list is indicated with a 0x0 byte.
# 
# The byte offsets within "names" of each name-index record (1-based) appears
# as packed 32-bit numbers in name_offsets, the first four bytes of which
# are unused.
# 
# Each "t" in history is a one-byte history-entry type
# This is followed by two 32-bit numbers: name index, and session number
# For FILEs, this is then followed by 1 "ensured" byte
# and then four 32-bit numbers: dev, ino, size, mtime
# For SYMLINKs, session number is followed by the symlink target
# (as a NUL-terminated string)
# For HARDLINKs, session number is followed by a 32-bit history index
#
# Version 7 adds the new type EMPTY (an empty file, in the session record only)
# it's followed by three 32-bit numbers: dev, ino, mtime
#
# Version 8 adds the new type UPDATE.  Like EMPTY, it's a placeholder for a
# file not copied to the archive, in this case because its device/inode have
# changed (as when restoring files into a new filesystem) but the file's name
# and contents have not.  It's followed by three 32-bit numbers, (the new) dev
# and ino, and the index of the previous history entry for the file.
#
# history_offsets is to history as name_offsets is to names
# 
# values in the "inodes" hash are packed arrays of 32-bit history indexes
# 
# additional_name_history is where new name-history accumulates at runtime
# so it doesn't have to be spliced into "names," which would require
# updating all the offsets in name_offsets
# 
# Using packed strings like this yields enormous memory savings at runtime.

# Record-format version written by this code (see format notes above).
use constant RECORD_VERSION => 8;

# History-entry type tags (the type byte of each packed history item).
use constant FILE => 1;         # regular file copied into the archive
use constant HARDLINK => 2;     # another name for an already-recorded entry
use constant SYMLINK => 3;      # symbolic link; target stored inline
use constant EMPTY => 4;        # empty file placeholder (v7+), not copied
use constant UPDATE => 5;       # same name/content, new dev/ino (v8+), not copied

use Carp qw(confess);
use IO::File;
use Storable qw(nstore_fd retrieve_fd);

# Construct a fresh, empty session record for the named fileset.
# The packed strings start empty; both offset tables begin with four
# placeholder bytes because index 0 is unused (indexes are 1-based).
sub new {
  my($type, $fileset) = @_;

  my $self = {
    version                 => RECORD_VERSION,
    fileset                 => $fileset,
    sessions                => [undef], # slot 0 unused; sessions are 1-based
    names                   => '',
    name_offsets            => '....',  # 4 padding bytes for unused index 0
    history                 => '',
    history_offsets         => '....',  # 4 padding bytes for unused index 0
    additional_name_history => {},      # name history added at runtime
  };
  return bless($self, $type);
}

# Read a session record from filehandle $fh and return it blessed into
# $type.  The stream starts with a Storable-frozen hash: for record
# versions <= 5 that hash is the entire record; for versions >= 6 it
# carries only the version number and the rest of the record follows as
# line-oriented text (the inverse of store()).  Records in old formats
# are upgraded in memory to the current layout on the way in.
# Dies on an unrecognized version number.
sub load {
  my($type, $fh) = @_;
  my $self = retrieve_fd($fh);
  bless $self, $type;

  # The version must be an integer in 1 .. RECORD_VERSION.
  if (($self->{version} != int($self->{version}))
      || ($self->{version} < 1)
      || ($self->{version} > RECORD_VERSION)) {
    die sprintf("Unknown record version %s\n", $self->{version});
  }

  # Formats 1-4 kept per-name history in the {names} hash and per-inode
  # history in {mtimes}.  Rebuild those into the format-5 shape: a
  # name_table array plus one chronological history array.
  if ($self->{version} <= 4) {
    warn sprintf("Converting session record from old format %d to newer format 5\n",
                 $self->{version});

    $self->{name_table} = [undef]; # index 0 is unused
    my @prehistory = (undef);      # entries paired with provisional indexes
    my %file_entries_by_name_index_session; # dedups FILE entries
    foreach my $name (keys %{$self->{names}}) {
      my $hist = $self->{names}->{$name};
      my $name_index = $self->name_index($name, 1);
      my $name_entry = $self->{name_table}->[$name_index];
      # Walk this name's old history newest-first.
      for (my $i = $#$hist; $i >= 0; --$i) {
        my $hist_item = $hist->[$i];
        if (@$hist_item == 3) {
          # file or hardlink
          my($dev, $ino, $session) = @$hist_item;
          my $inode_history = $self->{mtimes}->{$dev}->{$ino};
          # Find the inode-history item in effect at this session.
          foreach my $inode_history_item (@$inode_history) {
            my($imtime, $isession, $isize, $iname, $iensured) = @$inode_history_item;
            if ($isession <= $session) {
              if (!defined($iname) || ($name eq $iname)) {
                # file
                my $hist_index = $file_entries_by_name_index_session{"$name_index-$session"};
                if ($hist_index) {
                  # warn "DEBUG: Found writeahead file entry ($hist_index) for $name/$session\n";
                } else {
                  push(@prehistory,
                       [scalar(@prehistory), [FILE, $name_index, $session, $iensured, $dev, $ino, $isize, $imtime]]);
                  $hist_index = $#prehistory;
                  $file_entries_by_name_index_session{"$name_index-$session"} = $hist_index;
                }
                push(@$name_entry, $hist_index);
              } else {
                # hardlink: make sure the FILE entry it refers to exists
                # first, then add the HARDLINK entry pointing at it.
                my $iname_index = $self->name_index($iname, 1);
                my $hist_index = $file_entries_by_name_index_session{"$iname_index-$isession"};
                if (!$hist_index) {
                  push(@prehistory,
                       [scalar(@prehistory), [FILE, $iname_index, $isession, $iensured, $dev, $ino, $isize, $imtime]]);
                  $hist_index = $#prehistory;
                  $file_entries_by_name_index_session{"$iname_index-$isession"} = $hist_index;
                  # warn "DEBUG: Created writeahead file entry ($hist_index) for $iname/$isession\n";
                }
                push(@prehistory,
                     [scalar(@prehistory), [HARDLINK, $name_index, $session, $hist_index]]);
                push(@$name_entry, $#prehistory);
              }
              last;
            }
          }
        } else {
          # symlink
          my($target, $session) = @$hist_item;
          push(@prehistory,
               [scalar(@prehistory), [SYMLINK, $name_index, $session, $target]]);
          push(@$name_entry, $#prehistory);
        }
      }
    }
    shift @prehistory;          # remove the leading undef
    # Order history entries chronologically by session number.
    @prehistory = sort { $a->[1]->[2] <=> $b->[1]->[2] } @prehistory;
    $self->{history} = [undef];    # index 0 is unused
    # Map each provisional prehistory index to its final history index.
    my %prehistory_map;
    foreach my $prehistory_entry (@prehistory) {
      my($prehistory_index, $hist_entry) = @$prehistory_entry;
      $prehistory_map{$prehistory_index} = @{$self->{history}};
      push(@{$self->{history}}, $hist_entry);
    }
    # Rewrite hardlink targets from provisional to final indexes.
    for (my $i = 1; $i < @{$self->{history}}; ++$i) {
      my $hist_entry = $self->{history}->[$i];
      if ($hist_entry->[0] == HARDLINK) {
        $hist_entry->[3] = $prehistory_map{$hist_entry->[3]};
      }
    }
    # Likewise for the history indexes hanging off each name entry.
    for (my $i = 1; $i < @{$self->{name_table}}; ++$i) {
      my $name_entry = $self->{name_table}->[$i];
      for (my $j = 2; $j < @$name_entry; ++$j) {
        $name_entry->[$j] = $prehistory_map{$name_entry->[$j]};
      }
    }

    @prehistory = ();
    %prehistory_map = ();

    delete $self->{names};
    delete $self->{mtimes};

    # Old formats tracked only the latest session, in scalar fields.
    $self->{sessions} = [];
    $self->{sessions}->[delete $self->{session}] =
        [delete $self->{session_start_time},
         delete $self->{session_end_time}];
  }

  # Format 5 kept name_table and history as Perl arrays; pack them into
  # the compact strings used by format 6+ (layouts described above).
  if ($self->{version} <= 5) {
    warn sprintf("Converting session record from old format 5 to new format %d\n",
                 RECORD_VERSION);

    # Name table: offset record, then tail + parent index, then each
    # history index flagged with a 0x1 byte, terminated by 0x0.
    my $names = '';
    my $name_offsets = '....';
    my $name_table = delete $self->{name_table};
    for (my $i = 1; $i < @$name_table; ++$i) {
      my $entry = $name_table->[$i];
      my($tail, $parent_index, @hist) = @$entry;
      $name_offsets .= pack('N', length($names));
      $names .= pack('Z*N', $tail, ($parent_index || 0));
      while (@hist) {
        $names .= pack('CN', 1, shift @hist);
      }
      $names .= pack('C', 0);
    }
    $self->{names} = $names;
    $self->{name_offsets} = $name_offsets;

    # History: common (type, name_index, session) header, then a
    # type-dependent tail.
    my $history = '';
    my $history_offsets = '....';
    my $old_history = delete $self->{history};
    for (my $i = 1; $i < @$old_history; ++$i) {
      my $entry = $old_history->[$i];
      my($type, $name_index, $session, @rest) = @$entry;

      $history_offsets .= pack('N', length($history));
      $history .= pack('CNN', $type, $name_index, $session);
      if ($type == FILE) {
        my($ensured, $dev, $ino, $size, $mtime) = @rest;
        $history .= pack('CNNNN', $ensured, $dev, $ino, $size, $mtime);
      } elsif ($type == HARDLINK) {
        $history .= pack('N', $rest[0]); # another hist_index
      } else {                  # symlink
        $history .= pack('Z*', $rest[0]);
      }
    }
    $self->{history} = $history;
    $self->{history_offsets} = $history_offsets;
  }

  # Version 6+ records carry everything after the Storable header as
  # line-oriented text; parse it back into the packed in-memory form.
  if ($self->{version} >= 6) {
    chomp(my $fileset = $fh->getline());
    $self->{fileset} = $fileset;

    chomp(my $nsessions = $fh->getline());
    my @sessions = (undef);     # slot 0 unused; sessions are 1-based
    while ($nsessions-- > 0) {
      chomp(my $session = $fh->getline());
      my($start_time, $end_time) = split(/ /, $session);
      push(@sessions, [$start_time, $end_time]);
    }
    $self->{sessions} = \@sessions;

    # Name table: two lines per name (tail, then parent + history list).
    chomp(my $nnames = $fh->getline());
    my $names = '';
    my $name_offsets = '....';
    while ($nnames-- > 0) {
      chomp(my $tail = $fh->getline());
      chomp(my $info = $fh->getline());
      my($parent, @hist) = split(/ /, $info);
      $name_offsets .= pack('N', length($names));
      $names .= pack('Z*N', $tail, $parent);
      while (@hist) {
        $names .= pack('CN', 1, shift @hist);
      }
      $names .= pack('C', 0);
    }
    $self->{names} = $names;
    $self->{name_offsets} = $name_offsets;

    # History: one line per entry; the line's tail depends on the type.
    chomp(my $nhistory = $fh->getline());
    my $history = '';
    my $history_offsets = '....';
    while ($nhistory-- > 0) {
      chomp(my $entry = $fh->getline());
      my($type, $name_index, $session, $rest) = split(/ /, $entry, 4);
      $history_offsets .= pack('N', length($history));
      my @entry = ($type, $name_index, $session); # xxx @entry is unused
      $history .= pack('CNN', $type, $name_index, $session);
      if ($type == FILE) {
        my($ensured, $dev, $ino, $size, $mtime) = split(/ /, $rest);
        $history .= pack('CNNNN', $ensured, $dev, $ino, $size, $mtime);
      } elsif ($type == HARDLINK) {
        $history .= pack('N', $rest); # another hist_index
      } elsif ($type == EMPTY) {
        my($dev, $ino, $mtime) = split(/ /, $rest);
        $history .= pack('NNN', $dev, $ino, $mtime);
      } elsif ($type == UPDATE) {
        my($dev, $ino, $hist_index) = split(/ /, $rest);
        $history .= pack('NNN', $dev, $ino, $hist_index);
      } else {                  # symlink
        $history .= pack('Z*', $rest);
      }
    }
    $self->{history} = $history;
    $self->{history_offsets} = $history_offsets;
  }

  return $self;
}

# Serialize this record to $file in the version-6+ on-disk format: a
# Storable-frozen header hash holding only the record version, followed
# by line-oriented text -- fileset name, session count and per-session
# start/end times, then the name table and the history table (see the
# format notes above).  Returns 1 on success, undef on any I/O failure.
sub store {
  my($self, $file) = @_;

  # 3-arg-style open (filename and mode separate) instead of the old
  # indirect-object 2-arg form, so a hostile filename can't inject a mode.
  my $fh = IO::File->new($file, '>')
      or return undef;

  # Only the version goes through Storable; everything else is text.
  my %version_hash = (version => RECORD_VERSION);
  nstore_fd(\%version_hash, $fh)
      or return undef;          # previously this result was ignored

  $fh->print($self->{fileset}, "\n");
  $fh->print($#{$self->{sessions}}, "\n"); # session count (array is 1-based)
  for (my $i = 1; $i < @{$self->{sessions}}; ++$i) {
    if (my $session = $self->{sessions}->[$i]) {
      my($start_time, $end_time) = @$session;
      $fh->printf("%d %d\n", $start_time, $end_time);
    } else {
      $fh->print("0 0\n");      # placeholder for an unknown session slot
    }
  }
  my $max_name_index = $self->max_name_index();
  $fh->print($max_name_index, "\n");
  for (my $i = 1; $i <= $max_name_index; ++$i) {
    my $entry = $self->name_entry_at_index($i);
    my($tail, $parent, @hist) = @$entry;
    $parent ||= 0;
    $fh->print($tail, "\n");
    $fh->print(join(' ', $parent, @hist), "\n");
  }
  my $max_history_index = $self->max_history_index();
  $fh->print($max_history_index, "\n");
  for (my $i = 1; $i <= $max_history_index; ++$i) {
    my $entry = $self->history_entry_at_index($i);
    my($type, $name_index, $session, @rest) = @$entry;
    $fh->print("$type $name_index $session ");
    if ($type == FILE) {
      my($ensured, $dev, $ino, $size, $mtime) = @rest;

      $ensured ||= 0;
      $size = -1 unless defined($size);

      $fh->print("$ensured $dev $ino $size $mtime");
    } elsif ($type == HARDLINK) {
      my($history_index) = @rest;
      $fh->print($history_index);
    } elsif ($type == EMPTY) {
      my($dev, $ino, $mtime) = @rest;
      $fh->print("$dev $ino $mtime");
    } elsif ($type == UPDATE) {
      my($dev, $ino, $hist_index) = @rest;
      $fh->print("$dev $ino $hist_index");
    } else {                    # SYMLINK
      my($target_name) = @rest;
      $fh->print($target_name);
    }
    $fh->print("\n");
  }
  # Buffered write errors surface at close; don't report success blindly.
  $fh->close()
      or return undef;

  return 1;
}

# Read accessors for the record's fileset name and format version.
sub fileset {
  my $self = shift;
  return $self->{fileset};
}

sub version {
  my $self = shift;
  return $self->{version};
}

# Corresponding write accessors.
sub set_fileset {
  my($self, $fileset) = @_;
  $self->{fileset} = $fileset;
}

sub set_version {
  my($self, $version) = @_;
  $self->{version} = $version;
}

# Current (latest) session number: the highest index of the 1-based
# sessions array (slot 0 is an unused placeholder).
sub session {
  my $self = shift;
  my $sessions = $self->{sessions};
  return scalar(@$sessions) - 1;
}

# Start time of the latest session.
sub session_start_time {
  my $self = shift;
  return $self->{sessions}->[-1]->[0];
}

# End time of the latest session.
sub session_end_time {
  my $self = shift;
  return $self->{sessions}->[-1]->[1];
}

# Record the start and end times for the given session number.
sub set_session_times {
  my($self, $session, $start, $end) = @_;

  my $pair = [$start, $end];
  $self->{sessions}->[$session] = $pair;
}

# Byte offset within {history} of the entry at 1-based $index, read
# from the packed 32-bit offset table (whose first 4 bytes are unused).
sub history_offset_at_index {
  my($self, $index) = @_;

  my $packed = substr($self->{history_offsets}, $index * 4, 4);
  return unpack('N', $packed);
}

# Unpack and return the history entry at 1-based $index as an array
# ref: [type, name_index, session, ...] where the tail depends on the
# type (see the format notes above).  $path_max is a file-level global
# bounding the substr taken for a NUL-terminated symlink target;
# unpack('Z*') stops at the NUL.
sub history_entry_at_index {
  my($self, $index) = @_;

  my $offset = $self->history_offset_at_index($index);

  # type, name_index, session
  my @result = unpack('CNN', substr($self->{history}, $offset, 9));
  $offset += 9;

  my $type = $result[0];

  if ($type == FILE) {
    # ensured, dev, ino, size, mtime
    push(@result,
         unpack('CNNNN', substr($self->{history}, $offset, 17)));
    $offset += 17;
  } elsif ($type == HARDLINK) {
    # history index of the entry whose contents this link shares
    push(@result,
         unpack('N', substr($self->{history}, $offset, 4)));
    $offset += 4;
  } elsif ($type == EMPTY) {
    # dev, ino, mtime
    push(@result,
         unpack('NNN', substr($self->{history}, $offset, 12)));
  } elsif ($type == UPDATE) {
    # dev, ino, hist_index of the file's previous entry
    push(@result,
         unpack('NNN', substr($self->{history}, $offset, 12)));
  } else {                      # symlink
    # NUL-terminated link-target path
    my $target_name = unpack('Z*',
                             substr($self->{history},
                                    $offset, $path_max + 1));
    $offset += (1 + length($target_name));
    push(@result, $target_name);
  }

  return \@result;
}

# Build (once) and return the {inodes} index: a hash mapping "dev-ino"
# to a packed string of 32-bit history indexes for that inode, in
# increasing history order.  Symlink and hardlink entries carry no
# device/inode of their own and are skipped.
sub ensure_inodes {
  my $self = shift;

  return $self->{inodes} if defined($self->{inodes});

  my %inodes;
  my $max = $self->max_history_index(); # history isn't modified below
  for (my $index = 1; $index <= $max; ++$index) {
    my($type, @fields) = @{$self->history_entry_at_index($index)};

    my($dev, $ino);
    if ($type == FILE) {
      # fields: name_index, session, ensured, dev, ino, size, mtime
      ($dev, $ino) = @fields[3, 4];
    } elsif (($type == EMPTY) || ($type == UPDATE)) {
      # fields: name_index, session, dev, ino, ...
      ($dev, $ino) = @fields[2, 3];
    } else {
      next;                     # symlinks/hardlinks have no inode
    }

    my $key = "$dev-$ino";
    $inodes{$key} = '' unless defined($inodes{$key});
    $inodes{$key} .= pack('N', $index);
  }

  $self->{inodes} = \%inodes;
  return $self->{inodes};
}

# Unpack and return the name-table entry at 1-based $index as an array
# ref: [tail, parent_index, hist_index...].  History indexes come from
# the packed record in {names} plus anything appended at runtime via
# {additional_name_history}.  $path_max is a file-level global bounding
# the substr taken for the NUL-terminated tail; unpack('Z*') stops at
# the NUL.
sub name_entry_at_index {
  my($self, $index) = @_;

  my $offset = unpack('N', substr($self->{name_offsets}, 4 * $index, 4));
  my $name_tail = unpack('Z*',
                         substr($self->{names}, $offset, $path_max + 1));
  $offset += (1 + length($name_tail));

  # 32-bit name index of the parent directory (0 = no parent).
  my $parent_index = unpack('N', substr($self->{names}, $offset, 4));
  $offset += 4;

  my @result = ($name_tail, $parent_index);
  # Each history index is flagged by a 0x1 byte; a 0x0 byte ends the list.
  while (unpack('C', substr($self->{names}, $offset++, 1))) {
    push(@result, unpack('N', substr($self->{names}, $offset, 4)));
    $offset += 4;
  }
  if ($index) {
    # Runtime-added history is kept aside so {names} offsets stay valid.
    if (my $additional_history = $self->{additional_name_history}->{$index}) {
      push(@result, @$additional_history);
    }
  }

  return \@result;
}

# Return the trie level (hash) holding the children of the name entry
# at $index; index 0 (no parent) means the root {name_indexes} hash.
# Used while building the name index; assumes every ancestor of $index
# has already been entered into the trie.
sub ensure_name_indexes_aux {
  my($self, $index) = @_;

  return $self->{name_indexes} unless $index;
  my $entry = $self->name_entry_at_index($index);
  my($name, $parent_index) = @$entry;
  my $hash = $self->ensure_name_indexes_aux($parent_index);
  my $lookup = $hash->{$name};
  $lookup->[1] ||= {};          # vivify the child level on first use
  return $lookup->[1];
}

# Build (once) and return {name_indexes}: a trie of hashes mirroring
# the name table, where each level maps a path element to the pair
# [name_index, child-level hash].
sub ensure_name_indexes {
  my $self = shift;

  unless (defined($self->{name_indexes})) {
    $self->{name_indexes} = {};

    my $max = $self->max_name_index();
    for (my $index = 1; $index <= $max; ++$index) {
      my($tail, $parent_index) = @{$self->name_entry_at_index($index)};
      my $level = $self->ensure_name_indexes_aux($parent_index);
      $level->{$tail} = [$index];
    }
  }

  return $self->{name_indexes};
}

# Map a pathname to its (1-based) name index, walking the name trie one
# path element at a time.  A numeric argument is assumed to already be
# an index and is returned as-is.  With $create true, missing entries
# are appended to {names}/{name_offsets} (and the trie) on the way
# down; otherwise an unknown name yields undef.
sub name_index {
  my($self, $name_arg, $create) = @_;

  confess('name_index: undefined name')
      unless defined($name_arg);

  # Numeric pass-through; pathnames always int() to 0.
  return int($name_arg) if int($name_arg);

  my $name = $name_arg;

  my $hash = $self->ensure_name_indexes();
  my $index;

  # Split the path into its non-empty "/"-separated elements.
  my @elts = ($name =~ m|([^/]+)|g);
  foreach my $elt (@elts) {
    my $lookup = $hash->{$elt};
    if (!defined($lookup)) {
      return undef unless $create;
      # Append a new name record: tail, parent index, empty history.
      my $new_offset = length($self->{names});
      $self->{names} .= pack('Z*NC', $elt, ($index || 0), 0);
      $index = length($self->{name_offsets}) / 4;
      $self->{name_offsets} .= pack('N', $new_offset);
      my $new_hash = {};
      my $new_entry = [$index, $new_hash];
      $hash->{$elt} = $new_entry;
      $hash = $new_hash;
    } else {
      ($index, $hash) = @$lookup;
    }
  }

  return $index;
}

# Convert a name index back into a full "/"-separated pathname by
# walking the parent links up to the root.  A non-numeric argument is
# assumed to already be a pathname and is returned unchanged.
sub index_to_name {
  my($self, $index_arg) = @_;

  return $index_arg unless int($index_arg);

  my($tail, $parent_index) = @{$self->name_entry_at_index($index_arg)};
  my $prefix = '';
  $prefix = $self->index_to_name($parent_index) if $parent_index;
  return "$prefix/$tail";
}

# Decide whether $name_arg is a "new" name for inode ($dev, $ino):
# returns 1 when the name has no history, currently names a symlink, or
# its latest history entry refers to a different inode; returns undef
# when the latest entry already refers to this inode on $dev or on any
# of the equivalent device numbers in @other_devs (from .devmap).
sub is_new_name_for_inode {
  my($self, $name_arg, $dev, $ino, @other_devs) = @_;

  my $name_index = $self->name_index($name_arg, 1);

  # Bug fix: this previously called latest_for_name() on the package
  # global $record rather than on $self, which is wrong for any record
  # object other than the global one.
  my $entry = $self->latest_for_name($name_index)
      or return 1;

  my $type = $entry->[0];

  if ($type == SYMLINK) {
    return 1;
  }

  # Chase hardlink chains to the entry that carries the dev/ino pair.
  while ($type == HARDLINK) {
    my $hist_index = $entry->[3];
    $entry = $self->history_entry_at_index($hist_index);
    $type = $entry->[0];
  }

  my($odev, $oino);
  if (($type == EMPTY) || ($type == UPDATE)) {
    ($odev, $oino) = ($entry->[3], $entry->[4]);
  } else {                      # assert: $type == FILE
    ($odev, $oino) = ($entry->[4], $entry->[5]);
  }

  return 1 if ($ino != $oino);
  return 1 unless grep { $_ == $odev } ($dev, @other_devs);
  return undef;
}

# Highest valid (1-based) history index: one 4-byte slot per entry in
# the offset table, minus the unused slot 0.
sub max_history_index {
  my $self = shift;
  my $slots = length($self->{history_offsets}) / 4;
  return $slots - 1;
}

# Highest valid (1-based) name index, computed the same way.
sub max_name_index {
  my $self = shift;
  my $slots = length($self->{name_offsets}) / 4;
  return $slots - 1;
}

# Invoke $cb once per leaf name -- any name entry carrying at least one
# history index -- passing the reconstructed full pathname.
# xxx todo: sort by name as in early versions?
sub foreach_name {
  my($self, $cb) = @_;

  my $max = $self->max_name_index();
  for my $index (1 .. $max) {
    my $entry = $self->name_entry_at_index($index);
    next unless @$entry > 2;    # tail + parent only => no history
    &$cb($self->index_to_name($index));
  }
}

# Walk the history of $name_arg newest-first, dispatching each entry to
# the callback matching its type.  Any callback may be undef to skip
# entries of that type.  File/symlink/empty/update callbacks receive
# the entry's fields after the type byte; the hardlink callback
# receives this name's index and session followed by the fields of the
# entry the link points at.
sub foreach_name_history {
  my($self, $name_arg, $file_cb, $hardlink_cb, $symlink_cb, $empty_cb, $update_cb) = @_;

  my $name_index = $self->name_index($name_arg);
  my $name_entry = $self->name_entry_at_index($name_index);
  # Slots 0 and 1 are tail and parent; history indexes start at slot 2.
  for (my $i = $#$name_entry; $i >= 2; --$i) {
    my $hist_index = $name_entry->[$i];
    my $hist_entry = $self->history_entry_at_index($hist_index);
    my $type = $hist_entry->[0];
    if ($file_cb && $type == FILE) {
      &$file_cb(@{$hist_entry}[1 .. $#$hist_entry]);
    } elsif ($hardlink_cb && $type == HARDLINK) {
      # Resolve the link target's history entry for the callback.
      my $other_hist_index = $hist_entry->[3];
      my $other_hist_entry = $self->history_entry_at_index($other_hist_index);
      &$hardlink_cb($name_index, $hist_entry->[2], @{$other_hist_entry}[1 .. $#$other_hist_entry]);
    } elsif ($symlink_cb && $type == SYMLINK) {
      &$symlink_cb(@{$hist_entry}[1 .. $#$hist_entry]);
    } elsif ($empty_cb && $type == EMPTY) {
      &$empty_cb(@{$hist_entry}[1 .. $#$hist_entry]);
    } elsif ($update_cb && $type == UPDATE) {
      &$update_cb(@{$hist_entry}[1 .. $#$hist_entry]);
    }
  }
}

# Most recent history entry recorded for inode ($dev, $ino); returns
# the pair (hist_index, entry ref), or undef when the inode is unknown.
# With @other_devs (equivalent device numbers from .devmap), each
# device is tried in turn and the entry with the highest session
# number wins.
sub latest_for_inode {
  my($self, $dev, $ino, @other_devs) = @_;

  my $inodes = $self->ensure_inodes();

  if (@other_devs) {
    my $latest_session;
    my $result_index;
    my $result_entry;
    foreach my $d ($dev, @other_devs) {
      # Recurse without @other_devs to do the single-device lookup.
      my($index, $entry) = $self->latest_for_inode($d, $ino);
      next unless defined($index);
      # my $entry = $self->history_entry_at_index($index);
      my $session = $entry->[2]; # entry layout: type, name_index, session, ...
      if (!defined($latest_session)
          || ($session > $latest_session)) {
        $result_index = $index;
        $result_entry = $entry;
        $latest_session = $session;
      }
    }
    return ($result_index, $result_entry);
  }

  my $key = "$dev-$ino";
  my $hist = $inodes->{$key};
  return undef unless (defined($hist) && length($hist));
  # Indexes are appended in order, so the last 4 bytes are the newest.
  my $index = unpack('N', substr($hist, -4));
  return ($index, $self->history_entry_at_index($index));
}

# Most recent history entry for a name (or name index).  Returns undef
# when the name is unknown or has no history.  With $return_hist_index
# true, returns the pair (hist_index, entry) instead of just the entry.
sub latest_for_name {
  my($self, $name_arg, $return_hist_index) = @_;

  my $name_index = $self->name_index($name_arg)
      or return undef;
  my $name_entry = $self->name_entry_at_index($name_index);
  return undef if @$name_entry <= 2; # no history recorded for this name
  my $hist_index = $name_entry->[-1]; # indexes are appended in order
  my $entry = $self->history_entry_at_index($hist_index);
  return ($hist_index, $entry) if $return_hist_index;
  return $entry;
}

# Mark the FILE history entry at $hist_index as "ensured", i.e. its
# presence in the archive has been verified (see --ensure).  The flag
# is the single byte following the packed type (1 byte), name index
# (4 bytes) and session number (4 bytes) -- hence offset + 9.
sub set_ensured {
  my($self, $hist_index) = @_;

  # (Removed a pure history_entry_at_index() lookup whose result was
  # never used.)
  my $hist_offset = $self->history_offset_at_index($hist_index);
  # xxx todo: verify the entry's type byte is FILE before flagging
  substr($self->{history}, $hist_offset + 9, 1) = pack('C', 1);
}

# Latest history entry for $name_arg, but only if it's a plain FILE
# entry; undef otherwise.
sub lookup_file {
  my($self, $name_arg) = @_;

  my $entry = $self->latest_for_name($name_arg);
  return ($entry && ($entry->[0] == FILE)) ? $entry : undef;
}

# If the latest history entry for $name is a SYMLINK, return its target
# path; otherwise undef.
sub lookup_symlink {
  my($self, $name) = @_;

  my $hist_entry = $self->latest_for_name($name)
      or return undef;
  return undef unless $hist_entry->[0] == SYMLINK;
  return $hist_entry->[3];      # layout: type, name_index, session, target
}

# Append a HARDLINK history entry for $name_arg in the current session
# ($session is a file-level global), pointing at the history entry
# whose contents this name shares.
sub new_hardlink {
  my($self, $name_arg, $history_index) = @_;
  return $self->new_history_item(HARDLINK, $name_arg, $session, $history_index);
}

# Append a SYMLINK history entry for $name_arg in the current session,
# recording the link's target path.
sub new_symlink {
  my($self, $name_arg, $target) = @_;
  return $self->new_history_item(SYMLINK, $name_arg, $session, $target);
}

# Append a FILE history entry for $name_arg in the current session
# ($session is a file-level global), capturing dev/ino/size/mtime from
# the File::stat object $st, then keep the inode index current.  The
# "ensured" flag starts at 0.
sub new_file {
  my($self, $name_arg, $st) = @_;

  my $dev = $st->dev();
  my $ino = $st->ino();

  my $hist_index = $self->new_history_item(FILE, $name_arg, $session,
                                           0, $dev, $ino,
                                           $st->size(), $st->mtime());

  $self->maybe_update_inodes($dev, $ino, $hist_index);
}

# Append an EMPTY history entry (zero-length file; never copied to the
# archive) for $name_arg in the current session, then keep the inode
# index current.
sub new_empty_file {
  my($self, $name_arg, $st) = @_;

  my $dev = $st->dev();
  my $ino = $st->ino();

  my $hist_index = $self->new_history_item(EMPTY, $name_arg, $session,
                                           $dev, $ino, $st->mtime());

  $self->maybe_update_inodes($dev, $ino, $hist_index);
}

# Append an UPDATE history entry: the file at $name_arg is unchanged
# but now lives at a new dev/ino pair (e.g. after a restore), so record
# the new pair plus a pointer to the file's previous history entry,
# then keep the inode index current.
sub new_update {
  my($self, $name_arg, $st, $old_hist_index) = @_;

  my $dev = $st->dev();
  my $ino = $st->ino();

  my $hist_index = $self->new_history_item(UPDATE, $name_arg, $session,
                                           $dev, $ino, $old_hist_index);

  $self->maybe_update_inodes($dev, $ino, $hist_index);
}

# If the {inodes} index has already been built (see ensure_inodes),
# append $hist_index to the packed list for this dev/ino pair; if it
# hasn't, do nothing -- the index will be rebuilt on demand anyway.
sub maybe_update_inodes {
  my($self, $dev, $ino, $hist_index) = @_;

  my $inodes = $self->{inodes};
  return unless defined($inodes);

  my $key = "$dev-$ino";
  $inodes->{$key} = '' unless defined($inodes->{$key});
  $inodes->{$key} .= pack('N', $hist_index);
}

# Append a history entry of the given $kind for $name_arg in $session,
# packing @rest per the per-type layouts described above, and register
# the new entry's index in the name's runtime history list.  Also
# increments the file-level $changes counter.  Returns the new
# (1-based) history index.
sub new_history_item {
  my($self, $kind, $name_arg, $session, @rest) = @_;

  my $name_index = $self->name_index($name_arg, 1);
  my $hist_offset = length($self->{history});
  my $hist_index = length($self->{history_offsets}) / 4;
  $self->{history} .= pack('CNN', $kind, $name_index, $session);
  if ($kind == FILE) {
    # ensured, dev, ino, size, mtime
    $self->{history} .= pack('CNNNN', @rest);
  } elsif ($kind == HARDLINK) {
    $self->{history} .= pack('N', @rest); # target history index
  } elsif ($kind == EMPTY) {
    $self->{history} .= pack('NNN', @rest); # dev, ino, mtime
  } elsif ($kind == UPDATE) {
    $self->{history} .= pack('NNN', @rest); # dev, ino, old hist_index
  } else {                      # SYMLINK
    $self->{history} .= pack('Z*', @rest); # NUL-terminated target path
  }
  $self->{history_offsets} .= pack('N', $hist_offset);

  # New name-history goes in a side table so {names} offsets stay valid.
  $self->{additional_name_history} ||= {};
  $self->{additional_name_history}->{$name_index} ||= [];
  push(@{$self->{additional_name_history}->{$name_index}}, $hist_index);

  ++$changes;                   # file-level change counter

  return $hist_index;
}

__END__

=head1 NAME

ibid - incremental backups to infinite disk

=head1 SYNOPSIS

ibid [options] FILESET TARGET

ibid --dump FILESET [PATTERN ...]

=head1 DESCRIPTION

Ibid is a simple tool for performing incremental backups.  An incremental
backup is a backup of just those files that have changed since the
last backup.

Ibid backs up files to any destination that can be mounted as a
writable filesystem.  It was developed for the author's use with
Jungledisk (L<http://www.jungledisk.com/>, a commercial "infinite
filesystem" service) in conjunction with davfs
(L<http://dav.sourceforge.net/>), but any similar service or mountable
media should also work.

To back up files, you must first define a I<fileset>, which is an
ordinary text file listing files and directories to include in the
backup, one per line.  This file, named F<INCLUDE>, is stored in the
I<fileset directory>, which is a directory with the same name as the
fileset and is normally found under the directory F<$HOME/.ibid>.  For
example, the list of inclusions for a fileset named I<foo> would be in
F<$HOME/.ibid/foo/INCLUDE> and might contain the following:

 /home
 /var
 /etc
 /usr/local

Each item in the fileset definition must be a full pathname.  Directories are
processed recursively but filesystem boundaries (a.k.a. mountpoints) are never
crossed.

You may also define a set of I<exclusions> in
F<$HOME/.ibid/I<fileset>/EXCLUDE>.  This is a list of Perl regular
expressions, one per line.  These are matched against each candidate
file and directory; if a match is found, the file or directory is
excluded from the backup.  The pattern is matched against the full
pathname of the candidate file or directory.  You must explicitly use
C<^> and C<$> to anchor your match to the beginning and end of the
pathname.  Example:

 ~$
 /#[^/]+#$
 /tmp/
 /core$
 ^/home/bobg/src/extern/

This example excludes Emacs-style backup and checkpoint files; everything
under any directory named F<tmp>; any file named F<core>; and the tree rooted
at F</home/bobg/src/extern/>.

With the fileset and optional exclusions defined, start a backup with:

 ibid FILESET TARGET

where TARGET is a directory on the destination filesystem; e.g.,
F</mnt/jungledisk>.  Ibid will copy eligible files from the fileset to
a new tree rooted at F<TARGET/FILESET/1>.  Here, 1 denotes this is the
first session for FILESET; each time ibid runs it creates a new
session.  A record of the backup is stored in
F<$HOME/.ibid/FILESET/sessions/FILESET.1> and in
F<TARGET/FILESET/1/FILESET.1>.  If you later reinvoke

 ibid FILESET TARGET

then any files created or changed since session 1 will be copied to a
tree rooted at F<TARGET/FILESET/2>, and a cumulative record of all
sessions will be written to F<FILESET.2>.

Once the session number exceeds 100, an extra level is added to the target
paths: e.g., session 108 is rooted at F<TARGET/FILESET/100/108>, session 412
is rooted at F<TARGET/FILESET/400/412>.  Another level is added for each new
power of ten (1000, 10000, 100000, etc.), so that, for instance, session 7311
is rooted at F<TARGET/FILESET/7000/300/7311>, and session 29582 is rooted at
F<TARGET/FILESET/20000/9000/500/29582>.  Path elements that would start with a
0 are omitted; e.g., session 4006 is rooted at F<TARGET/FILESET/4000/4006>,
I<not> at F<TARGET/FILESET/4000/000/4006>.

Symlinks are not copied to the target filesystem, but they are added
to the session record.  Hard links are detected and are also added to
the session record only.  In both cases the target of the link is
recorded too, permitting a later "restore."

If a file is renamed between sessions but remains on the same
filesystem and is otherwise unchanged, the new name is treated as a
hard link: the file's contents are not recopied to the target
filesystem, but the name is added to the session record.

Ibid uses a file's device and inode numbers (as well as its size and
last-modified date) to track its identity across renames.  Under some
circumstances, a filesystem's device number can change, making it look to ibid
as if all the files on it are new and causing them to be recopied to the
target filesystem.  To prevent this, create a file named
F<$HOME/.ibid/.devmap>.  Each line in this file is taken to be a list of
device numbers all referring to the same filesystem.  So if /mnt/space has had
device numbers 2097, 2143, and 2146; and if /mnt/extra has had device numbers
3654 and 3670, then the .devmap file should look like this:

  2097 2143 2146
  3654 3670

To discover the device number of a filesystem, choose any file I<F> on the
filesystem and run this command:

  perl -e '@x=stat("F");print $x[0],"\n"'

You may compress old session records in F<$HOME/.ibid/*/sessions> with
C<gzip>, C<bzip2>, or C<xz>.  Ibid will still be able to read them.  Note that
ibid only needs access to the latest session record for a given fileset; the
data in the session records is cumulative.  Older session records may be
archived elsewhere or even deleted if you're feeling confident.

=head1 OPTIONS

=over 4

=item --no (or -n)

Don't make any changes: files are not copied and a new session record
is not written.  With C<-v>, this is a good way to preview what a
backup would do.

=item --verbose (or -v)

Increase verbosity.  Each C<-v> adds more.

=item --dir DIR (or -d DIR)

Look for filesets and sessions in I<DIR>.  Default is C<$HOME/.ibid>.

=item --session-size-limit LIMIT (or -S LIMIT)

Limit this session to copying I<LIMIT> bytes.  Copying ends I<after>
the file that crosses the I<LIMIT> threshold, so it's possible for
I<LIMIT> to be exceeded by quite a lot.

I<LIMIT> may have the suffix C<k>, C<m>, or C<g> to denote kilobytes,
megabytes, or gigabytes.

=item --session-files-limit LIMIT (or -F LIMIT)

Limit this session to copying I<LIMIT> files.  Empty files, hardlinks, and
symlinks do not count against this total.  If both C<--session-files-limit>
and C<--session-size-limit> are used, the session ends after the first limit
is reached, whichever that is.

=item --single-file-size-limit LIMIT (or -s LIMIT)

Skip files larger than I<LIMIT> bytes.  By default, files are not excluded
based on size.

I<LIMIT> may have the suffix C<k>, C<m>, or C<g> to denote kilobytes,
megabytes, or gigabytes.

=item --preserve-mode/--nopreserve-mode (or -m)

Copy the mode bits of backed-up files or don't.  Default is to copy
them.

=item --preserve-owner/--nopreserve-owner (or -o)

Copy the uid and gid of backed-up files or don't.  Default is to copy
them.

=item --preserve-time/--nopreserve-time (or -t)

Copy the last-access and last-modification times of backed-up files or
don't.  Default is I<not> to copy them (because Jungledisk+davfs,
which the author used during development of ibid, doesn't support that
operation [yet?]).

=item --restore-atime/--norestore-atime (or -a)

After copying files to the target filesystem, restore the file's
original last-access time or don't.  Default is to restore.

=item --recheck-mtime/--norecheck-mtime

If C<--restore-atime> is in effect, enabling this option causes ibid
to double-check the original file's last-modification
time against its original value after the last-access time is
restored.  This is to detect the bug in some versions of the Linux
kernel and/or the ext3 filesystem that caused utime(2) to store bogus
values.  Default is to perform the recheck and C<die> if the bug is
detected.

=item --ensure/--noensure (or -e)

Before deciding to skip a file because it's unchanged from its last
backup, ibid will check the supposed backup copy to see if it's
present and has the proper size.  If not, it'll get backed up "again."
This option defends against the possibility that your "infinite disk"
service dropped some data undetected, as can happen with Jungledisk
(for instance) when using "background mode" and three consecutive
retries all fail (as of version 1.13, see
L<http://forum.jungledisk.com/viewtopic.php?t=421>).

For purposes of this option, ibid assumes all prior sessions can be
found under the same TARGET root as the current one.

Using this option is costly in time and network I/O, so the default is
off.  Once a file's presence in the archive is verified, it is flagged
as "ensured" and ibid will not try to reverify it in future sessions.

=item --check-names

When testing a file to see if it's been backed up, ibid uses the file's device
and inode number to look it up in the backup history.  If a collection of
files is copied to a new filesystem under identical names (as in the case of a
restore-from-backup), they'll have new device and inode numbers and so will
look like they need backing up again.  To avoid this, use --check-names the
first time running ibid after such a file copy or restore.  It looks up the
file by name when device/inode lookup fails, detects the file is unchanged
since an earlier backup, and writes a new record to the backup history
recording the new device/inode pair.

=item --prune-sessions N

After a successful run, and after storing the session file, "prune" the
sessions directory by removing all but the I<N> latest session files (since
each session file duplicates all the information in its predecessors).  By
default, no old session files are removed.

=item --version (or -V)

Report the version of ibid and exit.

=item --dump (or -D)

Rather than perform a backup, ibid dumps the contents of a session record (the
latest session for the given fileset) to standard output.  The information
includes the session number, the start and end times for that session, and the
complete list of filenames, including symlinks and hard links.  Beneath each
filename is a list of sessions in which that filename appears, and either
S<(a) the> last-modification time and size of the file when it was backed up
in that session, S<(b) the> symbol C<S-E<gt>> (to connote a symlink) and the
target of the link, S<(c) the> symbol C<H-E<gt>> (to connote a hard link) and
the pathname under which the contents were most recently copied to the
archive, or S<(d) the> symbol C<Z> (to connote a zero-length file, which isn't
copied to the archive).  Additionally, if a file was backed up in any session
and its presence in the archive was verified with C<--ensure> in a subsequent
session, the entry will have an E after it.

After the fileset, you may specify on the command line any number of
Perl regular expressions; only matching filenames will be included in
the output.

=item --trim-report

Produces a primitive report (after completing a backup session) of files that
you may wish to delete from the target archive, either because they match an
exclusion rule (i.e., they were backed up in earlier sessions, before you
added the exclusion rule they now match) or because duplicate copies were made
(e.g., because a filesystem's device number changed with no corresponding
entry in F<.devmap>).

=back

=head1 FILES

=over 4

=item $HOME/.ibid/I<FILESET>/INCLUDE

=item $HOME/.ibid/I<FILESET>/EXCLUDE

=item $HOME/.ibid/.devmap

=item $HOME/.ibid/I<FILESET>/sessions/I<FILESET>.1, I<FILESET>.2, ...

=back

=head1 BUGS

=over 4

=item The rename-detecting algorithm can be fooled, meaning files may be
recorded as mere hardlinks to previously archived files when in fact they are
not and should be separately archived.  (For this to happen, a file must share
the same last-modified time and size as another file that previously resided
at the same device and inode, which is a fairly unlikely combination of
conditions.)

=back

=head1 SEE ALSO

L<perlre>, L<mount(8)>, L<utime(2)>

=head1 AUTHOR

Bob Glickstein <bobg@emphatic.com>
