#!/local/perl586/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use Getopt::Long;

use vars qw(
    $corpusdir
    $opt_override
    $opt_tag
);
GetOptions(
    "tag=s" => \$opt_tag,
    "dir=s" => \$corpusdir,
    "override=s" => \$opt_override,
);

$opt_override ||= '';
$opt_tag ||= 'n';       # nightly is the default

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd;
use POSIX qw(nice strftime);

use constant WEEK => 60*60*24;
nice(15);

# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
# the time of day when the mass-check tagging occurs; see
# http://wiki.apache.org/spamassassin/DateRev for more details.
use constant DATEREV_ADJ => - (8 * 60 * 60);

my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my %revision = ();
my %filesize = ();
my %dateline = ();
my %mtime = ();
my %logs_by_daterev = ();
my %is_net_daterev = ();
my %time = ();
my @files;
my @tmps = ();
my $skip = '';
my $time_start = time;
my $output_revpath;

&configure;
&init;

if ($corpusdir) {
  print "reading logs from '$corpusdir'\n";
}
else {
  $corpusdir = $opt{corpus};
  &update_rsync;
}

&locate;
&current;
&clean_up;

sub configure {
  # does rough equivalent of source
  open(C, $configuration) || die "open failed: $configuration: $!\n";
  my $pwd = getcwd;

  # add 'override' options
  my @lines = (<C>, split(/\|/, $opt_override));

  foreach $_ (@lines) {
	chomp;
	s/#.*//;
	if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
          my ($key, $val) = ($1, $2);
          $val =~ s/\$PWD/$pwd/gs;
	  $opt{$key} = $val;
	}
  }
  close(C);
}

sub clean_up {
  system "rm -f $opt{tmp}/*.$$ ".join(' ', @tmps);
}

sub init {
  $SIG{INT} = \&clean_up;
  $SIG{TERM} = \&clean_up;

  $ENV{RSYNC_PASSWORD} = $opt{password};
  $ENV{TIME} = '%e,%U,%S';
  $ENV{TZ} = 'UTC';
}

sub update_rsync {
  chdir $corpusdir;

  # allow non-running of rsync under some circumstances
  if ($opt{rsync_command}) {
    system $opt{rsync_command};
  } else {
    system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
  }

  # this block is no longer required -- we do sensible things with modtime
  # comparisons to work it out!
  if (0 && !$opt{always_update_html}) {
    if (-f "rsync.last") {
      open(FIND, "find . -type f -newer rsync.last |");
      my $files = "";
      while(<FIND>) {
        $files .= $_;
      }
      close(FIND);
      if (! $files) {
        print STDERR "no new corpus files\n";
        if (rand(24) > 1) {
          exit 0;
        }
        else {
          print STDERR "updating anyway\n";
        }
      }
    }
  }

  open(RSYNC, "> rsync.last");
  close(RSYNC);
  system "chmod +r *.log";
}

sub locate {
  opendir(CORPUS, $corpusdir);
  @files = sort readdir(CORPUS);
  closedir(CORPUS);

  @files = grep {
    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10 
  } @files;

  foreach my $file (@files) {
    my $tag = 0;
    my $headers = '';

    open(FILE, "$corpusdir/$file") or warn "cannot read $corpusdir/$file";
    while (my $line = <FILE>) {
      last if $line !~ /^#/;
      $headers .= $line;
      if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
        my ($datepre, $hh, $datepost) = ($1,$2,$3);
        
        my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
                    GMT => 1, PREFER_PAST => 1);

        $time{$file} = $timet;
      }
      elsif ($line =~ m/^# Date:\s*(\S+)/) {
        # a better way to do the above.  TODO: parse it instead
        $dateline{$file} = $1;
      }
      elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $revision{$file} = $1;
      }
    }
    close(FILE);

    my @s = stat("$corpusdir/$file");
    $filesize{$file} = $s[7];
    $mtime{$file} = $s[9];

    if (!defined $time{$file}) {
      warn "$corpusdir/$file: no time found, ignored\n"; next;
    }
    if (!defined $revision{$file}) {
      warn "$corpusdir/$file: no revision found, ignored\n"; next;
    }
    if ($revision{$file} eq 'unknown') {
      warn "$corpusdir/$file: not tagged with a revision, ignored\n"; next;
    }

    my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);

    $logs_by_daterev{$daterev} ||= [ ];
    push (@{$logs_by_daterev{$daterev}}, $file);

    if ($file =~ /-net-/) {
      $is_net_daterev{$daterev} = 1;
      print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 1)\n";
    }
    else {
      print "$corpusdir/$file: rev=$daterev time=$time{$file} (set 0)\n";
    }

    get_rulemetadata_for_revision($daterev, $revision{$file});
  }
}

sub sort_all {
  my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
  $a1 =~ s/^[\+\-]//;
  $b1 =~ s/^[\+\-]//;

  my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
  if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
  elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
  elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
  if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
  elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
  elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
  return $n;
}

sub time_filter {
  my ($after, $before) = @_;
  if (/time=(\d+)/) {
	return (($time_start - $1 >= WEEK * $after) &&
		($time_start - $1 < WEEK * $before));
  }
  return 0;
}

sub current {
  my $classes = $opt{output_classes};
  $classes ||= "DETAILS.new DETAILS.all DETAILS.age HTML.new HTML.all HTML.age NET.new NET.all NET.age";

  foreach my $entry (split(' ', $classes)) {
    $entry =~ /^(\S+)\.(\S+)$/;
    my $class = $1;
    my $age = $2;
    if (!$age) { warn "no age in $entry"; next; }

    foreach my $daterev (sort keys %logs_by_daterev) {
      my $rev;
      if ($daterev !~ /\/r(\d+)/) {
        warn "bad daterev: $daterev"; next;
      }
      $rev = $1;

      print STDERR "generating: $class.$age for $daterev\n";

      if ($class eq "NET") {
        next unless $is_net_daterev{$daterev};
      }

      gen_class ($daterev, $rev, $class, $age);
    }
  }
}

sub gen_class {
  my ($daterev, $rev, $class, $age) = @_;

  print STDERR "\ngenerating $daterev $class.$age:\n";

  return if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);

  my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
  my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};

  print STDERR "input h: " . join(' ', @ham) . "\n";
  print STDERR "input s: " . join(' ', @spam) . "\n";

  chdir $corpusdir;

  # net vs. local
  if ($class eq "NET") {
    @ham = grep { /-net-/ } @ham;
    @spam = grep { /-net-/ } @spam;
  }

  # age
  if ($age =~ /(\d+)day/) {
    my $mtime = $1;
    @ham = grep { -M $_ < $mtime } @ham;
    @spam = grep { -M $_ < $mtime } @spam;
  }


  print STDERR "selected h: " . join(' ', @ham) . "\n";
  print STDERR "selected s: " . join(' ', @spam) . "\n";
  
  # we cannot continue if we have no files that match the criteria...
  # demand at least 1 ham and 1 spam file
  if (scalar @spam <= 0 || scalar @ham <= 0) {
    warn "not enough files found matching criteria ($daterev $class $age)\n";
    return;
  }

  my $dir = create_outputdir($daterev);

  my $fname = "$dir/$class.$age";

  # now, if the target file already exists, check to see if it's newer
  # than all the sources, make-style; if not, don't re-create it
  if (-f $fname) {
    my $targetfreshness = (-M $fname);
    my $needsrebuild = 0;

    foreach my $srcfile (@spam, @ham) {
      my $srcfreshness = (-M $srcfile);
      if ($targetfreshness > $srcfreshness) {     # src is fresher
        print "$fname is older than $srcfile: $targetfreshness > $srcfreshness\n";
        $needsrebuild = 1;
        last;
      }
    }

    if (!$needsrebuild) {
      print "existing: $fname, fresher than sources\n";
      return;
    }
  }

  my $when = scalar localtime time;
  print qq{creating: $fname
  started $when...
};
  my $bytes = 0;

  if ($class eq 'LOGS') {
    foreach my $f (@ham, @spam) {
      $f =~ s/[^-\._A-Za-z0-9]+/_/gs;    # sanitize!
      my $zf = "$fname-$f.gz";

      system("gzip -c < $f > $zf.$$");
      if ($? >> 8 != 0) {
        warn "gzip -c < $f > $zf.$$ failed";
      }

      rename("$zf.$$", $zf) or
                    warn "cannot rename $zf.$$ to $zf";
      $bytes += (-s $zf);
    }
  }
  else {
    my $tmpfname = "$fname.$$";

    open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
    print OUT "# ham results used for $daterev $class $age: " . join(" ", @ham) . "\n";
    print OUT "# spam results used for $daterev $class $age: " . join(" ", @spam) . "\n";
    print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";

    push (@tmps, $tmpfname);

    my $flags = "";
    $flags = "-t net -s 1" if $class eq "NET";
    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
    $flags = "-o" if $class eq "OVERLAP";
    $flags = "-S" if $class eq "SCOREMAP";
    if ($opt{rules_dir}) {
      $flags .= " -c '$opt{rules_dir}'";
    }

    if ($age eq "all") {
      my %spam;
      my %ham;
      my @output;
      
      for my $file (@spam) {
        $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
      }
      for my $file (@ham) {
        $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
      }
      unlink "$opt{tmp}/ham.log.$$";
      unlink "$opt{tmp}/spam.log.$$";

      if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
        warn "no files found for $class.$age";
        return;
      }

      chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
      for my $user (sort keys %spam) {
        next unless $ham{$user};
        system("cat $corpusdir/$ham{$user} >> $opt{tmp}/ham.log.$$");
        system("cat $corpusdir/$spam{$user} >> $opt{tmp}/spam.log.$$");
        open(IN, "./hit-frequencies -TxpaP $flags $corpusdir/$spam{$user} $corpusdir/$ham{$user} |");
        while(<IN>) {
          chomp;
          push @output, "$_:$user\n";
        }
        close(IN);
      }
      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
      while(<IN>) {
        push @output, $_;
      }
      close(IN);
      for (sort sort_all @output) { print OUT; }
    }
    elsif ($age eq "age") {
      my @output;

      for my $which (("0-1", "1-2", "2-3", "3-6")) {
        my ($after, $before) = split(/-/, $which);
        # get and filter logs
        chdir $corpusdir;
        for my $type (("ham", "spam")) {
          open(TMP, "> $opt{tmp}/$type.log.$$");
          my @array = ($type eq "ham") ? @ham : @spam;
          for my $file (@array) {
            open(IN, $file) or warn "cannot read $file";
            while (<IN>) {
              print TMP $_ if time_filter($after, $before);
            }
            close(IN);
          }
          close (TMP);
        }
        # print out by age
        chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
        open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
        while(<IN>) {
          chomp;
          push @output, "$_:$which\n";
        }
        close(IN);
      }
      for (sort sort_all @output) { print OUT; }
    }
    elsif (@ham && @spam) {
      # get logs
      system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
      system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");

      chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";
      open(IN, "./hit-frequencies -TxpaP $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
      while(<IN>) { print(OUT); }
      close(IN);
    }

    $bytes = (-s OUT);
    close(OUT);
    rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
  }

  $when = scalar localtime time;
  print qq{created: $bytes bytes, finished at $when
URL:

  $opt{ruleqa_url}$output_revpath

};

}

sub mk_daterev {
  my ($timet, $rev, $tag) = @_;
  return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
}

sub create_outputdir {
  my ($revpath) = @_;
  my $dir = $opt{html} .'/'. $revpath;

  # print "output dir: $dir\n";
  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }

  $output_revpath = $revpath;       # set the global
  $output_revpath =~ s/\//-/;       # looks nicer

  return $dir;
}

sub log_metadata_xml {
  my ($daterev, @files) = @_;
  my $str = '';

  # this is extracted into the info.xml file later by the gen_info_xml script
  foreach my $f (@files) {
    $str .= qq{
      <mclogmd file='$f'>
        <daterev>$daterev</daterev>
        <rev>$revision{$f}</rev>
        <fsize>$filesize{$f}</fsize>
        <mcstartdate>$dateline{$f}</mcstartdate>
        <mtime>$mtime{$f}</mtime>
      </mclogmd>
    };
  }

  $str =~ s/\s+/ /gs;  # on a single line please
  return '<mclogmds>'.$str.'</mclogmds>';
}

sub create_rulemetadata_dir {
  my $rev = shift;
  my $dir = "$opt{html}/rulemetadata/$rev";
  if (!-d $dir) {
    my $prevu = umask 0;
    mkpath([$dir], 0, oct($opt{html_mode})) or warn "failed to mkdir $dir";
    umask $prevu;
  }
  return $dir;
}

sub get_rulemetadata_for_revision {
  my ($daterev, $rev) = @_;

  my $dir = create_rulemetadata_dir($rev);

  # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
  # "tree" path, so we have to ensure we're in the 'masses' dir
  # for this to work!
  chdir "$opt{tree}/masses" or die "cannot chdir $opt{tree}/masses";

  my $cmd = "$opt{tree}/masses/rule-qa/get-rulemetadata-for-revision ".
                    "--rev=$rev --outputdir='$dir'";

  system($cmd);
  if ($? >> 8 != 0) {
    warn "'$cmd' failed";
  }
}

