#!/bin/sh -u # $0 ...list of mboxes ... # -Ian! D. Allen - idallen@idallen.ca - www.idallen.com # SAFECAT=' safecat tmp new ' # rm -f $OUT/* tmp/* new/* # rm idcache OUT=out if [ -t 2 -a -s idcache ] ; then while : ; do echo 1>&2 -n "$0: idcache exists - keep ? [Yn] " read ans &2 "$0: Removing idcache and $OUT/* ..." if [ -e $OUT ] ; then ( cd $OUT || exit $? e=$( find . -maxdepth 0 -empty ) if [ ${#e} = 0 ] ; then mkdir -p .old$$ || exit $? mv * .old$$/ rm -rf .old$$ & fi ) || exit $? fi rm -rf idcache mutt.cache break ;; esac done fi mkdir -p $OUT || exit $? ### SEP=$( echo "\237\227\217\377 " ) # something very unlikely SEP='X-IDALLEN-Envl-Frm' # something very unlikely MAXPERDIR=10000 # this many files, max, per output directory MAXMSGSPERFILE=10000 # this many msgs, max, per output file FORMAIL=" formail -D 2000000 idcache -c -z -p '' -R 'From ' '$SEP: ' -s " ## MAXMSGSPERFILE=1000 # this many msgs, max, per output file ## echo "DEBUG using smaller MAXMSGSPERFILE of $MAXMSGSPERFILE" # pass the file name and date through to Perl using a fake first msg for f do datesec=$( stat -c '%Y' "$f" ) || exit !? basename=$( basename "$f" ) echo "$SEP: $SEP: $basename $datesec" eval $FORMAIL <$f done | /usr/bin/time perl -we " use strict; use warnings; use integer; use Date::Parse; use IO::Handle; use POSIX qw(strftime); use Fcntl qw( :seek ); my \$SEP = '$SEP'; my \$MAXPERDIR = $MAXPERDIR; my \$MAXMSGPERFILE = $MAXMSGSPERFILE; my \$FORMAIL = \"$FORMAIL\"; my \$OUT =\"$OUT\"; "' my $sep = lc $SEP; my $SEPHDR = "$SEP: "; my $SEPLEN = length($SEPHDR); my $file; my $auxinput; my $filename; my $prevfilename; my $prevdatesec; my %headers = (); my $in_headers = 0; my $headerlinecount = 0; my $bodylinecount = 0; my $prevbodylinecount = 0; my $msgcount = 0; my $filemsgcount = 0; my $skipmsg = 0; my $fromsec; # from From_ line my $fromstr; # from From_ line my $datesec; # from Date: line my $recvsec; # from Received: lines my $fixedformattell; my $prevfixedformattell; my $starthdrtell; my $prevstarthdrtell; my $startbodytell; my $prevstartbodytell; my $contentlength; my $internalfromtell; my $internalbodylinecount; my $lines; # from Lines: header my $prevlines; my $FILENAME; my $NOW = time; my $FIXEDFORMAT = "Lines: %-9d\nContent-Length: %-9d\n"; # 2,000,000 messages into 200 mailboxes means 10,000 msgs per box ## my $MAXFILESIZE = 1024 * 1024; # not used # dates for "idallen" domains my $D1980 = str2time("1 jan 1980"); # @cgl.uwaterloo.ca to Feb 19 1993 my $D1993 = str2time("19 feb 1993"); # @ncf.ca (end of CGL) my $D2002 = str2time("2002/05/29"); # @idallen.ca my $DATESEC; my $DATESTR; my @tmplist = ( $sep, qw( from to apparently-to resent-to cc bcc date ) ); my %want; foreach my $k ( @tmplist ) { $want{$k} = undef; # only the key has to exist } @tmplist = ( qw( Action Mailer Organization Personal_Name Precedence Status Subject Thread-Topic To X-ASG-Orig-Subj X-FreePort-Flags X-HELO-From X-Keywords X-MS-Has-Attach X-MS-TNEF-Correlator X-Mailer X-Originating-IP X-PMFLAGS X-Spam-Level X-UIDL ) ); my %reject; foreach my $k ( @tmplist ) { $reject{$k} = undef; # only the key has to exist } die "$0: found \"cur\" subirectory - QUIT\n" if -d "cur"; my $save = undef; my $aux; for(;;){ # we may need to read some aux input before continuing with STDIN if ( $auxinput ) { if ( eof($aux) || ! defined($_=<$aux>) ) { close($aux); wait; $aux = $auxinput = undef; $_ = $save; $save = undef; goto CONTINUE; } } else { $_ = <>; last if ! defined($_); } if ( substr($_,0,$SEPLEN) eq $SEPHDR ){ $skipmsg = 0; # if we were skipping a message, stop if ( defined($file) ){ # Flush prints $_ and returns false if not starting new msg &Flush($_) or next; # if we found a hidden message, do it before the next msg if ( $auxinput ) { die if $save; # can not do more than one aux input # save our state; when done hidden msg, return to CONTINUE $save = $_; my $pid = open($aux, "-|"); die "$0: cannot fork: $!\n " unless defined $pid; if ( $pid ) { $_ = <$aux>; # read first header line of aux input if ( ! defined($_) ) { print " Empty first line of input from formail\n"; $auxinput =~ s/\n\n.*/\n/s; $auxinput =~ s/\n/\n AUX /g; $auxinput =~ s/ AUX $//; print " AUX $auxinput"; next; # will see eof and restore $save } } else { # child sends aux input into $FORMAIL # $SIG{PIPE} = "IGNORE"; open(FM, "|$FORMAIL") or die "$0: Cannot open $FORMAIL: $!\n "; print FM $auxinput; close FM or die "$0: status $! $FORMAIL\n "; exit; # forked() child also flushes buffers! } } } CONTINUE: # resume here after restoring $_ from $save # pick up a fake first msg containing input filename and date if ( substr($_,$SEPLEN,$SEPLEN) eq $SEPHDR ){ ($FILENAME,$DATESEC) = /^$SEP: $SEP: (\S+) (\d+)$/; die "$0: Cannot parse special header: $_" unless $DATESEC; $DATESTR = &MailDate($DATESEC); print "=== Input $FILENAME dated $DATESTR\n"; $skipmsg = 1; $prevdatesec = $DATESEC; # for msgs with bad Date: lines next; } &StartNewMsg($_); # emit From_ line and start msg header next; } next if $skipmsg; if ( $in_headers ) { if ( $_ eq "\n" ) { $in_headers = 0; &CheckHeaders(); } else { ++$headerlinecount; # replace tabs and remove trailing whitespace in the headers s/ *\t+ */ /g; s/[ \t]+$//; # do not use \s because it removes trailing \n my($header,$char) = /^(\S+):\s*(\S)?/; if ( ! defined($char) ) { if ( ! defined($header) ) { die "$0: Cannot parse header: $_ "; } next if exists($reject{$header}); print " Empty header: $_"; } else { $header = lc $header; if ( $header eq "received" ) { # find the earliest "received" date # UCletter/digit after semicolon up to EOL or paren # insist on a time zone, to skip uncertain times my ($rcvdate) = /;\s*([0-9A-Z][^;()\n]{5,50})/; my ($fussy) = /;\s*([0-9A-Z][^;()\n]{5,50}\s+(?:[-+]\d{3,4}|UT|[-+\w]{3,9}))(?:\s*$|\s)/; # my ($fussy,$f2) = /;\s*([0-9A-Z][^;()\n]{5,50}\s+(([-+]\d{3,4}|[-+\w]{3,9})))\s*(?:$|\s)/; # if ( defined($f2) ){ # print "DEBUG \"$fussy\" and \"$f2\"\n"; # } my $sec; # some machines are just wrong # go back one week and forward one day, max # 604800 sec = 7 days if ( $rcvdate && length($rcvdate) >= 6 && ($sec=str2time($rcvdate)) && $sec > ($fromsec-604800) && $sec < ($fromsec+86400) ){ if ( ! defined($fussy) ) { my $fix = $rcvdate; $fix =~ s/\sGMT([-+]\d+):/$1/; $fix =~ s/(\d)\s([+-]\d)/$1 ${2}000/; # XXX ($fussy) = /;\s*([0-9A-Z][^;()\n]{5,50}\s+(?:[-+]\d{3,4}|UT|[-+\w]{3,9}))(?:\s*$|\s)/; print " No time zone: \"$rcvdate\"\n"; } elsif ( ! defined($recvsec) ) { $recvsec = $sec; } elsif ( $sec < $recvsec ) { if ( ($recvsec-$sec) > (2*86400) ){ my $d1 = &MailDate($sec); my $d2 = &MailDate($recvsec); print " Received: $d1 <++ $d2\n"; } $recvsec = $sec; } } } elsif ( $header eq "date" ) { my ($userdate) = /^\S+:\s*(.+)$/; my $str; $datesec = &FixDate($userdate); # find the earliest date to check against my $minsec = $fromsec; if ( $recvsec && $recvsec < $minsec ) { $minsec = $recvsec; } # some people do not set their computer clocks well # allow 1/2 hour into future and less than a day past # 86400 sec == 1 day 1800 sec = 1/2 hr if ( $datesec > ($minsec+1800) || $datesec < ($minsec-80000) ){ $str = &MailDate($minsec); my $d2 = &MailDate($datesec); print " Header $str used instead of Date: $d2\n"; $datesec = $minsec; } else { $str = &MailDate($datesec); } if ( &SameDate($str,$userdate) ) { $_ = "Date: $str\n"; } else { print $file "Date: $str\n"; $_ = "X-$_"; } } elsif ( $header eq "from" ) { my ($from) = /^\S+:[ \t]*(.+)$/; my $new = &FixFrom($from); # only put out X-From if $new is really different if ( substr($new,0,length($from)) ne $from ) { print $file "From: $new\n"; $_ = "X-$_"; } } elsif ( $header eq "content-length" ) { ($contentlength) = /^\S+:[ \t]*(\d+)/; if ( ! defined($contentlength) || $contentlength < 0 ) { print " Invalid $_"; } next; } elsif ( $header eq "lines" ) { ($lines) = /^\S+:[ \t]*(\d+)/; if ( ! defined($lines) || $lines < 0 ) { print " Invalid $_"; } next; } elsif ( $header eq "status" || $header eq "x-uid" ) { next; } if ( exists($want{$header}) ) { ($headers{$header}) = /^\S+:[ \t]*(.+)$/; } } } } else { if ( $contentlength > 100 && substr($_,0,5) eq "From " ) { $internalfromtell = tell($file); $internalbodylinecount = $bodylinecount; } ++$bodylinecount; } print $file $_; } if ( defined($file) ){ &Flush(undef); } print "Found $msgcount unique messages\n"; exit; # turn seconds into an RFC822/2822/5322 date format sub MailDate { return strftime("%a, %d %b %Y %H:%M:%S %z",localtime($_[0])); } sub SameDate { my ($d1,$d2) = @_; $d1 =~ s/^\s+|\s+$|,//g; $d1 =~ s/ 0/ /g; $d1 =~ s/ +/ /g; $d2 =~ s/^\s+|\s+$|,//g; $d2 =~ s/ 0/ /g; $d2 =~ s/ +/ /g; return $d1 eq $d2; } # called at the end of the headers to tidy things up before the body sub CheckHeaders { # see if we can guess a missing To: line based on date if ( ! $headers{"to"} && ! $headers{"cc"} ) { my $to; if ( ! ($to=$headers{"apparently-to"}) ) { if ( ! ($to=$headers{"resent-to"}) ) { if ( ! ($to=$headers{"bcc"}) ) { if ( $FILENAME !~ /outgoing/i ) { if ( $fromsec <= $D1993 ) { $to = "idallen\@cgl.uwaterloo.ca"; } elsif ( $fromsec <= $D2002 ) { $to = "idallen\@ncf.ca"; } else { $to = "idallen\@idallen.ca"; } } else { # Cannot find a valid "To:" in the outgoing header; # assume this message is part of previous message # flush headers, rewind, append them to prev message # (may require re-opening previous output file) CopyAppend(); return; } } } } print $file "To: $to\n"; print " To: $to\n"; } if ( ! $headers{"date"} ) { # use the From_ or Received: line date my $str; if ( $recvsec && $recvsec < $fromsec ) { $datesec = $recvsec; $str = &MailDate($datesec); } else { $datesec = $fromsec; $str = $fromstr; } print $file "Date: $str\n"; # print " Date: $str\n"; } if ( ! $headers{"from"} ) { # use the envelope from line my ($from) = $headers{$sep} =~ /^(\S+)/; die "$0: cannot parse envelope From_: $headers{$sep}\n " unless $from; my $new = &FixFrom($from); print $file "From: $new\n"; # print " From: $new\n"; # if ( $new ne $from ) { # print $file "X-From: $from\n"; # } } # remember this place to rewrite later when we know the true size $fixedformattell = tell($file); $contentlength = -1 unless $contentlength; $lines = -1 unless $lines; printf $file $FIXEDFORMAT, $lines, $contentlength; $startbodytell = tell($file) + 2; # body starts here } sub Flush { my $from = $_[0]; if ( $headerlinecount == 0 ) { print " Redundant Flush skipped for msg $msgcount\n"; return 1; # successful flush was already done } --$bodylinecount if $bodylinecount > 0; # do not count last newline # calculate the body length by diff between tell() positions # if the calculated length or lines differs from stated, fix them my $endbodytell = tell($file); my $bodylength = $endbodytell - $startbodytell; my $conlendiff = ($contentlength > 0) ? $bodylength - $contentlength : -1; # actual vs stated my $linediff = ($lines > 0) ? $bodylinecount - $lines : -1; # actual vs stated if ( $conlendiff > 1 || $conlendiff < 0 || $linediff > 1 || $linediff < 0 ) { if ( $conlendiff > 10 && $internalfromtell ) { # if the actual vs stated diff is not small, something is wrong # mis-calculated content length? we have a hidden message! my $internalbodylength = $internalfromtell - $startbodytell; my $diff = $contentlength - $internalbodylength; print " Internal From_ found with Content-Length diff $diff\n"; # read the hidden message into $auxinput if ( ! $file->flush ) { die "$0: Unable to flush \"$filename\": $!\n "; } seek($file,$internalfromtell,SEEK_SET) or die "$0: Cannot seek to $internalfromtell: $!\n "; read($file,$auxinput,$bodylength-$internalbodylength+10) or die "$0: cannot read $bodylength-$internalbodylength+10 from $filename: $!\n "; truncate $file,$internalfromtell; # truncate to end of this msg # fix the lines and length for the current msg $bodylinecount = $internalbodylinecount; $bodylength = $internalbodylength; $linediff = $bodylinecount - $lines; # actual vs stated $conlendiff = $bodylength - $contentlength; # actual vs stated ## my $tmp = $auxinput; ## $tmp =~ s/\n/\n DEBUG /g; ## $tmp =~ s/ DEBUG $//; ## print " DEBUGAUX $tmp"; } elsif ( $conlendiff < -200 ) { # this msg continues on to next msg; can happen if the # previous msg was "eaten" by formail by a too-large # Content-Length, and was found and resurrected via $auxinput, # and the resurrected msg has a Content-Length that should # have eaten this message; do not start a new msg $from =~ s/^$SEPHDR/From /; # restore the From_ line print " Continues: Content-Length vs FTell $contentlength $bodylength $conlendiff\n"; print " Continues: $from"; print $file $from; return 0; # not a real flush } print " Lines vs Count $lines $bodylinecount $linediff\n" if $lines >= 0 && $linediff; print " Content-Length vs FTell $contentlength $bodylength $conlendiff\n" if $contentlength >= 0 && $conlendiff; seek($file,$fixedformattell,SEEK_SET) or die "$0: Cannot seek to $fixedformattell: $!\n "; printf $file $FIXEDFORMAT, $bodylinecount, $bodylength; seek($file,0,SEEK_END) or die "$0: Cannot seek back to 0: $!\n "; } print "$msgcount $datesec hdr $headerlinecount body $bodylinecount len $bodylength\n"; $prevdatesec = $datesec; $prevstartbodytell = $startbodytell; $prevstarthdrtell = $starthdrtell; $prevfixedformattell = $fixedformattell; $prevbodylinecount = $bodylinecount; $prevlines = $lines; %headers = (); $headerlinecount = 0; $internalfromtell = undef; $internalbodylinecount = undef; # if we exceed our output file size, close off this file # if ( $endbodytell >= $MAXFILESIZE ) { if ( $filemsgcount >= $MAXMSGPERFILE ) { if ( ! $file->flush ) { die "$0: Unable to flush \"$filename\": $!\n "; } utime($datesec,$datesec,$file); if ( ! close($file) ) { die "$0: Unable to close \"$filename\": $!\n "; } print "=== Output Close $filename tell $endbodytell\n"; $prevfilename = $filename; $file = undef; $filename = undef; $filemsgcount = undef; } return 1; # successful flush } sub FixFrom { local $_ = $_[0]; return $_ if /[^ ]\@[^ ]+\.[^ ][^ ]/; return "$_\@ncf.ca" if /^[a-z][a-z][0-9][0-9][0-9]$/; if ( /^\w+$/ ) { # bare word: add a domain to a bare userid e.g. "idallen" my $to = $headers{"to"} || $headers{"apparently-to"} || $headers{"resent-to"}; if ( defined($to) && $to =~ /waterloo/ ) { return "$_\@cgl.uwaterloo.ca"; } elsif ( defined($to) && $to =~ /freenet|ncf\.ca/ ) { return "$_\@ncf.ca"; } elsif ( $fromsec <= $D1993 ) { return "$_\@cgl.uwaterloo.ca"; } elsif ( $fromsec <= $D2002 ) { return "$_\@ncf.ca"; } else { return "$_\@idallen.ca"; } } # (?=pattern) - man perlre - zero width positive look-ahead assertion if ( /\bidallen\@(?:janlinux|oak|elm)(?:$|[^\w.])/ ) { s/\bidallen\@(?:janlinux|oak|elm)(?=$|[^\w.])/idallen\@ncf.ca/ if /NCFreeNet/; s/\bidallen\@(?:janlinux|oak|elm)(?=$|[^\w.])/idallen\@idallen.ca/; } return $_; } # take a string and return reasonable seconds sub FixDate { local $_ = my $str = $_[0]; my $parsed; while ( ! defined($parsed=str2time($_)) ) { next if s/Atlantic Daylight Time/-0300/; next if s/Atlantic Standard Time/-0400/; next if s/Eastern Daylight Time/-0400/; next if s/Eastern Standard Time/-0500/; next if s/Central Daylight Time/-0500/; next if s/Central Standard Time/-0600/; next if s/Mountain Daylight Time/-0600/; next if s/Mountain Standard Time/-0700/; next if s/Pacific Daylight Time/-0700/; next if s/Pacific Standard Time/-0800/; next if s/E\. Europe Daylight Time/+0300/; next if s/E\. Europe Standard Time/+0200/; next if s/GMT Standard Time/+0000/; next if s/ PM P/ P/; next if s/\bIST\b/+0100/; next if s/\bNOVT\b//; next if s/\bH0600\b/-0300/; next if s/:(\d\d)-(\d\d\d\d)$/:$1 -$2/; # chop alphanumeric junk off the right side and try again next if s/ +[\w]+ *$// && length >= 6; # give up and use the date of the Received: or previous msg $parsed = $recvsec ? $recvsec : $prevdatesec; last; } continue { print " Parsing \"$str\" failed - trying \"$_\"\n"; } # if the date is unreasonable, use the date of the previous msg if ( $parsed < $D1980 ) { my $sec = $recvsec ? $recvsec : ( $prevdatesec ? $prevdatesec : $DATESEC ); my $pd = &MailDate($sec); print " Parsed date $str < Jan 1980 - fixed to $pd\n"; $parsed = $sec; } elsif ( $parsed > ($DATESEC+86400) ) { # allow one day (60*60*24=86400) of slop my $sec = $recvsec ? $recvsec : ( $prevdatesec ? $prevdatesec : $DATESEC ); my $pd = &MailDate($sec); print " Parsed date $str > file $DATESTR - fixed to $pd\n"; $parsed = $sec; } return $parsed; } # Copy/Append the headers from the open $file to the previous message # - flush output into current message # - rewind to start of current message # - if current message starts at top of file: # - open old file for appending # - copy most headers onto end of old file # - close new file and unlink it # - put old file descriptor in to new file descriptor $file # - copy old file name to new file $filename # - continue on writing $file as if nothing had happened # - else sub CopyAppend { local $_; # this is necessary for the while(<>) handle my $linecount = 0; if ( ! $file->flush ) { die "$0: Unable to flush \"$filename\": $!\n "; } seek($file,$starthdrtell,SEEK_SET) or die "$0: Cannot seek to $starthdrtell in $filename: $!\n "; if ( $starthdrtell == 0 ) { # we just started a new file and have to add this msg to # the end of the previous file and previous msg, fixing # the Lines: and Content-Length: fields there print " Rewinding and moving content to $prevfilename\n"; my $oldfd; open($oldfd,">>",$prevfilename) or die "$0: Cannot append to \"$prevfilename\": $!\n "; while ( <$file> ) { s/^$SEPHDR/From /; # restore the From_ line next if /^X-IDALLEN/; # remove my added headers print $oldfd $_; ++$linecount; } # get rid of the file we just read (now empty) close($file) or die "$0: $!\n "; unlink($filename); # now pretend we were writing the old file all along $filename = $prevfilename; $file = $oldfd; $filemsgcount = $MAXMSGPERFILE; # but start a new file right away } else { # have to add this msg to the end of the previous msg, fixing # the Lines: and Content-Length: fields there print " Seeking and moving content to previous msg\n"; my $tmp = ""; open(MEMORY, ">", \$tmp) or die "$0: Cannot open in-memory fd: $!\n "; while ( <$file> ) { s/^$SEPHDR/From /; # restore the From_ line next if /^X-IDALLEN/; # remove my added headers print MEMORY $_; ++$linecount; } close(MEMORY) or die "$0: $!\n "; seek($file,$starthdrtell,SEEK_SET) or die "$0: Cannot seek in $filename: $!\n "; print $file $tmp; truncate $file,tell($file); # truncate to this length $tmp =~ s/\n/\n DEBUG /g; $tmp =~ s/ DEBUG $//; print " DEBUGCPY $tmp"; } $fromsec = $datesec = $recvsec = $prevdatesec; $fixedformattell = $prevfixedformattell; $startbodytell = $prevstartbodytell; $starthdrtell = $prevstarthdrtell; $lines = $prevlines; $bodylinecount = $prevbodylinecount + $linecount; $contentlength = -1; # ignore any previous Content-Length header --$msgcount; # we did not really start a new message --$filemsgcount; # we did not really start a new message } sub StartNewMsg { local $_ = $_[0]; # my $filename = sprintf( "%010d.%08d.%s", $NOW, ++$msgcount, $FILENAME ); ++$msgcount; if ( ! defined($file) ) { my $upper = int($msgcount / ( $MAXPERDIR * $MAXMSGPERFILE ) ); my $lower = $msgcount % ( $MAXPERDIR * $MAXMSGPERFILE); my $dirname = sprintf("%s/%04d", $OUT, $upper); $filename = sprintf("%s/%06d", $dirname, $lower); # truncate the file then open in read/write mode while ( ! open($file,"+>",$filename) ) { # if the directory is missing, create it next if mkdir("$dirname"); die "$0: Unable to open new \"$filename\": $!\n "; } print "=== Output Open $filename\n"; $filemsgcount = 0; } ++$filemsgcount; # "bounce-MusicNewsletter 5-idallen=freenet.carleton.ca"@announce2.mp3.com Tue Sep 28 14:11:55 1999 # idallen Sun Feb 18 00:10:49 1990 remote from watcgl # "\"janes56314@texas.net\" Bcc": Sat Oct 2 16:30:16 1999 my ($date) = /^\S+\s+(?:"(?:\\"|[^"])+"\S+|[^\s"]+)\s+(.*?)(?: remote from .*)?$/; $fromsec = &FixDate($date); $fromstr = &MailDate($fromsec); $starthdrtell = tell($file); $in_headers = 1; $headerlinecount = 1; $bodylinecount = 0; $datesec = undef; $recvsec = undef; $fixedformattell = undef; $startbodytell = undef; $contentlength = undef; $lines = undef; ($headers{$sep}) = /^\S+:[ \t]*(.+)$/; substr($_,0,$SEPLEN) = "From "; print $file "$_"; print $file "X-IDALLEN-Mbox-File: $FILENAME\n"; print $file "X-IDALLEN-Mbox-Date: $DATESEC $DATESTR\n"; print $file "X-IDALLEN-Envl-File: $filename $msgcount $filemsgcount\n"; print $file "X-IDALLEN-Envl-Date: $fromsec $fromstr\n"; } '