#! /usr/bin/perl

#   	    	    	    Bulk Validator
#   	    	 by John Walker  --  http://www.fourmilab.ch/

#   	This program submits HTML files from a directory or
#   	directory tree to the W3C HTML validator and reports
#   	the results.  For complete documentation, process this
#   	program with "perldoc" or invoke it with the --man
#   	option.

    use strict;
    use warnings;
    
    use LWP;
    use URI::Escape;
    use Pod::Usage;    
    use Data::Dumper;

    #	Our version number, release date, and identification
    #	    DID YOU REMEMBER TO UPDATE THESE IN THE DOCUMENTATION
    #	    AT THE BOTTOM?
    my $version = '1.1';
    my $reldate = '2007-09-11';

    #   Name and home page of this program
    my $progname = 'BulkValidator';
    my $homepage = 'http://www.fourmilab.ch/webtools/BulkValidator';

    #   Percentage (greater than 0, up to 100) of the files which
    #   are randomly selected for validation.
    my $Density = 100;

    #   Count of files at the head of the list which are validated
    #   whatever the sampling density, since the first files tend to
    #   be atypical and to contain rarely encountered pratfalls.
    my $Firstfiles = 0;

    #   Nonzero: shuffle the file list into random order before
    #   validation starts.
    my $shuffle = 0;

    #   Seconds to sleep after each validation.  Zero means no delay
    #   between requests.
    my $pause = 15;

    #   When $pause is nonzero, a random interval between zero and
    #   ($pause * $rpause) is added to each delay.
    my $rpause = 1;

    #   Nonzero: print progress information on standard error.
    my $verbose = 0;

    #   Name of a file holding the output of previous run(s); files it
    #   records as validated are skipped.  Empty string: no skip file.
    my $skipfile = '';

    #   Directory containing the files to be validated
    my $directory = '.';

    #   Nonzero: validate files in subdirectories recursively.
    my $tree = 0;

    #   Directory in which discrepancy reports are filed
    my $discrepancy = 'ValidationDiscrepancies';

    #   URL of the w3.org HTML validator
    my $validator = 'http://validator.w3.org/check';
    
    #	Process command line options and arguments
    
    use Getopt::Long;

    #   Parse the command line.  The informational options (--copyright,
    #   --help, --man, --version) print their text and exit immediately;
    #   the remaining options overwrite the default settings.  GetOptions
    #   reports unknown or malformed options on standard error and
    #   returns false, in which case we stop rather than run with
    #   settings the user did not intend.
    GetOptions(
        'copyright'     => sub { print("This program is in the public domain.\n"); exit(0); },
        'density=i'     => \$Density,
        'discrepancy=s' => \$discrepancy,
        'firstfiles=i'  => \$Firstfiles,
        'help'          => sub { print_command_line_help(); exit(0); },
        'man'           => sub { pod2usage({-exitval => 0, -verbose => 2, -output => \*STDOUT}) },
        'pause=i'       => \$pause,
        'rpause=f'      => \$rpause,
        'shuffle'       => \$shuffle,
        'skipfile=s'    => \$skipfile,
        'tree'          => \$tree,
        'validator=s'   => \$validator,
        'verbose'       => \$verbose,
        'version'       => sub { print("Version $version, $reldate\n"); exit(0); },
    ) || die("Invalid command line options; use --help for a summary.\n");

    die("--density must be greater than zero") if ($Density <= 0);

    #   Negative counts, delays and factors are meaningless; use
    #   their magnitudes.
    $Firstfiles = abs($Firstfiles);
    $pause = abs($pause);
    $rpause = abs($rpause);

    #   An optional argument names the directory to validate.  Trailing
    #   slashes are dropped because file paths are later built as
    #   "$directory/name".
    if (@ARGV) {
        my $arg = shift(@ARGV);
        ($directory = $arg) =~ s:/+$::;
        #   An argument made only of slashes names the root directory;
        #   don't let the stripping reduce it to an empty string.
        $directory = '/' if ($directory eq '') && ($arg ne '');
    }
    
    #	If a --skipfile has been specified, read it and parse lines
    #	from previous run(s) which indicate successful validation of
    #	files.  These files will be added to the %already hash, and
    #	thereby excluded from validation in this run.
    
    #   %already maps the path of each file recorded as validated in
    #   the --skipfile to 1.
    my %already;
    if ($skipfile ne '') {
        #   Three-argument open with a lexical handle, so that a name
        #   containing mode characters or a pipe symbol is treated
        #   strictly as a file name.
        open(my $sk, '<', $skipfile) ||
            die("Unable to open --skipfile $skipfile: $!");
        while (my $line = <$sk>) {
            $line =~ s/\s+$//;
            #   Success lines have the form "<file> validated. HH:MM";
            #   failure lines ("** <file> failed validation. HH:MM")
            #   do not match and are ignored.
            if ($line =~ m/(^.*) validated\. \d\d:\d\d/) {
                my $passed = $1;
                $already{$passed} = 1;
                print(STDERR "Excluding previously validated file $passed\n") if $verbose;
            }
        }
        close($sk);
    }
    
    #	Prepare the list of files to be validated.  If the --tree option is
    #	specified, we open a pipe to the "find" command and have it
    #	enumerate the candidate files.  Otherwise, we simply open the
    #	directory and look for HTML files ourself.
   
    #   @files receives the path of every readable file whose name
    #   ends in .htm or .html.
    my @files;
    if ($tree) {
        print(STDERR "Traversing directory tree $directory\n") if $verbose;
        #   The list form of pipe open runs find directly, without a
        #   shell, so a directory name containing spaces or shell
        #   metacharacters is passed through intact and the name
        #   pattern needs no escaping.
        open(my $find, '-|', 'find', $directory, '-type', 'f', '-name', '*.htm*', '-print') ||
            die("Cannot create pipe to traverse contents of $directory: $!");
        while (my $path = <$find>) {
            chomp($path);
            #   The find pattern "*.htm*" is deliberately loose; keep
            #   only names which really end in .htm or .html.
            if ($path =~ m/\.html?$/ && (-r $path)) {
                push(@files, $path);
            }
        }
        #   The close status is not checked: find exits nonzero when it
        #   merely could not descend into some subdirectory.
        close($find);
    } else {
        print(STDERR "Opening directory $directory\n") if $verbose;
        opendir(my $dh, $directory) || die("Cannot open directory $directory: $!");
        foreach my $name (grep(/\.html?$/, readdir($dh))) {
            my $path = "$directory/$name";
            if ((-f $path) && (-r $path)) {
                push(@files, $path);
            }
        }
        closedir($dh);
    }
    
    #   In verbose mode, report how many candidate files were found.
    if ($verbose) {
        my $found = scalar(@files);
        my $suffix = ($found == 1) ? "" : "s";
        print(STDERR $found, " file", $suffix, " found\n");
    }

    #   Put the list into sorted (default string comparison) order.
    @files = sort(@files);

    #   With --shuffle, rebuild the list by repeatedly drawing one
    #   entry from a random position of the remaining pool.
    if ($shuffle) {
        my @pool = @files;
        @files = ();
        while (scalar(@pool) > 0) {
            push(@files, splice(@pool, rand(@pool), 1));
        }
    }
    
    my $filename;
    my $fnum = 0;           # Number of files considered for validation so far

    #   User agent for all validator requests.  POST is added to the
    #   list of redirectable request methods so that a redirect issued
    #   in reply to the upload is followed.
    my $browser = LWP::UserAgent->new();
    push(@{$browser->requests_redirectable}, 'POST');
    $browser->agent("$progname ($homepage Ver. $version)");

    #   Turn on autoflush for STDOUT unconditionally so that if we're
    #   killed the output reflects everything we've validated so far.
    $| = 1;

    #   Testing with defined() keeps a file name which is false as a
    #   string from ending the loop early.
    while (defined($filename = shift(@files))) {
        if ($already{$filename}) {
            print(STDERR "Skipping previously validated $filename\n") if $verbose;
            next;
        }
        next if ($filename !~ m/\.html?$/);
        $fnum++;

        #   The first $Firstfiles files are always validated; each later
        #   file is validated with a probability of $Density percent.
        next if (($fnum > $Firstfiles) && (rand(100) > $Density));

        print(STDERR "Validating $filename\n") if $verbose;

        #   Upload the file to the validator as a form-data POST.
        my $response = $browser->post(
            $validator,                     # URL of validator
            [
                'uploaded_file' => [ $filename ]
            ],
            'Content_Type' => 'form-data'
        );
        if (!$response->is_success) {
            print(STDERR Dumper($response)) if $verbose;
            die("Error contacting validator: ", $response->status_line);
        }
        my $results = $response->content;

        my (undef, $min, $hour) = localtime();
        my $time = sprintf("%02d:%02d", $hour, $min);

        #   The discrepancy report is named after the path of the file,
        #   with every slash replaced by an underscore.
        (my $vfname = $filename) =~ s:/:_:g;
        my $report = "$discrepancy/$vfname";

        #   The pattern allows any run of white space between the words
        #   of the validator's success heading.
        if ($results =~ m:<h2\s+class="valid">This\s+Page\s+Is\s+Valid\s+X?HTML\s+\d+\.\d+[^<]*</h2>:) {
            print("$filename validated. $time\n");
            #   If there was a discrepancy report for this file from a
            #   previous failed validation attempt, remove it.
            if (-f $report) {
                if (unlink($report)) {
                    print(STDERR "Removed discrepancy report $report\n") if $verbose;
                    #   Try removing the discrepancy directory.  If it is
                    #   not empty, this will silently fail.
                    if (rmdir($discrepancy)) {
                        print(STDERR "Removed discrepancy report directory $discrepancy\n") if $verbose;
                    }
                } else {
                    print(STDERR "Cannot remove discrepancy report $report: $!\n");
                }
            }
        } else {
            print("** $filename failed validation. $time\n");
            if (!(-d $discrepancy)) {
                print(STDERR "Creating discrepancy report directory $discrepancy\n") if $verbose;
                mkdir($discrepancy) || die("Cannot create discrepancy report directory $discrepancy: $!");
            }
            #   Save the validator's reply as the discrepancy report.
            #   Write errors may only surface at close, so check it.
            open(my $fo, '>', $report) || die("Cannot create $report: $!");
            print {$fo} $results;
            close($fo) || die("Cannot write $report: $!");
        }

        #   Be kind to the validator: pause before the next request,
        #   unless this was the last file.
        if (@files && ($pause > 0)) {
            my $delay = $pause + int(rand($pause * $rpause));
            print(STDERR "Sleeping for $delay seconds.\n") if $verbose;
            sleep($delay);
        }
    }
    
    #	Print command line help
    
    sub print_command_line_help {
        #   Write the usage summary to standard output.  The text
        #   interpolates the program name, version, release date and
        #   home page, plus the current values of $discrepancy and
        #   $validator.
        #   NOTE(review): since --help is handled while options are being
        #   parsed, those two presumably show values set by options given
        #   earlier on the command line rather than the built-in defaults.
        print <<"EOD";
Usage: $progname.pl [ options ] [ directory ]
       Options:
             --copyright     Print copyright information
	     --density n     Validate n percent of files (default 100)
	     --discrepancy d File validation discrepancy reports in directory d (default $discrepancy)
	     --firstfiles n  Validate the first n files regardless of --density (default 0)
             --help          Print this message
	     --man  	     Print manual page for this program
	     --pause n	     Pause n seconds after each validation request (default 15)
	     --rpause n      Randomly increase --pause up to this factor (default 1)
	     --shuffle	     Validate files in random order
	     --skipfile f    Skip files reported as validated in file f, the
    	    	    	     saved output of a previous run
	     --tree 	     Recursively validate files in subdirectories
	     --validator u   Use u as validator URL instead of $validator
             --verbose       Print verbose debugging information
             --version       Print version number
Version $version, $reldate.
The latest version of this program is always
available from: $homepage
EOD
    }

=head1 NAME

BulkValidator - Validate a collection of HTML/XHTML pages

=head1 SYNOPSIS

B<BulkValidator>
[B<--copyright>]
[B<--density> I<num>]
[B<--discrepancy> I<dir>]
[B<--firstfiles> I<num>]
[B<--help>]
[B<--man>]
[B<--pause> I<num>]
[B<--rpause> I<factor>]
[B<--shuffle>]
[B<--skipfile> I<file>]
[B<--tree>]
[B<--validator> I<url>]
[B<--verbose>]
[B<--version>]
[I<directory>]

=head1 DESCRIPTION

B<BulkValidator> submits all of the HTML/XHTML files either in a
specified directory (the current directory is assumed if none
is given) or in that directory and any subdirectories
to the W3C HTML validator and reports the results.  The
validation reports for any files which failed validation
are saved for review.

=head1 OPTIONS

All options may be abbreviated to their shortest
unambiguous prefix.

=over 5

=item B<--copyright>

Display copyright information.

=item B<--density> I<num>

A randomly chosen subset of I<num> percent of the files will
be validated.  If you have a large collection of mostly similar
files and do not want to spend the time or burden the validator
with processing them all, specify a modest percentage of the files
to test a sample of them.  You can use the
B<--firstfiles> option if you wish to unconditionally validate
some number of the first files in the list.  If no B<--density>
is specified, all files will be validated (equivalent to a
I<num> specification of 100).

=item B<--discrepancy> I<dir>

The validation reports for any files which failed validation
will be stored in the directory I<dir>, which will be created
if it does not already exist.  If no B<--discrepancy> directory
is specified, reports will be stored in a C<ValidationDiscrepancies>
directory created within the current directory.

=item B<--firstfiles> I<num>

The first I<num> files
will always be validated regardless of the
B<--density> specification.  The default is
0, which causes no files to be unconditionally validated.

=item B<--help>

Display a summary of the command line options.

=item B<--man>

Display this complete manual page.

=item B<--pause> I<num>

After each file is validated, B<BulkValidator> will pause for
I<num> seconds (plus an additional pause as specified by
B<--rpause>, see below).  The default is 15 seconds.  A modest
delay after each request avoids unduly burdening the
W3C Validator.

=item B<--rpause> I<factor>

If B<--pause> is nonzero, a random increment from zero to
the B<--rpause> I<factor> multiplied by the B<--pause> I<num>
will be added to the delay after each request.  The
I<factor> is a floating point number; the default is 1, which
results in a delay between the B<--pause> specification and
twice that value.

=item B<--shuffle>

If specified, files will be validated in random order.  If not
specified, files are validated in alphabetical order.

=item B<--skipfile> I<file>

The specified I<file> is the output from one or more previous runs of
B<BulkValidator> (which you can capture by redirecting standard output
to a file or piping it to B<tee>).  All files which passed validation in
previous runs will be skipped this time.  Use this option when you're
chasing down the validation errors in a collection of files;
only the files which failed the last time will be re-examined
on this run.

=item B<--tree>

All C<.html> and C<.htm> files in subdirectories recursively
traversed starting at the I<directory> specified on the
command line will be validated.  

=item B<--validator> I<url>

The specified I<url> is used to request validation instead
of the default C<http://validator.w3.org/check>.  The
validator must accept file uploads with the same form
fields as the W3C HTML validator and return pass/fail
results in the same syntax.

=item B<--verbose>

Generate verbose output to indicate what's going on.

=item B<--version>

Display version number.

=back

=head1 EXAMPLES

Validate all HTML files in the current directory, placing
discrepancy reports in a C<ValidationDiscrepancies>
subdirectory of the current directory.

    perl BulkValidator.pl

Validate the first 10 files in alphabetical order,
then 15% of the remaining files chosen at random from the
directory C</var/www/html/recipes/ratburger> and subdirectories,
placing discrepancy reports for any files which fail
validation in C</home/chef/goofs>.

    perl BulkValidator.pl --tree --firstfiles 10 --density 15 \
    	    	    	  --discrepancy /home/chef/goofs \
			  /var/www/html/recipes/ratburger

Validate files in C</var/www/html/recipes/ratburger>, saving the
pass/fail results in C</home/chef/goofs/val.log>.  Then, after
editing, revalidate all the files which failed to validate
the first time.

    perl BulkValidator.pl /var/www/html/recipes/ratburger \
    	    | tee /home/chef/goofs/val.log
    #	. . . Edit, edit, edit . . .
    perl BulkValidator.pl --skipfile /home/chef/goofs/val.log \
            /var/www/html/recipes/ratburger

=head1 FILES

If no I<directory> is specified on the command line, the
current directory is validated.

The validation summary is written to standard output.  You can redirect
this to a file or make a copy with B<tee> if you wish to use it in
subsequent runs to exclude already-validated files with the
B<--skipfile> option.

The validator reports for any files which failed validation are
stored in the B<--discrepancy> directory, which defaults to
C<ValidationDiscrepancies> in the current directory.  Files in
this directory are named with the path name of the validated file,
with all slashes replaced by underscores.  Validation reports
for files which previously failed validation but passed this time
will be automatically deleted, and the B<--discrepancy> directory
will be removed if, at the end of the run, no files remain within it.

=head1 BUGS

Please report bugs to B<bugs@fourmilab.ch>, indicating the version
numbers of B<BulkValidator>, Perl, and the Perl LWP module installed
on your system.

=head1 AUTHOR

John Walker
(B<http://www.fourmilab.ch/>)

=head1 SEE ALSO

    tee
    http://validator.w3.org/
    http://www.perl.org/

=head1 VERSION

This is B<BulkValidator> version 1.1, released on September 11th, 2007.
The current version of this program is always posted at:

http://www.fourmilab.ch/webtools/BulkValidator/

=head1 COPYRIGHT

This program is in the public domain.

=cut
