#! /usr/bin/perl

    #       	    	    Verify URL at a Web Site
    
    #	    	by John Walker (http://www.fourmilab.ch)  --  October 2006
    
    #	For details see:
    
    #	    http://www.fourmilab.ch/documents/corrupted_downloads/#Verifying
    
    #	This program is in the public domain.
    
    use strict;
    use warnings;

    #   Configuration
    
    my $DirectDigest = 1;   	    	    # Compute digests using Perl modules ?
    my $Site = 'www.fourmilab.ch';  	    # Site domain name
    my $IPaddress = qr/^193\.8\.230\.138$/; # Regular expression which matches IP addresses
    	    	    	    	    	    # for your site.  If you don't wish to permit
					    # URLs with IP addresses, define something which
					    # will never match.
    my $domainName = qr/^(\w+\.)?fourmilab\.\w+$/i;  # Regular expression which will match all
    	    	    	    	    	    # domain names which are valid for your site
    my $FTP_base = "/server/pub/$Site";     # Absolute path name for FTP tree or null if no FTP
    my $HTTP_base = "$FTP_base/web";	    # Absolute path name for HTTP tree
    my $logfile = "/tmp/VerifyURL.log";     # Absolute path of log file or null for none
    
    my $siteName = "Fourmilab";     	    # Public name of your site
    my $logoURL = "/images/logo/swlogo.png"; # URL to logo for your site
    
    #	Globals
    
     my $url = '(unspecified)';		    # Raw URL as submitted; placeholder value lets
    					    # errorOut() run before any URL is received
         
    #	Decode the CGI request into WWW_* environment variables
    parse_query_string();
    
    if (!defined($ENV{WWW_url})) {
    	errorOut("No URL specified.");	    # Form submitted without a "url" field
    }
    
    $url = $ENV{WWW_url};
    my ($length, $md5, $sha1) = qw(? ? ?);  # File length and digests: "?" until computed
    
    #	Split the URL into protocol, site name, and path components,
    #	validating each as we go.  Every check calls errorOut() (which
    #	does not return) on failure, so code below a test may assume
    #	the test passed.
    
    my $purl = $url;
    my ($proto, $sitename, $path);
    if ($purl =~ s-^(\w+)://--) {	    # Strip and capture the "proto://" prefix
    	$proto = $1;
    } else {
    	errorOut("Invalid protocol specification (must be http:, https:, or ftp:)");
    }
    
    if ($proto !~ m/^(http|https|ftp)$/) {
    	errorOut("Unsupported protocol (must be http:, https:, or ftp:)");
    }
    
    if ($purl =~ s-^\s*([\w\.]+)\s*/--) {   # Strip and capture the host name
    	$sitename = $1;
    } else {
    	errorOut("Invalid site name.");
    }
    
    #	The host must designate our own site, either by IP address
    #	or by one of our domain names.
    if (!(($sitename =~ m/$IPaddress/) ||
    	  ($sitename =~ m/$domainName/))) {
    	errorOut("This URL specifies a non-$siteName Web site.");
    }
    
    #	Elide potential sandbox escaping items from path name.
    #	Note that we actually kill any request which contains a
    #	suspected attempt to breach security.  Why encourage 'em?
    
    $path = $purl;
    
    if ($path =~ s:[^\w/\-\.]::g) {	    # s///g is true if anything was removed
    	errorOut("Invalid character in URL.");
    }
    
    #	Note that some of the items explicitly checked for in the cases
    #	below should have already been excluded by the character set
    #	restriction test above.  You can't be too careful, and this
    #	provides extra protection for users who weaken the above test
    #	to support sites which use additional characters in URLs.
    #	(The || chain short-circuits, but since any single match is
    #	fatal, stopping at the first suspicious item is sufficient.)
    
    if (($path =~ s/\.\.//g) ||     	# No parent directory specifications
    	($path =~ s://:/:g) ||	    	# No double slashes of any kind
	($path =~ s-^/+--) ||	    	# No leading slashes in path
	($path =~ s:/\.+::) ||	    	# No leading periods in file names
	($path =~ s:/\-+::) ||	    	# No leading hyphens in file names
	($path =~ s:[^\w/\-\.]::g)) {	# No characters except \w, hyphens, and periods
    	errorOut("Invalid URL.");
    }
   
    if ($path =~ m:^cgi\-bin/:) {	    # Never reveal digests of CGI programs
    	errorOut("Cannot verify dynamic (CGI) Web resources.");
    }
    
    #	Map the URL path onto the file system: FTP URLs are served
    #	from $FTP_base (when configured), everything else from
    #	$HTTP_base.
    if (($proto =~ m/ftp/) && ($FTP_base ne '')) {
    	$path = "$FTP_base/$path";
    } else {
    	$path = "$HTTP_base/$path";
    }
    
    $path =~ s:/$:/index\.html:;	    # A directory URL verifies its index.html
    
    if (!(-f $path)) {
    	errorOut("File does not exist on the $siteName server.");
    }
    
    if (!(-r $path)) {
    	errorOut("File is not available for downloading.");
    }
    
    #	Determine the file's length and compute its MD5 and SHA1
    #	digests, either in-process with the Digest:: modules or by
    #	running the md5sum/sha1sum utilities.

    $length = -s $path;
    
    if ($DirectDigest) {
    	#   Digest::SHA is a core module which supersedes the obsolete
    	#   Digest::SHA1; Digest::SHA->new(1) selects the SHA-1
    	#   algorithm, producing identical digests.
    	use Digest::MD5;
	use Digest::SHA;
	
	my $cmd5 = Digest::MD5->new;
	my $csha1 = Digest::SHA->new(1);
	open(my $fi, '<', $path) || errorOut("Unable to open file for input.");
	binmode($fi);	    	    # Digest the raw bytes (no-op on Unix,
				    # essential on systems with CR-LF translation)
	$cmd5->addfile($fi);
	seek($fi, 0, 0);    	    # Rewind so the second digest reads the whole file
	$csha1->addfile($fi);
	close($fi);
	$md5 = $cmd5->hexdigest;
	$sha1 = $csha1->hexdigest;
    } else {
	#   Fall back on the command-line utilities.  The list-form
	#   pipe open never invokes a shell, so nothing in $path can
	#   be misinterpreted (it has already been restricted to
	#   [\w/.-] characters in any case).
	$md5 = first_token_of('md5sum', $path);
	$sha1 = first_token_of('sha1sum', $path);
    }
    
    #	Run a command with arguments via a shell-free pipe and return
    #	the first whitespace-delimited token of its standard output
    #	(md5sum and sha1sum print "<digest>  <filename>").
    sub first_token_of {
	my ($cmd, @cargs) = @_;
	open(my $ph, '-|', $cmd, @cargs) || errorOut("Unable to run $cmd.");
	my $line = <$ph>;
	close($ph);
	$line = '' if !defined($line);	# Command produced no output
	$line =~ s/^\s+//;
	$line =~ s/\s.*$//s;
	return $line;
    }
   
    #	Emit the verification report.  The CGI header lines end with
    #	an explicit CR-LF.  $url is interpolated into the page without
    #	HTML quoting: execution only reaches this point after the
    #	character-set checks above, so the URL cannot contain HTML
    #	metacharacters.
    print <<"EOH";
Content-Type: text/html\r
\r
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>URL Verification</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
</head>
<body>
<h1><a href="http://$Site/"><img src="$logoURL"
    width="82" height="74"
    style="padding-right: 1em; border: 0px; vertical-align: middle;"
    alt="$siteName home" /></a>
    $siteName URL Verified</h1>
<hr />
<p>
The URL:
</p>

<pre>
    <a href="$url">$url</a>
</pre>

<p>
is present on the $siteName server and available for downloading.
</p>

<pre>
    File length:    <b>$length</b> bytes
    MD5 signature:  <b>$md5</b>
    SHA1 signature: <b>$sha1</b>
</pre>
</body>
</html>
EOH

    #	Record the successful verification and the file length.
    appendLogFile("OK $length");
	
    
    #	Parse the CGI QUERY_STRING argument (placed in the environment when
    #	the Web server launches this script).  This function sets environment
    #	variables named WWW_varname for each field in the form with value the
    #	unquoted contents of each field.  This rather odd way of returning
    #	results ("Why not just use a regular hash?") is used to maintain
    #	compatibility with the UNCGI program.  A Perl script which previously
    #	required execution under the control of UNCGI may be run stand-alone
    #	by calling this function at the top.  This halves the number of processes
    #	the server must fork in order to process the request, and is one
    #	fewer thing for the beleaguered system administrator to install.
    
    sub parse_query_string {
    	my ($rmeth, @args, $instream, $var, $varname, $value);
	
	$rmeth = $ENV{'REQUEST_METHOD'};    # Request method ("GET" or "POST")
	$rmeth = '' if !defined($rmeth);    # Avoid undef warnings when run by hand
	$rmeth =~ tr/a-z/A-Z/;	    	    # Just in case server doesn't shout
	if ($rmeth eq 'GET') {
	    my $qs = $ENV{'QUERY_STRING'};
	    $qs = '' if !defined($qs);	    # GET with no query string at all
	    @args = split(/&/, $qs);
	} elsif ($rmeth eq 'POST') {
	    my $clen = $ENV{'CONTENT_LENGTH'};
	    $clen = 0 if !defined($clen);   # Treat a missing length as empty body
	    read(STDIN, $instream, $clen);
	    $instream = '' if !defined($instream);  # read() failed or zero length
	    @args = split(/&/, $instream);
	} else {
	    die("VerifyURL request invoked without valid REQUEST_METHOD");
	}
	foreach $var (@args) {
	    #	Split on the first "=" only, so "=" characters
	    #	appearing within a field's value are preserved.
	    ($varname, $value) = split(/=/, $var, 2);
	    $value = '' if !defined($value);	# Field name with no "=value" part
	    $varname = unescape_URL($varname);
	    $value = unescape_URL($value);
	    #	Elide any server-side includes a mischievous user may have
	    #	embedded in a value string.  This protects against mayhem if
	    #	the value is included in an HTML reply.
	    $value =~ s/<!--(.|\n)*-->//g;
	    
	    #	Define an "uncgi" compatible environment variable for
	    #	each name/value pair.
	    
	    $ENV{"WWW_$varname"} = $value;
	}
    }
    
    #	Decode a URL-encoded string.  Plus signs become spaces and
    #	"%xx" hex escape sequences become the corresponding
    #	characters.  No error checking is done for malformed "%xx"
    #	sequences: a percent sign not followed by two hex digits is
    #	passed through unchanged.
    
    sub unescape_URL {
    	my ($encoded) = @_;
	
	$encoded =~ tr/+/ /;
	$encoded =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
	
	return $encoded;
    }
    
    #	URL-encode a string: each character other than letters,
    #	digits, and the safe punctuation $-_.!*() is replaced by
    #	its %XX hexadecimal escape.

    sub escape_URL {
    	my ($raw) = @_;

	my $quoted = join('',
	    map { m/[A-Za-z0-9\$\-_\.\!\*\(\)]/
	    	      ? $_
		      : sprintf('%%%02X', ord($_)) }
	    split(//, $raw));
	return $quoted;
    }
    
    #	The quoteHTML function quotes all HTML metacharacters
    #	in its argument and expands characters which are not Latin-1
    #	graphics to HTML numeric entities.  The quoted string is
    #	returned.
    #
    #	(The previous implementation consumed the string with
    #	"s/^(.)//", which stops at the first newline because "."
    #	does not match \n without /s, silently truncating the
    #	result.  Iterating over every character fixes that, so
    #	control characters are now entity-quoted as the $o < 32
    #	test always intended.)


    sub quoteHTML {
        my ($s) = @_;

        my $os = '';

        foreach my $c (split(//, $s)) {
            my $o = ord($c);
            if (($c eq '<') || ($c eq '>') || ($c eq '&') || ($c eq '"') ||
                ($o < 32) ||
                (($o >= 127) && ($o < 161)) || ($o > 255)) {
                $os .= "&#$o;";     # Metacharacter / non-graphic: numeric entity
            } else {
                $os .= $c;          # Latin-1 graphic character: pass through
            }
        }
        return $os;
    }

    #	Append an entry to the log file (if $logfile is configured).
    #	The string argument is used as the request status.  Each log
    #	line records the client's IP address, a local-time date stamp,
    #	the (HTML-quoted) requested URL, and the status.
    
    sub appendLogFile {
    	my ($status) = @_;

	if (defined($logfile) && ($logfile ne '')) {
	    #	Three-argument open with a lexical handle: the mode
	    #	can never be injected via the file name.
	    open(my $fl, '>>', $logfile) || die("Cannot append to $logfile");
	    my ($sec, $min, $hour, $mday, $mon, $year) = localtime(time());
	    my $dtime = sprintf("%04d-%02d-%02d %02d:%02d:%02d",
		$year + 1900, $mon + 1, $mday, $hour, $min, $sec);
	    #	Either field may be absent when run outside a Web
	    #	server; substitute placeholders to keep the log
	    #	format consistent.
	    my $addr = defined($ENV{REMOTE_ADDR}) ? $ENV{REMOTE_ADDR} : '-';
	    my $lurl = defined($ENV{WWW_url}) ? $ENV{WWW_url} : '';
	    print {$fl} $addr . ' ' .
	    	    $dtime . ' "' .
		    quoteHTML($lurl) . '" "' .
		    $status . '"' . "\n";
	    #	Buffered write errors only surface at close time.
	    close($fl) || die("Cannot append to $logfile");
	}	
    }

    #   Generate error report if something goes wrong.  Prints a
    #	complete XHTML error page describing $message, appends an
    #	"ERR" entry to the log file, and exits.  This function
    #	never returns to its caller.

    sub errorOut {
    	my ($message) = @_;
    	# An invalid URL may contain arbitrary characters, so it must
    	# be HTML-quoted before being interpolated into the reply.
    	my $qurl = quoteHTML($url);
	
    	print <<"EOH";
Content-Type: text/html\r
\r
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>URL Verification Error</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
</head>
<body>
<h1>Error Verifying URL</h1>
<hr />
<p>
The following error:
</p>

<blockquote>
<p>
<b>$message</b>
</p>
</blockquote>

<p>
occurred while processing your URL verification request
for:
</p>

<pre>
    $qurl
</pre>
</body>
</html>
EOH
    	appendLogFile("ERR $message");
        exit(0);
    }
