#!/usr/public/bin/perl
# ---------------------------------------------------------------------------
$Version = 'oldlog2new-1.0';    # Program name and release; shown by usage()
#
# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
#
# This software has been developed by Roy Fielding <fielding@ics.uci.edu> as
# part of the Arcadia project at the University of California, Irvine.
# It is based on the wwwstat log analyzer.  All this program does is read
# in the old log, figure out what each entry points to, find the current
# file size for that entity, and output the new format including a reasonable
# approximation of the server response code.
#
# usage -- report how the program is invoked, then stop.
#
# Builds the help text (with the $Version string interpolated) and hands it
# to die, so the text goes to STDERR and the program ends with a failure
# status.  Takes no arguments and never returns to its caller.
sub usage {
    local($help_text) = <<"EndUsage";
usage: oldlog2new [-hez] [-f logfile] [-s srmfile]

$Version
Convert an NCSA httpd 1.1 access_log file to a 1.2 access_log
Display Options:
     -h  Help -- just display this message and quit.
     -e  Display all invalid log entries on STDERR. (default is to ignore them)
Input Options:
     -f  Read from the following access_log file instead of the default.
     -z  Use zcat to uncompress the log file while reading [requires -f].
     -s  Get the server directives from the following srm.conf file.
EndUsage

    # The text ends in a newline, so die prints it without a line number.
    die $help_text;
}
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author Roy Fielding at <fielding@ics.uci.edu>.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.  THIS SOFTWARE IS PROVIDED ``AS
# IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#   
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#   
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---------------------------------------------------------------------------
# Set the default configuration options:

# Time-zone offset from GMT, written as (+/-)HHMM.  It is appended to the
# timestamp of every converted entry, so edit it to match the server.

$GMToffset = "-0700";

# Path, below the document root, of the server's default home page.
# Requests for "/" and for the top-level index file are mapped to it.

$ServerHome = '/ICShome.html';

# Default locations of the old-format access log and of the server
# configuration (srm.conf) file.  The -f and -s options override them.

$access_log = "/dc/ud/www/httpd_1.1/logs/access_log";
$srm_conf   = "/dc/ud/www/httpd_1.1/conf/srm.conf";

# Command that writes an uncompressed copy of a compressed file to its
# standard output.  Set it to the null string if no such command exists.

$zcat = "gunzip -c";

# Number of bytes reported for a request that is answered by a script,
# since a script's real output size cannot be looked up on disk.

$ScriptEstimate = 1000;    # in bytes (must be greater than zero)

# Does the server answer a directory request that lacks the trailing slash
# with an automatic redirect?  (1 for httpd_1.1,  0 for httpd_1.0)

$DirectoryRedirect = 1;

# Was the log written with rfc931 support (IdentityCheck on)?
# Must be 1 if the server uses rfc931 remote ident, otherwise 0.

$IdentityCheck = 0;

# Switches that the command line can turn on: 1 means yes, 0 means no.

$CompressedLog     = 0;    # Access log has been compressed (or gzipped)?
$PrintInvalids     = 0;    # Display invalid log entries on STDERR?

# ==========================================================================
# Get the command-line options

# Parse the command line.  Getopts() stores each switch it finds in the
# variable $opt_<letter>; the switches followed by ':' in the specification
# (-f and -s) take an argument, which becomes the variable's value.
# NOTE(review): getopts.pl is the Perl 4 library file; on a Perl that no
# longer ships it, Getopt::Std's getopts() looks like the drop-in
# replacement -- confirm before switching.

require "getopts.pl";

# An unrecognized switch is reported through Getopts()'s false return
# value, not through $@, so it is the result of the call that is tested.

$options_ok = &Getopts('hezf:s:');
if (!$options_ok || $opt_h) { &usage; }

if ($opt_e) { $PrintInvalids     = 1; }
if ($opt_z) {
   if ($opt_f) { $CompressedLog  = 1; }       # Require logfile name if
   else        { &usage; }                    # uncompression is desired
}
if ($opt_f) { $access_log    = $opt_f; }
if ($opt_s) { $srm_conf      = $opt_s; }

# Reading a compressed log is impossible without a decompression command.

if ($CompressedLog && !$zcat) {
    die "No zcat decompression command has been defined, stopped";
}

# ==========================================================================
# Get the other needed configuration items from the srm.conf file

# Read the server resource map (srm.conf) and collect the directives that
# decide how a requested URL maps onto a file:
#
#   $DocumentRoot, $UserDir, $DirectoryIndex   -- single values
#   %AllRedirects, %AllAliases, %AllScripts    -- URL prefix => target
#
# The keys of the three tables are stored with every non-word character
# backslash-escaped, because they are later used as match patterns.

open (SRM,$srm_conf) ||
    die "Error opening config file: $srm_conf ($!)\n";

$UserDir        = "public_html";              # Start with NCSA defaults
$DirectoryIndex = "index.html";
$DocumentRoot   = "/usr/local/etc/httpd/htdocs";

while (<SRM>)
{
    next if ( ($_ eq "\n") || /^\#/ ); # Ignore blank and comment lines

    # The single-valued directives accept any run of whitespace after the
    # directive name, as the alias directives below do, and the captured
    # value is trimmed so that stray blanks never become part of a path.

    if (/^DocumentRoot\s+(.*\S)/)
    {
        $DocumentRoot = $1;
    }
    elsif (/^UserDir\s+(.*\S)/)
    {
        $UserDir = $1;
    }
    elsif (/^DirectoryIndex\s+(.*\S)/)
    {
        $DirectoryIndex = $1;
    }
    elsif (/^Redirect\s+(\S+)\s+(\S+)\s/)
    {
        $alias = $1;
        $rname = $2;
        $alias =~ s/(\W)/\\$1/g;          # Needed for later pattern match
        $AllRedirects{$alias} = $rname;
    }
    elsif (/^Alias\s+(\S+)\s+(\S+)\s/)
    {
        $alias = $1;
        $rname = $2;
        $alias =~ s/(\W)/\\$1/g;          # Needed for later pattern match
        $AllAliases{$alias} = $rname;
    }
    elsif ( /^ScriptAlias\s+(\S+)\s+(\S+)\s/ ||
            /^OldScriptAlias\s+(\S+)\s+(\S+)\s/ )
    {
        # $1 and $2 come from whichever of the two patterns matched.
        $alias = $1;
        $rname = $2;
        $alias =~ s/(\W)/\\$1/g;          # Needed for later pattern match
        $AllScripts{$alias} = $rname;
    }
}
close(SRM);

# ==========================================================================
# Now read log, figure out the response code and bytes, and output new format
# 

# Main conversion loop.  Every line of the old-format log is parsed, the
# requested URL is mapped to a file (or recognized as a redirect or a
# script), a response code and byte count are estimated from what is on
# disk now, and the entry is printed on STDOUT in the new format:
#
#   host ident - [DD/Mon/YYYY:HH:MM:SS +/-HHMM] "METHOD url [version]" code bytes
#
# Lines that cannot be parsed are skipped; they are echoed on STDERR,
# prefixed with their line number, when $PrintInvalids is set.

# A compressed log is read through a pipe from the $zcat command.

if ($CompressedLog) { $access_log = "$zcat $access_log |"; }

open (LOG,$access_log) ||
    die "Error opening access log file: $access_log ($!)\n";

LINE: while (<LOG>)
{
    $saveline = $_;

    $ident = "-";

    if ($IdentityCheck)           # Does log include IdentityCheck info?
    {
        # The remote ident is whatever precedes the "@host" that comes
        # before the bracketed date.  Act only when the pattern really
        # matched (otherwise $1 would be left over from an earlier match),
        # and never look past the first '[' so that an '@' inside the
        # requested URL is not mistaken for the ident separator.
        if (/^([^\[]*)\@\S+\s/)
        {
            $ident = $1 if ($1 ne '');     # Save ident for later use
            $saveline =~ s/^[^\[]*\@(\S+\s)/$1/;  # Remove ident from log
        }
        $_ = $saveline;
    }

    $htv = '';

    ($afield, $date, $method, $oname, $htv) =
         /^(\S+) \[(.+)\] (\S+)\s+(\S+)\s(.*)$/;

    # The old date stamp is always 24 characters long.
    if (!($afield && $date && $method && $oname && (length($date) == 24)))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    #
    # First, we have to figure out what file or script was accessed
    #

    $fname = $oname;

    $fname =~ s/\?.*$//;              # Remove any trailing query information
    $fname =~ s/\#.*$//;              # Remove any trailing anchor information
    $fname =~ s#//#/#g;               # Remove any extra slashes

    if (($fname eq "") || ($fname eq "HTTP/1.0"))
    {
        print(STDERR "$.:$saveline") if $PrintInvalids;
        next LINE;
    }

    FNAME:                        # Get the document's real name
    {
        $rname = "";                         # and start with it unknown
        $rcode = 200;
        $fsize = 0;
        study $fname;

        if (($fname eq "/") || ($fname eq "/$DirectoryIndex"))
        {
            $fname = "$ServerHome";          # Handle top file with extra care
            $rname = "$DocumentRoot$fname";
            last FNAME;
        }

        foreach $redir (keys(%AllRedirects)) # Is it a redirected file?
        {
            if ( $fname =~ /^$redir/ )
            {
                $rcode = 302;
                last FNAME;
            }
        }

        foreach $alias (keys(%AllAliases))   # Is it a file name alias?
        {
            if ( $fname =~ /^$alias/ )
            {
                $rname = $fname;
                $rname =~ s#^$alias#$AllAliases{$alias}#;
                last FNAME;
            }
        }

        # NOTE(review): a /~username/... request that names a directory is
        # not given the DirectoryIndex lookup that DocumentRoot directories
        # get further down -- confirm whether that is intended.
        if ($fname =~ /^\/~(\w+)\// )        # Is it a /~username/...?
        {
            $user  = $1;
            @pwent = getpwnam($user);
            $dir   = $pwent[7];              # Home directory, if user exists
            if ($dir)
            {
                $rname = $fname;
                $rname =~ s#^/~$user#$dir/$UserDir#;
            }
            else
            {
                $rcode = 404;
            }
            last FNAME;
        }

        if ($fname =~ /^\/~(\w+)$/ )         # Is it a /~username ?
        {
            $user  = $1;
            @pwent = getpwnam($user);
            $dir   = $pwent[7];              # Home directory, if user exists
            if ($dir)
            {
                $rname = $fname;
                $rname =~ s#^/~$user#$dir/$UserDir#;
                if (-e "$rname/$DirectoryIndex")
                {
                    if ($DirectoryRedirect)
                    {
                        $rcode = 302;
                        last FNAME;
                    }
                    $rname .= "/$DirectoryIndex";
                }
                $fname .= '/';
            }
            else
            {
                $rcode = 404;
            }
            last FNAME;
        }

        foreach $alias ( keys(%AllScripts) ) # Is it a script directory alias?
        {
            if ( $fname =~ /^$alias/ )
            {
                $fsize = $ScriptEstimate;    # Estimate bytes from script
                last FNAME;
            }
        }

        if (-d "$DocumentRoot$fname")        # Is it a directory?
        {
            $hasSlash = ($fname =~ s/\/$//); # Remove any trailing slash
            if (-e "$DocumentRoot$fname/$DirectoryIndex")
            {
                if (!$hasSlash && $DirectoryRedirect)
                {
                    $rcode = 302;
                    last FNAME;
                }
                $rname = "$DocumentRoot$fname/$DirectoryIndex";
            }
            else
            {
                $rname = "$DocumentRoot$fname";
            }
            $fname .= '/';
            last FNAME;
        }

        $rname = "$DocumentRoot$fname";      # It must be a normal file

    } # end FNAME

    #
    # Get the file size, through use of a cache of Sizes keyed by $fname.
    # Existence is tested separately from size, so that an existing but
    # empty file is reported as 200 with 0 bytes instead of as missing.
    #

    $xname = 0;

    if (!$fsize && ($rcode == 200) && $rname)
    {
        if (defined($Sizes{$fname}))
        {
            $fsize = $Sizes{$fname};
        }
        elsif (-e $rname)
        {
            $fsize = $Sizes{$fname} = (-s _) + 0;  # _ reuses the -e stat
        }
        else
        {
            $xname = 1;                      # Nothing there: not found
        }
    }

    if ($xname) { $rcode = 404; }

    if (!(($method eq 'GET')||($method eq 'HEAD')||($method eq 'POST')))
    {
        $rcode = 400;                        # Unknown method: bad request
    }

    if    ($rcode  != 200)    { $fsize = '-'; }   # No body size on errors
    elsif ($method eq 'HEAD') { $fsize =   0; }   # HEAD sends no body

    if ($htv) { $oname .= ' '. $htv; }       # Put the HTTP version back

    #
    # Phew!  Now we have to swap the date format around.  The old stamp is
    # "Www Mmm DD HH:MM:SS YYYY"; the pieces are picked out by position.
    #

    $newdate = substr($date, 8, 2) .'/'.
               substr($date, 4, 3) .'/'.
               substr($date,20, 4) .':'.
               substr($date,11, 9) . $GMToffset; 

    $newdate =~ s/^ /0/;                     # Zero-pad a one-digit day

    #
    # Now that we have categorized it, print it in the new format
    #

    print($afield,' ',$ident,' - [',$newdate,'] "',$method,' ',$oname,
          '" ',$rcode,' ',$fsize," \n");

}
close(LOG);

exit(0);