# -*-perl-*-
# htmlop.pl: Do operations on html documents.
$VERSION=0.2.4;
#
# Original source from Bjørn Borud, without it I would not have atempted
# this. In this incarnation it bears no resemblance to Bjørns code.
# - Nicolai Langfeldt 18/11/95.
#
# htmlop.pl does operations on html files, possebly many at a time.
# Operations:
# - Absolitify urls
# - Relativify urls
# - Gather list of urls
# - Callback with url
# - Canonify document w.r.t. SGML.
#
# Authors:
# - Nicolai Langfeldt (janl@ifi.uio.no)
# - Chris Szurgot (szurgot@itribe.net)
#
# Changes:
# janl 18/11/95 - Initial version
# szurgot 09/02/96 - Code in htmlop'process to remove Tag if we are
# returning the form. An unchanged base destroys local
# fragments.
# janl 22/02/96 - Added URLSUB functions. will only be removed from
# the returned doc, never from the origninal doc.
# janl 16/05/96 - Removed I* options and added single NODOC option/modifier
# to replace them.
# janl 09/09/96 - Added URLPROC and NREL opcodes, for a better URL
# processing model. -> 0.1.7
# janl 24/09/96 - Various cosmetics, no longer inserting a !SGML tag,
# only !DOCTYPE and HTML tags. -> 0.1.8
# janl 11/10/96 - URLs in HTML 3.2 tags are now found.
# janl 6/11/96 - Added netscape SCRIPT tag.
# janl 20/11/96 - 'BORDER' is different from 'BORDER=0' -> 0.1.10
# janl 06/11/96 - Added userdata argument to URLPROC.
# Added SAVEURLS and USESAVED opcodes. -> 0.1.11
# janl 11/04/97 - Changed to URI::URL and got into strict harness
# janl 13/04/97 - Fixed comment processing, extended it to processing
# directives. Bug reported by Chris Johnson. -> 0.1.12
# janl 08/04/97 - Made ISMAP into ISMAP=, bad! -> 0.1.13
# janl 23/05/97 - Treating '), # Embeded scripts
STYLE => quotemeta(''), # Embeded stylesheet
);
# These are the functions that pick the HTML to pieces. It will not
# work esp. good on a random SGML document since the HTML application
# of SGML has simpler quoting than it might.
sub gettoken {
# Get one token from the argument, removing it from the argument.
# BUG: There should be whitespace at the end of the examined string.
my($c,$token,$i);
# Skip whitespace and newlines
return '' unless defined(@_) && defined($_[0]);
$_[0] =~ s/^[\r\n\s]*//;
return '' if ($_[0] eq '');
$c = substr($_[0],0,1);
substr($_[0],0,1)='';
if ($c eq '"' || $c eq "\'") { # Quoted material
$i=index($_[0],$c);
# End-quote missing, just gobble the rest of the doc
$i=length($_[0]) if $i == -1;
# Extract and remove token
$token=substr($_[0],0,$i);
substr($_[0],0,$i+1)='';
} elsif ($c eq '=') {
$token='=';
} else { # Non-quoted material, ends in whitespace or =
$_[0] =~ m/[=\s\n\r]/;
$_[0] = $&.$';
$token=$c.$`;
}
# print "Token: '$token'\t\tRest: '",$_[0],"'\n";
return $token;
}
sub tagtoken {
# Pick the tag to pieces (also knonw as tokens). Return an
# associative array of attributes. The attribute-names are changed
# to uppercase. The attribute-values are left as is.
my($tok,$lasttok,%tokens);
# Append a space, gettoken needs it - silly? Yes!
# Change it to test on boundrary things rather than ...?
$_[0].=' ';
$lasttok='';
while (1) {
last if (($tok=uc &gettoken($_[0])) eq '');
if ($tok eq '=') {
# print STDERR " -bad html-" if ($lasttok eq '');
$tokens{$lasttok}.=&gettoken($_[0]);
print STDERR "STORED: $lasttok = ",$tokens{$lasttok},"\n" if $debug;
} else {
$tokens{$tok}=undef;
$lasttok=$tok;
}
}
return %tokens;
}
sub gettag {
# Pick out the following things from the remaining html doc:
# Everything leading up to the first tag. The first tag, and its
# contents. Modify @_ directly to reduce number of copies of
# possebly huge documents kept in memory at once. Return the body,
# the tag name, and the attributes (associative array)
my(%attr,$tagn,$tagc,$body,$tag,$doc);
$doc=\$_[0];
my($start,$end,$length);
$start=index($$doc,'<');
if ($start<$[) {
# EOF
$body=$$doc;
$$doc='';
return ($body,'',());
}
$end=index($$doc,'>',$start+1);
if ($end<$[) {
# This sucks, found no end of the tag...
$body=$$doc;
$$doc='';
return ($body,'',());
}
$length=$end-$start-1;
$body=substr($$doc,0,$start);
$tag=substr($$doc,$start+1,$length);
# This shortens the string in each itteration, some kind of mechanism
# to do it once in a while would speed things up further. HOWEVER, when
# I tried to code this all I got was a _nasty_ memory leak.
substr($$doc,0,$end+1)='';
# print STDERR "------\n";
# print STDERR "BODY: /$body/\n";
print STDERR "COMPLETE TAG: /$tag/\n" if $debug;
# print STDERR "REST: /",substr($$doc,0,20),"/\n";
# print STDERR "------\n";
# Examine tag contents
if ($tag =~ /^([!?]--)/ || $tag =~ /^(!\w+)/) {
# Comment or processing dicective, handle specially
$tagn=$&;
$tagc=$';
return ($body,$tagn,("$tagc",undef));
}
# Everything else
($tagn,$tagc) = split(/[\s\n\r]+/,$tag,2);
$tagn="\U$tagn";
return ($body,$tagn,()) if !defined($tagc);
return ($body,$tagn,&tagtoken($tagc));
}
# This is meant for general consumption:
sub process {
# Process a html file. Into one end you put a html file. Out
# of the other end you get something dependent on the operations
# you specified.
# I cannot gobble my arguments. I need to examine them several times.
my($arg,$doc,$i,$retdoc,$canon,$newdoc,@urllist,$Q,$cont);
my($origin,$baseurl);
$origin=$baseurl='';
$retdoc=$canon=0;
# Get the document from the argument list
$doc=shift(@_);
$retdoc=1;
$i=0;
# Argument checking
while (defined($arg=$_[$i++])) {
if (! ($arg =~ /^\d+$/)) {
print STDERR "ERROR IN HTMLOP::process:\n";
print STDERR "Args: ",join(',',@_),"\n";
print STDERR "ARG: $arg is not a opcode.\n";
exit(1);
}
if ($arg == $ABS) {
$baseurl=$origin=$_[$i++];
} elsif ($arg == $REL || $arg == $SAVEURL || $arg == $USESAVED) {
$i++; # Skip one arg
} elsif ($arg == $URLSUB || $arg == $NREL || $arg == $URLPROC ||
$arg == $TAGCALLBACK ) {
$i += 2; # Skip two args
} elsif ($arg == $LIST) {
# do nothing
} elsif ($arg == $CANON) {
$canon=1;
} elsif ($arg == $NODOC) {
$retdoc=0;
} else {
die "htmlop: Incorrect invocation of html_process\n";
}
}
my($endhtml)=0; # Have we seen