# -*-perl-*- # htmlop.pl: Do operations on html documents. $VERSION=0.2.4; # # Original source from Bjørn Borud, without it I would not have atempted # this. In this incarnation it bears no resemblance to Bjørns code. # - Nicolai Langfeldt 18/11/95. # # htmlop.pl does operations on html files, possebly many at a time. # Operations: # - Absolitify urls # - Relativify urls # - Gather list of urls # - Callback with url # - Canonify document w.r.t. SGML. # # Authors: # - Nicolai Langfeldt (janl@ifi.uio.no) # - Chris Szurgot (szurgot@itribe.net) # # Changes: # janl 18/11/95 - Initial version # szurgot 09/02/96 - Code in htmlop'process to remove Tag if we are # returning the form. An unchanged base destroys local # fragments. # janl 22/02/96 - Added URLSUB functions. will only be removed from # the returned doc, never from the origninal doc. # janl 16/05/96 - Removed I* options and added single NODOC option/modifier # to replace them. # janl 09/09/96 - Added URLPROC and NREL opcodes, for a better URL # processing model. -> 0.1.7 # janl 24/09/96 - Various cosmetics, no longer inserting a !SGML tag, # only !DOCTYPE and HTML tags. -> 0.1.8 # janl 11/10/96 - URLs in HTML 3.2 tags are now found. # janl 6/11/96 - Added netscape SCRIPT tag. # janl 20/11/96 - 'BORDER' is different from 'BORDER=0' -> 0.1.10 # janl 06/11/96 - Added userdata argument to URLPROC. # Added SAVEURLS and USESAVED opcodes. -> 0.1.11 # janl 11/04/97 - Changed to URI::URL and got into strict harness # janl 13/04/97 - Fixed comment processing, extended it to processing # directives. Bug reported by Chris Johnson. -> 0.1.12 # janl 08/04/97 - Made ISMAP into ISMAP=, bad! -> 0.1.13 # janl 23/05/97 - Treating '), # Embeded scripts STYLE => quotemeta(''), # Embeded stylesheet ); # These are the functions that pick the HTML to pieces. It will not # work esp. good on a random SGML document since the HTML application # of SGML has simpler quoting than it might. sub gettoken { # Get one token from the argument, removing it from the argument. # BUG: There should be whitespace at the end of the examined string. my($c,$token,$i); # Skip whitespace and newlines return '' unless defined(@_) && defined($_[0]); $_[0] =~ s/^[\r\n\s]*//; return '' if ($_[0] eq ''); $c = substr($_[0],0,1); substr($_[0],0,1)=''; if ($c eq '"' || $c eq "\'") { # Quoted material $i=index($_[0],$c); # End-quote missing, just gobble the rest of the doc $i=length($_[0]) if $i == -1; # Extract and remove token $token=substr($_[0],0,$i); substr($_[0],0,$i+1)=''; } elsif ($c eq '=') { $token='='; } else { # Non-quoted material, ends in whitespace or = $_[0] =~ m/[=\s\n\r]/; $_[0] = $&.$'; $token=$c.$`; } # print "Token: '$token'\t\tRest: '",$_[0],"'\n"; return $token; } sub tagtoken { # Pick the tag to pieces (also knonw as tokens). Return an # associative array of attributes. The attribute-names are changed # to uppercase. The attribute-values are left as is. my($tok,$lasttok,%tokens); # Append a space, gettoken needs it - silly? Yes! # Change it to test on boundrary things rather than ...? $_[0].=' '; $lasttok=''; while (1) { last if (($tok=uc &gettoken($_[0])) eq ''); if ($tok eq '=') { # print STDERR " -bad html-" if ($lasttok eq ''); $tokens{$lasttok}.=&gettoken($_[0]); print STDERR "STORED: $lasttok = ",$tokens{$lasttok},"\n" if $debug; } else { $tokens{$tok}=undef; $lasttok=$tok; } } return %tokens; } sub gettag { # Pick out the following things from the remaining html doc: # Everything leading up to the first tag. The first tag, and its # contents. Modify @_ directly to reduce number of copies of # possebly huge documents kept in memory at once. Return the body, # the tag name, and the attributes (associative array) my(%attr,$tagn,$tagc,$body,$tag,$doc); $doc=\$_[0]; my($start,$end,$length); $start=index($$doc,'<'); if ($start<$[) { # EOF $body=$$doc; $$doc=''; return ($body,'',()); } $end=index($$doc,'>',$start+1); if ($end<$[) { # This sucks, found no end of the tag... $body=$$doc; $$doc=''; return ($body,'',()); } $length=$end-$start-1; $body=substr($$doc,0,$start); $tag=substr($$doc,$start+1,$length); # This shortens the string in each itteration, some kind of mechanism # to do it once in a while would speed things up further. HOWEVER, when # I tried to code this all I got was a _nasty_ memory leak. substr($$doc,0,$end+1)=''; # print STDERR "------\n"; # print STDERR "BODY: /$body/\n"; print STDERR "COMPLETE TAG: /$tag/\n" if $debug; # print STDERR "REST: /",substr($$doc,0,20),"/\n"; # print STDERR "------\n"; # Examine tag contents if ($tag =~ /^([!?]--)/ || $tag =~ /^(!\w+)/) { # Comment or processing dicective, handle specially $tagn=$&; $tagc=$'; return ($body,$tagn,("$tagc",undef)); } # Everything else ($tagn,$tagc) = split(/[\s\n\r]+/,$tag,2); $tagn="\U$tagn"; return ($body,$tagn,()) if !defined($tagc); return ($body,$tagn,&tagtoken($tagc)); } # This is meant for general consumption: sub process { # Process a html file. Into one end you put a html file. Out # of the other end you get something dependent on the operations # you specified. # I cannot gobble my arguments. I need to examine them several times. my($arg,$doc,$i,$retdoc,$canon,$newdoc,@urllist,$Q,$cont); my($origin,$baseurl); $origin=$baseurl=''; $retdoc=$canon=0; # Get the document from the argument list $doc=shift(@_); $retdoc=1; $i=0; # Argument checking while (defined($arg=$_[$i++])) { if (! ($arg =~ /^\d+$/)) { print STDERR "ERROR IN HTMLOP::process:\n"; print STDERR "Args: ",join(',',@_),"\n"; print STDERR "ARG: $arg is not a opcode.\n"; exit(1); } if ($arg == $ABS) { $baseurl=$origin=$_[$i++]; } elsif ($arg == $REL || $arg == $SAVEURL || $arg == $USESAVED) { $i++; # Skip one arg } elsif ($arg == $URLSUB || $arg == $NREL || $arg == $URLPROC || $arg == $TAGCALLBACK ) { $i += 2; # Skip two args } elsif ($arg == $LIST) { # do nothing } elsif ($arg == $CANON) { $canon=1; } elsif ($arg == $NODOC) { $retdoc=0; } else { die "htmlop: Incorrect invocation of html_process\n"; } } my($endhtml)=0; # Have we seen ? my($SGML)=1; # 1: !DOCTYPE not seen, 2: !DOCTYPE seen, -1: it's OK # These are used to store the tag components my($textpart); my($tagname); my(%attrval); my($wholetag); # The whole tag, put together again my($moretag); # TMP storage of tag attributes my($attr); # Looping thru attributes, misnomer. my($RE); my($subst); my($prefix); my($fun); # Function to apply to url my($verbatim)=''; my($url_o,$path); # URL object Used in ABS to remove leading .. # Welcome to the machine $newdoc=''; while ($doc ne '') { ($textpart,$tagname,%attrval)=&gettag($doc); # Process the tag { # Need a way to get out of this, last is my friend # 'Canonize' if ($canon) { $endhtml=1 if ($tagname eq '/HTML'); if ($SGML!=-1 && $canon && $tagname ne '!--' && $tagname ne '!SGML') { $wholetag=$tagname; $moretag=join('',keys %attrval); $wholetag.=' '.$moretag if ($moretag ne ''); if ($SGML==1) { $SGML=2; if ($tagname eq '!DOCTYPE') { $tagname=$wholetag; %attrval=(); last; } # We have no idea what DTD this doc follows so we put in # something kinda non-comitting $newdoc.=''."\n"; } if ($SGML==2) { $SGML=-1; if ($tagname eq 'HTML') { $tagname=$wholetag; %attrval=(); last; } $newdoc.=''."\n"; } } # sgml!=-1 } #canon # Tack on text part before I bail out, if wanted $newdoc.=$textpart if $retdoc; if ($tagname eq 'BASE') { if (exists($attrval{'HREF'})) { if ($origin) { $baseurl=(url($attrval{'HREF'})->abs($origin,1))->as_string; } else { $baseurl=$attrval{'HREF'}; } # Get rid of the HREF attribute. Netscape 4.0 puts (other) stuff # into BASE that is not found in the HTML 4.0 spec. delete $attrval{'HREF'}; print STDERR "\nBase tag: $baseurl\n" if $debug; } } # URL processing $i=0; while (defined($arg=$_[$i])) { if (! ($arg =~ /^\d+$/)) { print STDERR "Args: ",join(',',@_),"\n"; print STDERR "ARG: $arg ($i) is not a opcode.\n"; exit(1); } $i++; if ($arg == $ABS) { $origin=$baseurl || $_[$i++]; $i++; # Want it to be a URL object $origin=url $origin unless ref $origin; print STDERR 'ABS: ',$origin->as_string,"\n" if $debug; next unless defined($urls{$tagname}); foreach $attr (@{$urls{$tagname}}) { if (defined($attrval{$attr})) { # Ugly: Remove leading /../ sequences in path component $url_o=url($attrval{$attr})->abs($origin,1); if (defined($path=$url_o->path)) { # mailto: URLs does not have path components $path =~ s~/\.\.(?=/)~~g; $url_o->path($path); } $attrval{$attr}=$url_o->as_string; } } } elsif ($arg == $REL) { $origin=$_[$i++]; # Want it to be a string $origin=$origin->as_string if ref $origin; print STDERR 'REL: ',$origin,"\n" if $debug; next unless defined($urls{$tagname}); foreach $attr (@{$urls{$tagname}}) { $attrval{$attr} =~ s/^$origin// if defined($attrval{$attr}); } } elsif ($arg == $URLSUB) { $RE=$_[$i++]; $subst=$_[$i++]; warn "URLSUB: $RE -> $subst\n" if $debug; next unless defined($urls{$tagname}); foreach $attr (@{$urls{$tagname}}) { $attrval{$attr} =~ s/$RE/$subst/ if defined($attrval{$attr}); } } elsif ($arg == $LIST) { warn "LIST;\n" if $debug; next unless exists($urls{$tagname}); foreach $attr (@{$urls{$tagname}}) { if (exists($attrval{$attr})) { $attrval{$attr}.='/' if $isdir{$attr} && $attrval{$attr} =~ m~[^/]$~; push(@urllist,$attrval{$attr}); } } } elsif ($arg == $SAVEURL) { warn "SAVEURL;\n" if $debug; $prefix=$_[$i++]; next unless exists($urls{$tagname}); foreach $attr (@{$urls{$tagname}}) { $attrval{"$prefix$attr"}=$attrval{$attr} if defined($attrval{$attr}); } } elsif ($arg == $USESAVED) { warn "USESAVED;\n" if $debug; $prefix=$_[$i++]; next unless exists($urls{$tagname}); foreach $attr (@{$urls{$tagname}}) { $attrval{$attr}=$attrval{"$prefix$attr"} if (defined($attrval{"$prefix$attr"})); # bug compatability, drop sometime after w3mir 1.0 $attrval{$attr}=$attrval{"$prefix-$attr"} if (defined($attrval{"$prefix-$attr"})); } } elsif ($arg == $NODOC) { warn "NODOC;\n" if $debug; } elsif ($arg == $CANON) { warn "CANON;\n" if $debug; } elsif ($arg == $URLPROC || $arg == $NREL) { # Apply a function to all urls. # NREL = Special case of $URLPROC, apply internal function. if ($arg == $URLPROC) { warn "URLPROC;\n" if $debug; $fun=$_[$i++]; $arg=$_[$i++]; } else { $fun=\&smartrel; $arg=undef; $url_origin=$_[$i++]; $doc_top=$_[$i++]; $doc_top_re=quotemeta $doc_top; ($choped_url_or=$url_origin) =~ s/^$doc_top_re//; warn "NREL $url_origin $doc_top\n" if $debug; } next unless (defined($urls{$tagname})); foreach $attr (@{$urls{$tagname}}) { $attrval{$attr}=&$fun($attrval{$attr},$arg) if defined($attrval{$attr}); } } elsif ($arg == $TAGCALLBACK) { $fun=$_[$i++]; $arg=$_[$i++]; warn "TAGCALLBACK($tagname);\n" if $debug; &$fun($arg,$baseurl,$tagname, (defined($urls{$tagname})?($urls{$tagname}):undef), \%attrval); } else { print STDERR "Internal error. opcode: $arg, i: $i, args: ", join(',',@_),"\n"; } } last; } # That ends URL processing # Was this a verbatim leadin tag? If yes, atempt to fish out text # between here and the end tag, (minimal match) and substitute it # with nothing. The end tag is kept. And the fished out text is # re-inserted in the result with no changes. $verbatim=$1 if defined($verbatim{$tagname}) && ($doc =~ s/^(.*?)($verbatim{$tagname})/$2/is); # Tack on the tag, if wanted. if ($retdoc) { if ($tagname) { $newdoc.='<'.$tagname; foreach $attr (keys %attrval) { $newdoc.=' '.$attr; if (defined($cont=$attrval{$attr})) { $Q='"'; $Q="'" if $cont =~ m/\"/; $newdoc.='='.$Q.$cont.$Q; } } $newdoc.='>'.$verbatim; $verbatim=''; } } print STDERR "NEW: $newdoc\n" if $debug > 2; } $newdoc.="\n" if ($canon && !$endhtml); return ($newdoc,@urllist); } sub smartrel { # 'smart' relativisation function, uses .. to refer to things up to # $doc_top level. Outside that scope it stays absolute. # # The rel function itself is now in libwww-perl my($url_o)=@_; return if (!(defined($url_o) && $url_o)); $url_o=url $url_o unless ref $url_o; # print STDERR "\nsmartrel: $url_o->as_string\n", # " from: $url_origin\n", # " within: $doc_top\n"; # Check if within scope of our doc $url_o = $url_o->rel($url_origin) if $url_o->as_string =~ m/^$doc_top_re/; return $url_o->as_string; } sub smart_setup { # Setup routine for smartrel. ($url_origin,$doc_top) = @_; $url_origin=$url_origin->as_string if ref $url_origin; $doc_top=$doc_top->as_string if ref $doc_top; $doc_top_re=quotemeta $doc_top; ($choped_url_or=$url_origin) =~ s/^$doc_top_re//; } 1;