1 | # -*-perl-*-
|
---|
2 | # htmlop.pl: Do operations on html documents.
|
---|
3 | $VERSION=0.2.4;
|
---|
4 | #
|
---|
5 | # Original source from Bjørn Borud, without it I would not have atempted
|
---|
6 | # this. In this incarnation it bears no resemblance to Bjørns code.
|
---|
7 | # - Nicolai Langfeldt 18/11/95.
|
---|
8 | #
|
---|
9 | # htmlop.pl does operations on html files, possebly many at a time.
|
---|
10 | # Operations:
|
---|
11 | # - Absolitify urls
|
---|
12 | # - Relativify urls
|
---|
13 | # - Gather list of urls
|
---|
14 | # - Callback with url
|
---|
15 | # - Canonify document w.r.t. SGML.
|
---|
16 | #
|
---|
17 | # Authors:
|
---|
18 | # - Nicolai Langfeldt ([email protected])
|
---|
19 | # - Chris Szurgot ([email protected])
|
---|
20 | #
|
---|
21 | # Changes:
|
---|
22 | # janl 18/11/95 - Initial version
|
---|
23 | # szurgot 09/02/96 - Code in htmlop'process to remove <BASE> Tag if we are
|
---|
24 | # returning the form. An unchanged base destroys local
|
---|
25 | # fragments.
|
---|
26 | # janl 22/02/96 - Added URLSUB functions. <BASE> will only be removed from
|
---|
27 | # the returned doc, never from the origninal doc.
|
---|
28 | # janl 16/05/96 - Removed I* options and added single NODOC option/modifier
|
---|
29 | # to replace them.
|
---|
30 | # janl 09/09/96 - Added URLPROC and NREL opcodes, for a better URL
|
---|
31 | # processing model. -> 0.1.7
|
---|
32 | # janl 24/09/96 - Various cosmetics, no longer inserting a !SGML tag,
|
---|
33 | # only !DOCTYPE and HTML tags. -> 0.1.8
|
---|
34 | # janl 11/10/96 - URLs in HTML 3.2 tags are now found.
|
---|
35 | # janl 6/11/96 - Added netscape SCRIPT tag.
|
---|
36 | # janl 20/11/96 - 'BORDER' is different from 'BORDER=0' -> 0.1.10
|
---|
37 | # janl 06/11/96 - Added userdata argument to URLPROC.
|
---|
38 | # Added SAVEURLS and USESAVED opcodes. -> 0.1.11
|
---|
39 | # janl 11/04/97 - Changed to URI::URL and got into strict harness
|
---|
40 | # janl 13/04/97 - Fixed comment processing, extended it to processing
|
---|
41 | # directives. Bug reported by Chris Johnson. -> 0.1.12
|
---|
42 | # janl 08/04/97 - Made ISMAP into ISMAP=, bad! -> 0.1.13
|
---|
43 | # janl 23/05/97 - Treating <script> and <style> as verbatim -> 0.1.14
|
---|
44 | # janl 06/06/97 - Canonify: <!SGML tag confused it and !DOCTYPE was mangled
|
---|
45 | # -> 0.1.15
|
---|
46 | # janl 30/07/97 - Now supporting <BASE> tag.
|
---|
47 | # - Added TAGCALLBACK opcode -> 0.1.16
|
---|
48 | # janl 16/10/97 - Empty string must be quoted, reported by Bart Barenburg
|
---|
49 | # -> 0.1.17
|
---|
50 | # janl 01/12/97 - More HTML URL tags/attributes recognized (Greg Lindhorst)
|
---|
51 | # janl 13/12/97 - Can't delete the whole <BASE> tag. Netscape has
|
---|
52 | # extended it for frame use. So only delete the HREF
|
---|
53 | # attr -> 0.1.18
|
---|
54 | # janl 01/01/98 - Realized applet/object support can't work and changed
|
---|
55 | # so it's able to work, if conditions are right.
|
---|
56 | # janl 04/01/98 - Added html 4.0 tags and attributes.
|
---|
57 | # janl 08/02/98 - Hacked for speed. Went from 43s to 9s on a 170K
|
---|
58 | # document -> 0.2. Thanks to Rune Frøysa who taunted me.
|
---|
59 | # janl 09/04/98 - More tolerant about what constitutes a newline -> 0.2.1
|
---|
60 | # janl 09/05/98 - Export %isdir -> 0.2.2
|
---|
61 | # janl 13/04/99 - Remove leading /../ sequences in path component in
|
---|
62 | # ABS code. -> 0.2.3
|
---|
63 | # janl 28/05/99 - The code was buggy, now it's not. -> 0.2.4
|
---|
64 |
|
---|
65 | package htmlop;
|
---|
66 |
|
---|
67 | use URI::URL;
|
---|
68 |
|
---|
69 | use strict;
|
---|
70 | # Global variables
|
---|
71 | use vars qw($ABS $REL $LIST $CANON $URLSUB $NODOC $URLPROC $NREL);
|
---|
72 | use vars qw($SAVEURL $USESAVED $TAGCALLBACK $debug %isdir);
|
---|
73 |
|
---|
74 | # These are for the smartrel routines
|
---|
75 | my $url_origin;
|
---|
76 | my $doc_top;
|
---|
77 | my $doc_top_re;
|
---|
78 | my $choped_url_or;
|
---|
79 |
|
---|
80 | # HTML operation codes for process_html. The first argument is the
|
---|
81 | # html document to do operations on. It will not be changed. If
|
---|
82 | # NODOC is not passed a new document edited as specified will be
|
---|
83 | # returned. No more than one document will be returned by one
|
---|
84 | # invocation of htmlop.
|
---|
85 |
|
---|
86 | $ABS = 1; # Absolutify urls. Arg: Origin. The
|
---|
87 | # ABS function absolutifies URLS,
|
---|
88 | # assuming they are relative to the
|
---|
89 | # argument.
|
---|
90 | # If a <BASE> tag is found the origin
|
---|
91 | # given by it will be used instead of
|
---|
92 | # the arguemnt given.
|
---|
93 |
|
---|
94 | $REL = 2; # Relativize all urls. Arg: Origin.
|
---|
95 | # The REL function simply removes the
|
---|
96 | # argument string from any urls
|
---|
97 | # matching it. For this reason the
|
---|
98 | # origin string is interprated as a
|
---|
99 | # RE, which may have unexpected
|
---|
100 | # results, unless RE specials are
|
---|
101 | # escaped, which they should be.
|
---|
102 |
|
---|
103 | $LIST = 3; # List all urls, Ret: urls
|
---|
104 |
|
---|
105 | $CANON = 4; # See that missing opening (and
|
---|
106 | # ending) SGML tags are injected.
|
---|
107 |
|
---|
108 | $URLSUB = 5; # Do regular expression substitution
|
---|
109 | # on urls. Arg: RE, substitute.
|
---|
110 |
|
---|
111 | $NODOC = 6; # Do not return a rebuilt document.
|
---|
112 | # Saves memory, and time
|
---|
113 |
|
---|
114 | # NOTE: I SUSPECT URLPROC IS BROKEN IF A BASE TAG APPEARS IN THE TEXT.
|
---|
115 | # THE URL PROCESSOR NEEDS TO BE PASSED htmlop::process' IDEA OF WHAT THE
|
---|
116 | # BASE URL IS.
|
---|
117 |
|
---|
118 | $URLPROC = 7; # Apply function on urls (process
|
---|
119 | # urls). Arg: Pointer to function to
|
---|
120 | # apply, userdata. The function will
|
---|
121 | # be passed the url, modified by any
|
---|
122 | # previous operations and the
|
---|
123 | # userdata. The function must return
|
---|
124 | # the new url.
|
---|
125 |
|
---|
126 | $NREL = 8; # New relativisation function, works
|
---|
127 | # much better than the old one. Arg:
|
---|
128 | # Origin, Top.
|
---|
129 |
|
---|
130 | $SAVEURL = 9; # Save urls in tag with modified name.
|
---|
131 | # Arg: attribute prefix.
|
---|
132 | # Example: <a href=foo> becomes
|
---|
133 | # <a href=foo w3mir-href=foo> if no other
|
---|
134 | # processing of the url is done.
|
---|
135 |
|
---|
136 | $USESAVED = 10; # Use saved urls. Arg: attribute prefix
|
---|
137 |
|
---|
138 | $TAGCALLBACK = 11; # Procedure to call for each Tag.
|
---|
139 | # Args: procedure, userdata (one item)
|
---|
140 |
|
---|
141 | # Args to procedure: userdata, Base
|
---|
142 | # URL, tag name, reference to array of
|
---|
143 | # URL attributes, reference to hash of
|
---|
144 | # all attributes. The base url is
|
---|
145 | # derived from the one used in ABS or
|
---|
146 | # the BASE tag.
|
---|
147 |
|
---|
148 | $debug=0; # Debugging level in this package
|
---|
149 |
|
---|
150 | # process_html returns a array. The first component of the array is
|
---|
151 | # the new html document. The rest of the array is the urls. If a
|
---|
152 | # document is not to be returned a empty string is returned. If a url
|
---|
153 | # list is not to be returned a empty array is returned.
|
---|
154 |
|
---|
155 | # HERE BE DRAGONS:
|
---|
156 |
|
---|
157 | # Where to find URLs in various tags. The second compoent is a array
|
---|
158 | # reference.
|
---|
159 |
|
---|
160 | my(%urls) = (
|
---|
161 | HEAD => [ 'PROFILE' ],
|
---|
162 | BLOCKQUOTE => [ 'CITE' ],
|
---|
163 | Q => [ 'CITE' ],
|
---|
164 | INS => [ 'CITE' ],
|
---|
165 | DEL => [ 'CITE' ],
|
---|
166 | A => [ 'HREF' ] ,
|
---|
167 | IMG => [ 'SRC' ,'LOWSRC' ,'USEMAP', 'LONGDESC' ] ,
|
---|
168 | EMBED => [ 'SRC' ],
|
---|
169 | FRAME => [ 'SRC', 'LONGDESC' ],
|
---|
170 | IFRAME => [ 'SRC', 'LONGDESC' ],
|
---|
171 | BODY => [ 'BACKGROUND' ],
|
---|
172 | AREA => [ 'HREF' ],
|
---|
173 | LINK => [ 'HREF' ],
|
---|
174 |
|
---|
175 | # The APPLET and OBJECT tags do not fit into my model for URL
|
---|
176 | # manipulation. Just looking at CODEBASE might work, if the
|
---|
177 | # URL it names is a browseable directory...
|
---|
178 | APPLET => [ 'CODEBASE' ], # If the codebase dir is browseable
|
---|
179 | OBJECT => [ 'CODEBASE' ], # Ditto. Can't handle DATA attribute now
|
---|
180 |
|
---|
181 | INPUT => [ 'SRC', 'USEMAP' ],
|
---|
182 | MAP => [ 'HREF' ],
|
---|
183 | SCRIPT => [ 'SRC', 'FOR' ],# 'FOR's semantics is not defined, the
|
---|
184 | # attribute is just reserved for possible
|
---|
185 | # future use...
|
---|
186 | BGSOUND => [ 'SRC' ],
|
---|
187 | FORM => [ 'ACTION' ], # Is this asking for trouble?
|
---|
188 | # Maybe it should just be absolutized...
|
---|
189 | # On the other hand: It's CGI...
|
---|
190 | );
|
---|
191 |
|
---|
192 | my(%relative) = (
|
---|
193 | # Identify URL attributes containing urls that are relative to
|
---|
194 | # the named URL attribute. When processing these they should
|
---|
195 | # be absolitized and then relativized relative to the BASE attribute.
|
---|
196 | # This is just window dressing for now; it is not used for anything.
|
---|
197 |
|
---|
198 | # ARCHIVE is really a URI _list_.
|
---|
199 | CODEBASE => [ 'CLASSID', 'DATA', 'CODE', 'ARCHIVE' ],
|
---|
200 | );
|
---|
201 |
|
---|
202 | %isdir = (
|
---|
203 | # These tags refer to directories:
|
---|
204 | CODEBASE => 1
|
---|
205 | );
|
---|
206 |
|
---|
207 | # Tags that enclose bits we want to leave absolutely alone because they
|
---|
208 | # are not very like HTML, or some such.
|
---|
209 |
|
---|
210 | # The material between the start and end tags is copied with no
|
---|
211 | # processing at all. The end tag is left to be processed.
|
---|
212 | # The endtag match is case insensitive.
|
---|
213 | my(%verbatim) = (
|
---|
214 | SCRIPT => quotemeta('</SCRIPT>'), # Embeded scripts
|
---|
215 | STYLE => quotemeta('</STYLE>'), # Embeded stylesheet
|
---|
216 | );
|
---|
217 |
|
---|
218 | # These are the functions that pick the HTML to pieces. It will not
|
---|
219 | # work esp. good on a random SGML document since the HTML application
|
---|
220 | # of SGML has simpler quoting than it might.
|
---|
221 |
|
---|
222 | sub gettoken {
|
---|
223 | # Get one token from the argument, removing it from the argument.
|
---|
224 | # BUG: There should be whitespace at the end of the examined string.
|
---|
225 | my($c,$token,$i);
|
---|
226 |
|
---|
227 | # Skip whitespace and newlines
|
---|
228 | return '' unless defined(@_) && defined($_[0]);
|
---|
229 | $_[0] =~ s/^[\r\n\s]*//;
|
---|
230 |
|
---|
231 | return '' if ($_[0] eq '');
|
---|
232 |
|
---|
233 | $c = substr($_[0],0,1);
|
---|
234 | substr($_[0],0,1)='';
|
---|
235 |
|
---|
236 | if ($c eq '"' || $c eq "\'") { # Quoted material
|
---|
237 | $i=index($_[0],$c);
|
---|
238 | # End-quote missing, just gobble the rest of the doc
|
---|
239 | $i=length($_[0]) if $i == -1;
|
---|
240 | # Extract and remove token
|
---|
241 | $token=substr($_[0],0,$i);
|
---|
242 | substr($_[0],0,$i+1)='';
|
---|
243 | } elsif ($c eq '=') {
|
---|
244 | $token='=';
|
---|
245 | } else { # Non-quoted material, ends in whitespace or =
|
---|
246 | $_[0] =~ m/[=\s\n\r]/;
|
---|
247 | $_[0] = $&.$';
|
---|
248 | $token=$c.$`;
|
---|
249 | }
|
---|
250 | # print "Token: '$token'\t\tRest: '",$_[0],"'\n";
|
---|
251 | return $token;
|
---|
252 | }
|
---|
253 |
|
---|
254 |
|
---|
255 | sub tagtoken {
|
---|
256 | # Pick the tag to pieces (also knonw as tokens). Return an
|
---|
257 | # associative array of attributes. The attribute-names are changed
|
---|
258 | # to uppercase. The attribute-values are left as is.
|
---|
259 | my($tok,$lasttok,%tokens);
|
---|
260 | # Append a space, gettoken needs it - silly? Yes!
|
---|
261 | # Change it to test on boundrary things rather than ...?
|
---|
262 | $_[0].=' ';
|
---|
263 |
|
---|
264 | $lasttok='';
|
---|
265 | while (1) {
|
---|
266 | last if (($tok=uc &gettoken($_[0])) eq '');
|
---|
267 |
|
---|
268 | if ($tok eq '=') {
|
---|
269 | # print STDERR " -bad html-" if ($lasttok eq '');
|
---|
270 | $tokens{$lasttok}.=&gettoken($_[0]);
|
---|
271 | print STDERR "STORED: $lasttok = ",$tokens{$lasttok},"\n" if $debug;
|
---|
272 | } else {
|
---|
273 | $tokens{$tok}=undef;
|
---|
274 | $lasttok=$tok;
|
---|
275 | }
|
---|
276 | }
|
---|
277 | return %tokens;
|
---|
278 | }
|
---|
279 |
|
---|
280 |
|
---|
281 | sub gettag {
|
---|
282 | # Pick out the following things from the remaining html doc:
|
---|
283 | # Everything leading up to the first tag. The first tag, and its
|
---|
284 | # contents. Modify @_ directly to reduce number of copies of
|
---|
285 | # possebly huge documents kept in memory at once. Return the body,
|
---|
286 | # the tag name, and the attributes (associative array)
|
---|
287 | my(%attr,$tagn,$tagc,$body,$tag,$doc);
|
---|
288 |
|
---|
289 | $doc=\$_[0];
|
---|
290 |
|
---|
291 | my($start,$end,$length);
|
---|
292 |
|
---|
293 | $start=index($$doc,'<');
|
---|
294 |
|
---|
295 | if ($start<$[) {
|
---|
296 | # EOF
|
---|
297 | $body=$$doc;
|
---|
298 | $$doc='';
|
---|
299 | return ($body,'',());
|
---|
300 | }
|
---|
301 |
|
---|
302 | $end=index($$doc,'>',$start+1);
|
---|
303 |
|
---|
304 | if ($end<$[) {
|
---|
305 | # This sucks, found no end of the tag...
|
---|
306 | $body=$$doc;
|
---|
307 | $$doc='';
|
---|
308 | return ($body,'',());
|
---|
309 | }
|
---|
310 |
|
---|
311 | $length=$end-$start-1;
|
---|
312 |
|
---|
313 | $body=substr($$doc,0,$start);
|
---|
314 | $tag=substr($$doc,$start+1,$length);
|
---|
315 |
|
---|
316 | # This shortens the string in each itteration, some kind of mechanism
|
---|
317 | # to do it once in a while would speed things up further. HOWEVER, when
|
---|
318 | # I tried to code this all I got was a _nasty_ memory leak.
|
---|
319 | substr($$doc,0,$end+1)='';
|
---|
320 |
|
---|
321 | # print STDERR "------\n";
|
---|
322 |
|
---|
323 | # print STDERR "BODY: /$body/\n";
|
---|
324 | print STDERR "COMPLETE TAG: /$tag/\n" if $debug;
|
---|
325 | # print STDERR "REST: /",substr($$doc,0,20),"/\n";
|
---|
326 |
|
---|
327 | # print STDERR "------\n";
|
---|
328 |
|
---|
329 | # Examine tag contents
|
---|
330 | if ($tag =~ /^([!?]--)/ || $tag =~ /^(!\w+)/) {
|
---|
331 | # Comment or processing dicective, handle specially
|
---|
332 | $tagn=$&;
|
---|
333 | $tagc=$';
|
---|
334 | return ($body,$tagn,("$tagc",undef));
|
---|
335 | }
|
---|
336 |
|
---|
337 | # Everything else
|
---|
338 | ($tagn,$tagc) = split(/[\s\n\r]+/,$tag,2);
|
---|
339 | $tagn="\U$tagn";
|
---|
340 |
|
---|
341 | return ($body,$tagn,()) if !defined($tagc);
|
---|
342 |
|
---|
343 | return ($body,$tagn,&tagtoken($tagc));
|
---|
344 | }
|
---|
345 |
|
---|
346 |
|
---|
347 | # This is meant for general consumption:
|
---|
348 |
|
---|
349 | sub process {
|
---|
350 | # Process a html file. Into one end you put a html file. Out
|
---|
351 | # of the other end you get something dependent on the operations
|
---|
352 | # you specified.
|
---|
353 | # I cannot gobble my arguments. I need to examine them several times.
|
---|
354 |
|
---|
355 | my($arg,$doc,$i,$retdoc,$canon,$newdoc,@urllist,$Q,$cont);
|
---|
356 | my($origin,$baseurl);
|
---|
357 |
|
---|
358 | $origin=$baseurl='';
|
---|
359 |
|
---|
360 | $retdoc=$canon=0;
|
---|
361 |
|
---|
362 | # Get the document from the argument list
|
---|
363 | $doc=shift(@_);
|
---|
364 |
|
---|
365 | $retdoc=1;
|
---|
366 | $i=0;
|
---|
367 | # Argument checking
|
---|
368 | while (defined($arg=$_[$i++])) {
|
---|
369 | if (! ($arg =~ /^\d+$/)) {
|
---|
370 | print STDERR "ERROR IN HTMLOP::process:\n";
|
---|
371 | print STDERR "Args: ",join(',',@_),"\n";
|
---|
372 | print STDERR "ARG: $arg is not a opcode.\n";
|
---|
373 | exit(1);
|
---|
374 | }
|
---|
375 | if ($arg == $ABS) {
|
---|
376 | $baseurl=$origin=$_[$i++];
|
---|
377 | } elsif ($arg == $REL || $arg == $SAVEURL || $arg == $USESAVED) {
|
---|
378 | $i++; # Skip one arg
|
---|
379 | } elsif ($arg == $URLSUB || $arg == $NREL || $arg == $URLPROC ||
|
---|
380 | $arg == $TAGCALLBACK ) {
|
---|
381 | $i += 2; # Skip two args
|
---|
382 | } elsif ($arg == $LIST) {
|
---|
383 | # do nothing
|
---|
384 | } elsif ($arg == $CANON) {
|
---|
385 | $canon=1;
|
---|
386 | } elsif ($arg == $NODOC) {
|
---|
387 | $retdoc=0;
|
---|
388 | } else {
|
---|
389 | die "htmlop: Incorrect invocation of html_process\n";
|
---|
390 | }
|
---|
391 | }
|
---|
392 |
|
---|
393 | my($endhtml)=0; # Have we seen </html>?
|
---|
394 | my($SGML)=1; # 1: !DOCTYPE not seen, 2: !DOCTYPE seen, -1: it's OK
|
---|
395 |
|
---|
396 | # These are used to store the tag components
|
---|
397 | my($textpart);
|
---|
398 | my($tagname);
|
---|
399 | my(%attrval);
|
---|
400 |
|
---|
401 | my($wholetag); # The whole tag, put together again
|
---|
402 | my($moretag); # TMP storage of tag attributes
|
---|
403 | my($attr); # Looping thru attributes, misnomer.
|
---|
404 | my($RE);
|
---|
405 | my($subst);
|
---|
406 | my($prefix);
|
---|
407 | my($fun); # Function to apply to url
|
---|
408 | my($verbatim)='';
|
---|
409 | my($url_o,$path); # URL object Used in ABS to remove leading ..
|
---|
410 |
|
---|
411 | # Welcome to the machine
|
---|
412 |
|
---|
413 | $newdoc='';
|
---|
414 | while ($doc ne '') {
|
---|
415 | ($textpart,$tagname,%attrval)=&gettag($doc);
|
---|
416 |
|
---|
417 | # Process the tag
|
---|
418 | { # Need a way to get out of this, last is my friend
|
---|
419 | # 'Canonize'
|
---|
420 | if ($canon) {
|
---|
421 | $endhtml=1 if ($tagname eq '/HTML');
|
---|
422 | if ($SGML!=-1 && $canon && $tagname ne '!--' && $tagname ne '!SGML') {
|
---|
423 | $wholetag=$tagname;
|
---|
424 | $moretag=join('',keys %attrval);
|
---|
425 | $wholetag.=' '.$moretag if ($moretag ne '');
|
---|
426 | if ($SGML==1) {
|
---|
427 | $SGML=2;
|
---|
428 | if ($tagname eq '!DOCTYPE') {
|
---|
429 | $tagname=$wholetag;
|
---|
430 | %attrval=();
|
---|
431 | last;
|
---|
432 | }
|
---|
433 |
|
---|
434 | # We have no idea what DTD this doc follows so we put in
|
---|
435 | # something kinda non-comitting
|
---|
436 | $newdoc.='<!DOCTYPE HTML PUBLIC "html.dtd">'."\n";
|
---|
437 | }
|
---|
438 | if ($SGML==2) {
|
---|
439 | $SGML=-1;
|
---|
440 | if ($tagname eq 'HTML') {
|
---|
441 | $tagname=$wholetag;
|
---|
442 | %attrval=();
|
---|
443 | last;
|
---|
444 | }
|
---|
445 | $newdoc.='<HTML>'."\n";
|
---|
446 | }
|
---|
447 | } # sgml!=-1
|
---|
448 | } #canon
|
---|
449 |
|
---|
450 | # Tack on text part before I bail out, if wanted
|
---|
451 | $newdoc.=$textpart if $retdoc;
|
---|
452 |
|
---|
453 | if ($tagname eq 'BASE') {
|
---|
454 | if (exists($attrval{'HREF'})) {
|
---|
455 | if ($origin) {
|
---|
456 | $baseurl=(url($attrval{'HREF'})->abs($origin,1))->as_string;
|
---|
457 | } else {
|
---|
458 | $baseurl=$attrval{'HREF'};
|
---|
459 | }
|
---|
460 | # Get rid of the HREF attribute. Netscape 4.0 puts (other) stuff
|
---|
461 | # into BASE that is not found in the HTML 4.0 spec.
|
---|
462 | delete $attrval{'HREF'};
|
---|
463 | print STDERR "\nBase tag: $baseurl\n" if $debug;
|
---|
464 | }
|
---|
465 | }
|
---|
466 |
|
---|
467 | # URL processing
|
---|
468 | $i=0;
|
---|
469 | while (defined($arg=$_[$i])) {
|
---|
470 | if (! ($arg =~ /^\d+$/)) {
|
---|
471 | print STDERR "Args: ",join(',',@_),"\n";
|
---|
472 | print STDERR "ARG: $arg ($i) is not a opcode.\n";
|
---|
473 | exit(1);
|
---|
474 | }
|
---|
475 | $i++;
|
---|
476 | if ($arg == $ABS) {
|
---|
477 | $origin=$baseurl || $_[$i++];
|
---|
478 | $i++;
|
---|
479 | # Want it to be a URL object
|
---|
480 | $origin=url $origin unless ref $origin;
|
---|
481 | print STDERR 'ABS: ',$origin->as_string,"\n" if $debug;
|
---|
482 | next unless defined($urls{$tagname});
|
---|
483 | foreach $attr (@{$urls{$tagname}}) {
|
---|
484 | if (defined($attrval{$attr})) {
|
---|
485 | # Ugly: Remove leading /../ sequences in path component
|
---|
486 | $url_o=url($attrval{$attr})->abs($origin,1);
|
---|
487 | if (defined($path=$url_o->path)) {
|
---|
488 | # mailto: URLs does not have path components
|
---|
489 | $path =~ s~/\.\.(?=/)~~g;
|
---|
490 | $url_o->path($path);
|
---|
491 | }
|
---|
492 | $attrval{$attr}=$url_o->as_string;
|
---|
493 | }
|
---|
494 | }
|
---|
495 | } elsif ($arg == $REL) {
|
---|
496 | $origin=$_[$i++];
|
---|
497 | # Want it to be a string
|
---|
498 | $origin=$origin->as_string if ref $origin;
|
---|
499 | print STDERR 'REL: ',$origin,"\n" if $debug;
|
---|
500 | next unless defined($urls{$tagname});
|
---|
501 | foreach $attr (@{$urls{$tagname}}) {
|
---|
502 | $attrval{$attr} =~ s/^$origin//
|
---|
503 | if defined($attrval{$attr});
|
---|
504 | }
|
---|
505 | } elsif ($arg == $URLSUB) {
|
---|
506 | $RE=$_[$i++];
|
---|
507 | $subst=$_[$i++];
|
---|
508 | warn "URLSUB: $RE -> $subst\n" if $debug;
|
---|
509 | next unless defined($urls{$tagname});
|
---|
510 | foreach $attr (@{$urls{$tagname}}) {
|
---|
511 | $attrval{$attr} =~ s/$RE/$subst/
|
---|
512 | if defined($attrval{$attr});
|
---|
513 | }
|
---|
514 | } elsif ($arg == $LIST) {
|
---|
515 | warn "LIST;\n" if $debug;
|
---|
516 | next unless exists($urls{$tagname});
|
---|
517 | foreach $attr (@{$urls{$tagname}}) {
|
---|
518 | if (exists($attrval{$attr})) {
|
---|
519 | $attrval{$attr}.='/'
|
---|
520 | if $isdir{$attr} && $attrval{$attr} =~ m~[^/]$~;
|
---|
521 | push(@urllist,$attrval{$attr});
|
---|
522 | }
|
---|
523 | }
|
---|
524 | } elsif ($arg == $SAVEURL) {
|
---|
525 | warn "SAVEURL;\n" if $debug;
|
---|
526 | $prefix=$_[$i++];
|
---|
527 | next unless exists($urls{$tagname});
|
---|
528 | foreach $attr (@{$urls{$tagname}}) {
|
---|
529 | $attrval{"$prefix$attr"}=$attrval{$attr}
|
---|
530 | if defined($attrval{$attr});
|
---|
531 | }
|
---|
532 | } elsif ($arg == $USESAVED) {
|
---|
533 | warn "USESAVED;\n" if $debug;
|
---|
534 | $prefix=$_[$i++];
|
---|
535 | next unless exists($urls{$tagname});
|
---|
536 | foreach $attr (@{$urls{$tagname}}) {
|
---|
537 | $attrval{$attr}=$attrval{"$prefix$attr"}
|
---|
538 | if (defined($attrval{"$prefix$attr"}));
|
---|
539 | # bug compatability, drop sometime after w3mir 1.0
|
---|
540 | $attrval{$attr}=$attrval{"$prefix-$attr"}
|
---|
541 | if (defined($attrval{"$prefix-$attr"}));
|
---|
542 | }
|
---|
543 | } elsif ($arg == $NODOC) {
|
---|
544 | warn "NODOC;\n" if $debug;
|
---|
545 | } elsif ($arg == $CANON) {
|
---|
546 | warn "CANON;\n" if $debug;
|
---|
547 | } elsif ($arg == $URLPROC || $arg == $NREL) {
|
---|
548 | # Apply a function to all urls.
|
---|
549 | # NREL = Special case of $URLPROC, apply internal function.
|
---|
550 | if ($arg == $URLPROC) {
|
---|
551 | warn "URLPROC;\n" if $debug;
|
---|
552 | $fun=$_[$i++];
|
---|
553 | $arg=$_[$i++];
|
---|
554 | } else {
|
---|
555 | $fun=\&smartrel;
|
---|
556 | $arg=undef;
|
---|
557 | $url_origin=$_[$i++];
|
---|
558 | $doc_top=$_[$i++];
|
---|
559 | $doc_top_re=quotemeta $doc_top;
|
---|
560 | ($choped_url_or=$url_origin) =~ s/^$doc_top_re//;
|
---|
561 | warn "NREL $url_origin $doc_top\n" if $debug;
|
---|
562 | }
|
---|
563 |
|
---|
564 | next unless (defined($urls{$tagname}));
|
---|
565 |
|
---|
566 | foreach $attr (@{$urls{$tagname}}) {
|
---|
567 | $attrval{$attr}=&$fun($attrval{$attr},$arg)
|
---|
568 | if defined($attrval{$attr});
|
---|
569 | }
|
---|
570 | } elsif ($arg == $TAGCALLBACK) {
|
---|
571 | $fun=$_[$i++];
|
---|
572 | $arg=$_[$i++];
|
---|
573 | warn "TAGCALLBACK($tagname);\n" if $debug;
|
---|
574 | &$fun($arg,$baseurl,$tagname,
|
---|
575 | (defined($urls{$tagname})?($urls{$tagname}):undef),
|
---|
576 | \%attrval);
|
---|
577 | } else {
|
---|
578 | print STDERR "Internal error. opcode: $arg, i: $i, args: ",
|
---|
579 | join(',',@_),"\n";
|
---|
580 | }
|
---|
581 | }
|
---|
582 | last;
|
---|
583 | }
|
---|
584 |
|
---|
585 | # That ends URL processing
|
---|
586 |
|
---|
587 | # Was this a verbatim leadin tag? If yes, atempt to fish out text
|
---|
588 | # between here and the end tag, (minimal match) and substitute it
|
---|
589 | # with nothing. The end tag is kept. And the fished out text is
|
---|
590 | # re-inserted in the result with no changes.
|
---|
591 |
|
---|
592 | $verbatim=$1
|
---|
593 | if defined($verbatim{$tagname}) &&
|
---|
594 | ($doc =~ s/^(.*?)($verbatim{$tagname})/$2/is);
|
---|
595 |
|
---|
596 | # Tack on the tag, if wanted.
|
---|
597 | if ($retdoc) {
|
---|
598 | if ($tagname) {
|
---|
599 | $newdoc.='<'.$tagname;
|
---|
600 | foreach $attr (keys %attrval) {
|
---|
601 | $newdoc.=' '.$attr;
|
---|
602 | if (defined($cont=$attrval{$attr})) {
|
---|
603 | $Q='"';
|
---|
604 | $Q="'" if $cont =~ m/\"/;
|
---|
605 | $newdoc.='='.$Q.$cont.$Q;
|
---|
606 | }
|
---|
607 | }
|
---|
608 | $newdoc.='>'.$verbatim;
|
---|
609 | $verbatim='';
|
---|
610 | }
|
---|
611 | }
|
---|
612 | print STDERR "NEW: $newdoc\n" if $debug > 2;
|
---|
613 | }
|
---|
614 | $newdoc.="</HTML>\n" if ($canon && !$endhtml);
|
---|
615 | return ($newdoc,@urllist);
|
---|
616 | }
|
---|
617 |
|
---|
618 |
|
---|
619 | sub smartrel {
|
---|
620 | # 'smart' relativisation function, uses .. to refer to things up to
|
---|
621 | # $doc_top level. Outside that scope it stays absolute.
|
---|
622 | #
|
---|
623 | # The rel function itself is now in libwww-perl
|
---|
624 |
|
---|
625 | my($url_o)=@_;
|
---|
626 |
|
---|
627 | return if (!(defined($url_o) && $url_o));
|
---|
628 |
|
---|
629 | $url_o=url $url_o unless ref $url_o;
|
---|
630 |
|
---|
631 | # print STDERR "\nsmartrel: $url_o->as_string\n",
|
---|
632 | # " from: $url_origin\n",
|
---|
633 | # " within: $doc_top\n";
|
---|
634 |
|
---|
635 | # Check if within scope of our doc
|
---|
636 | $url_o = $url_o->rel($url_origin)
|
---|
637 | if $url_o->as_string =~ m/^$doc_top_re/;
|
---|
638 |
|
---|
639 | return $url_o->as_string;
|
---|
640 | }
|
---|
641 |
|
---|
642 |
|
---|
643 | sub smart_setup {
|
---|
644 | # Setup routine for smartrel.
|
---|
645 |
|
---|
646 | ($url_origin,$doc_top) = @_;
|
---|
647 |
|
---|
648 | $url_origin=$url_origin->as_string if ref $url_origin;
|
---|
649 | $doc_top=$doc_top->as_string if ref $doc_top;
|
---|
650 |
|
---|
651 | $doc_top_re=quotemeta $doc_top;
|
---|
652 |
|
---|
653 | ($choped_url_or=$url_origin) =~ s/^$doc_top_re//;
|
---|
654 | }
|
---|
655 |
|
---|
656 | 1;
|
---|