source: gsdl/tags/gsdl-2_71-distribution/gsdl/packages/w3mir/w3mir-1.0.8/w3mfix.PL@ 14121

Last change on this file since 14121 was 719, checked in by davidb, 25 years ago

added w3mir package

  • Property svn:keywords set to Author Date Id Revision
File size: 27.2 KB
Line 
1# -*-perl-*-
2
3use Config;
4
5&read_makefile;
6$fullperl = resolve_make_var('FULLPERL') || $Config{'perlpath'};
7$islib = resolve_make_var('INSTALLSITELIB');
8
9$name = $0;
10$name =~ s~^.*/~~;
11$name =~ s~.PL$~~;
12
13open(OUT,"> $name") ||
14 die "Could open $name for writing: $!\n";
15
16print "writing $name\n";
17
18while (<DATA>) {
19 if (m~^\#!/.*/perl.*$~o) {
20 # This substitutes the path perl was installed at on this system
21 # _and_ removed any (-w) options.
22 print OUT "#!",$fullperl,$1,"\n";
23 next;
24 }
25 if (/^use lib/o) {
26 # This substitutes the actuall library install path
27 print OUT "use lib '$islib';\n";
28 next;
29 }
30 print OUT;
31}
32
33close(OUT);
34
35# Make it executable too, and writeable
36chmod 0755, $name;
37
38#### The library
39
40sub resolve_make_var ($) {
41
42 my($var) = shift @_;
43 my($val) = $make{$var};
44
45# print "Resolving: ",$var,"=",$val,"\n";
46
47 while ($val =~ s~\$\((\S+)\)~$make{$1}~g) {}
48# print "Resolved: $var: $make{$var} -> $val\n";
49 $val;
50}
51
52
53sub read_makefile {
54
55 open(MAKEFILE, 'Makefile') ||
56 die "Could not open Makefile for reading: $!\n";
57
58 while (<MAKEFILE>) {
59 chomp;
60 next unless m/^([A-Z]+)\s*=\s*(\S+)$/;
61 $make{$1}=$2;
62# print "Makevar: $1 = $2\n";
63 }
64
65 close(MAKEFILE)
66}
67
68__END__
69#!/local/bin/perl5 -w
70# Perl 5.002 or later. w3mir is mostly tested with perl 5.004
71#
72# You might want to change or comment out this:
73use lib '/hom/janl/lib/perl';
74#
75# Can perform the following fixes:
76# - Rewrite redirected to URLs
77# - change .../ into .../index.html (or .../Welcome.html)
78# - change external links to point to some helpfull .html file ---- NOT IMPLEMENTED
79# - change links to documents not retrived to point to some helpfull
80# .html file --- NOT IMPLEMENTED
81# - After adding a 'Also' directive we can edit the urls pointing to within
82# the new space of the retrival scope, making the pointers consistent.
83
84# Method:
85# 1 Gather list of all URLs to be rewritten:
86# - Redirects: Just read the .redirs file
87# - .../ into .../index.html: All can be found in the .referers file
88# Remember to rewrite missing / redirects too...
89# - external links: in the .referers file too
90# - non-retrived documents: What files should be here (according to
91# .referers file) but are not?
92# 2 Gather list of all documents needing editing
93# 3 Edit them
94#
95# 09/05/98 janl - Only rewrite .../ to .../$indexname if the attribute
96# does not refer to a directory. -> 0.6.2
97# 12/05/98 janl - Use ->local_path to determine filename -> 0.6.3
98# 22/05/98 janl - .../#foo was not being rewritten to
99# .../$indexname#foo -> 0.6.4
100#
101require 5.002;
102
103use vars qw($win32);
104
105# To figure out what kind of system this is
106BEGIN {
107 use Config;
108 $win32 = ( $Config{'osname'} eq 'MSWin32' );
109}
110
111use Carp;
112use htmlop;
113use URI::URL;
114use URI::Escape;
115
116use strict;
117
118my $VERSION;
119$VERSION='0.6.5';
120
121my $debug=0; # Not debugging
122my $verbose=0;
123
124my $indexname='index.html';
125my $chdirto=''; # Place to chdir to after reading
126 # config file
127
128my $infoloss=0; # 1 if any URL translations (which
129 # cause information loss) are in
130 # effect. If this is true we use the
131 # SAVEURL operation. What to get, and
132 # not. Text of user supplied
133 # fetch/ignore rules
134
135my $doindex=1; # append $indexname to /$ ?
136
137my $editthis=''; # Edit references matching this expression.
138
139my $files=0; # How many files have I edited?
140my $rc='';
141
142my $rule_text="# User defined fetch/ignore rules\n";
143# Code ref to the rule procedure
144my $rule_code;
145
146# Code to prefix and postfix the generated code. Prefix should make
147# $_ contain the url to match. Postfix should return 1, the default
148# is to get the url/file.
149my $rule_prefix='$rule_code = sub { local($_) = shift;'."\n";
150my $rule_postfix=' return 1; } ';
151
152# Scope tests generated by URL/Also directives in cfg. The scope code
153# is just like the rule code, but used for program generated
154# fetch/ignore rules related to multiscope retrival.
155my $scope_fetch="# Automatic fetch rules for multiscope retrival\n";
156my $scope_ignore="# Automatic ignore rules for multiscope retrival\n";
157my $scope_code;
158
159my $scope_prefix='$scope_code = sub { local($_) = shift;'."\n";
160my $scope_postfix=' return 0; } ';
161
162# Function to apply to urls, se rule comments.
163my $user_apply_code; # User specified apply code
164my $apply_code; # w3mirs apply code
165my $apply_prefix='$apply_code = sub { local($_) = @_;'."\n";
166my $apply_lc=' $_ = lc $_; ';
167my $apply_postfix=' return $_; } ';
168my @user_apply; # List of users apply rules.
169my @internal_apply; # List of w3mirs apply rules.
170
171
172my $iinline=''; # inline RE code to make RE caseinsensitive
173my $ipost=''; # RE postfix to make it caseinsensitive
174my $lc=0; # Convert urls/filenames to lowercase?
175my $abs=0; # Absolutify URLs?
176my $fixrc=''; # Name of w3mfix config file
177my $fixup=0; # Do things needed to run fixup
178my $r=0; # Recurse? no recursion = absolutify links
179my %rum_referers=(); # Array of referers, key: rum_url
180my %rum_redirected=(); # Array of redirected url: key: original url
181my %lf_edited=(); # Edited this file yet?
182my $list; # List url on STDOUT?
183
184my %stat=(); # stat($lf_url): 'd' for dir, 'f' for others
185
186# ######################### Libwww-perl addons:
187
188sub URI::URL::_generic::unix_path {
189 my $self = (shift)->clone;
190
191 $self->frag(undef);
192 $self->scheme("file");
193 $self->unix_path;
194}
195
196
197sub URI::URL::_generic::basename {
198 my $self = shift;
199 my @p = $self->path_components;
200 my $old = $p[-1];
201 if (@_) {
202 splice(@p, -1, 1, shift);
203 $self->path_components(@p)
204 }
205 $old;
206}
207
208
209sub URI::URL::_generic::path_components {
210 my $self = shift;
211 my $old = $self->{'path'};
212 $old = '' unless defined $old;
213 $old = "/$old" if $old !~ m|^/| && defined $self->{'netloc'};
214 if (@_) {
215 $self->_elem('path',
216 join("/", map { uri_escape($_,$URI::URL::reserved)
217 } @_));
218 }
219 map { uri_unescape($_) } split("/", $old, -1);
220}
221
222
223# ######################### Configuration/argument parsing
224
225sub parse_args {
226 my $f;
227 my $i;
228
229 $i=0;
230
231 while ($f=shift) {
232 $i++;
233 # This is a demonstration against Getopts::Long.
234 if ($f =~ s/^-+//) {
235 $verbose=-1,next if $f eq 'q'; # Quiet
236 $verbose=1,next if $f eq 'c'; # Chatty
237 die "w3mfix version $VERSION\n" if $f eq 'v'; # Version
238 die "rtfm\n" if ($f eq 'help' || $f eq 'h' || $f eq '?');
239
240 if ($f eq 'editref') {
241 die "Sorry, can only have one -editref pr. run\n"
242 if $editthis;
243 $editthis=quotemeta(shift);
244 next;
245 }
246
247 if ($f eq 'd') { # Debugging level
248 $f=shift;
249 unless (($debug = $f) > 0) {
250 die "w3mfix: debug level must be a number greater than zero.\n";
251 }
252 next;
253 }
254
255 # Those were all the options...
256 warn "w3mfix: Unknown option: -$f. Use -h for usage info.\n";
257 exit(1);
258 } else {
259 # If we get this far then ... it's a configuration file name:
260 $rc = $f;
261 die "w3mfix: Got a non-option argument that wasn't the name of a\n".
262 "(configuration) file either\n"
263 unless -f $f;
264 }
265 }
266}
267
268
269sub parse_cfg_file {
270 # Read the configuration file. Aborts on errors.
271 # Ignores w3mir options w3mfix does not need itself.
272
273 my ( $file ) = @_ ;
274 my ($key, $value, $authserver,$authrealm,$authuser,$authpasswd);
275 my $i;
276
277 die "w3mfix: config file $file is not a file.\n" unless -f $file;
278 open(CFGF, $file) || die "Could not open config file $file: $!\n";
279
280 # print STDERR "Reading $file\n";
281
282 $i=0;
283
284 while (<CFGF>) {
285 # Trim off various junk
286 chomp;
287 s/^#.*//;
288 s/^\s+|\s$//g;
289 # Anything left?
290 next if $_ eq '';
291 # Examine remains
292 $i++;
293 ($key, $value) = split(/\s*:\s*/,$_,2);
294 $key = lc $key;
295
296 # These are no-ops in w3mfix
297 next if ( $key eq 'initial-referer' );
298 next if ( $key eq 'header' );
299 next if ( $key eq 'pause' );
300 next if ( $key eq 'retry-pause' );
301 next if ( $key eq 'retries' );
302 next if ( $key eq 'robot-rules' );
303 next if ( $key eq 'remove-nomirror' );
304 next if ( $key eq 'file-disposition' );
305 next if ( $key eq 'http-proxy' );
306 next if ( $key eq 'proxy-options' );
307 next if ( $key eq 'auth-domain' );
308 next if ( $key eq 'auth-user' );
309 next if ( $key eq 'auth-passwd' );
310 next if ( $key eq 'disable-headers' );
311 next if ( $key eq 'agent' );
312
313 $debug=numeric($value),next if ( $key eq 'debug' );
314 umask(numeric($value)),next if ( $key eq 'umask' );
315 $indexname=$value,next if ($key eq 'index-name');
316 $verbose=nway($value,'quiet','brief','chatty')-1,next
317 if ( $key eq 'verbosity' );
318
319 if ( $key eq 'cd' ) {
320 $chdirto=$value;
321 next;
322 }
323
324 if ($key eq 'url') {
325 my ($rum_url_o,$lf_dir,$rum_reurl,$rum_rebase);
326
327 # A two argument URL: line?
328 if ($value =~ m/^(.+)\s+(.+)/i) {
329 # Two arguments.
330 $rum_url_o=url $1;
331 # The last is a directory, it must end in /
332 $lf_dir=$2;
333 $lf_dir.='/' unless $lf_dir =~ m~/$~;
334
335 # The first is a URL, make it more canonical, find the base.
336 # The namespace confusion in this section is correct.(??)
337 $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );
338
339 # print "URL: ",$rum_url_o->as_string,"\n";
340 # print "Base: $rum_rebase\n";
341
342 # Translate from rum space to lf space:
343 push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");
344
345 # That translation could lead to information loss.
346 $infoloss=1;
347
348 # Fetch rules tests the rum_url_o->as_string. Fetch whatever
349 # matches the base.
350 $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
351
352 # Ignore whatever did not match the base.
353 $scope_ignore.="return 0 if m/^".
354 quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
355
356 } else {
357 # $rum_url_o=root_quene($value);
358
359 $rum_url_o=url $value;
360
361 $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );
362
363 # Translate from rum space to lf space:
364 push(@internal_apply,"s/^".$rum_rebase."//");
365
366 $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
367 $scope_ignore.="return 0 if m/^".
368 quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
369 }
370 next;
371 }
372
373 if ($key eq 'also' || $key eq 'also-quene') {
374 if ($value =~ m/^(.+)\s+(.+)/i) {
375 my ($rum_url_o,$lf_dir,$rum_reurl,$rum_rebase);
376 # Two arguments.
377 $rum_url_o=url $1;
378 $rum_url_o->host(lc $rum_url_o->host);
379 # The last is a directory, it must end in /
380 $lf_dir=$2;
381 $lf_dir.='/' unless $lf_dir =~ m~/$~;
382
383 # The first is a URL, find the base
384 $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );
385
386 # Ok, now we can transform and select stuff the right way
387 push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");
388 $infoloss=1;
389
390 # Fetch rules tests the rum_url_o->as_string. Fetch whatever
391 # matches the base.
392 $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
393
394 # Ignore whatever did not match the base. This cures problem
395 # with '..' from base in in rum space pointing within the the
396 # scope in ra space. We introduced a extra level (or more) of
397 # directories with the apply above. Must do same with 'Also:'
398 # directives.
399 $scope_ignore.="return 0 if m/^".
400 quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
401 } else {
402 die "Also: requires 2 arguments\n";
403 }
404 next;
405 }
406
407 if ($key eq 'quene') {
408 # root_quene($value);
409 next;
410 }
411
412 if ($key eq 'ignore-re' || $key eq 'fetch-re') {
413 # Check that it's a re, better that I am strict than for perl to
414 # make compilation errors.
415 unless ($value =~ /^m(.).*\1[gimosx]*$/) {
416 print STDERR "w3mfix: $value is not a recognized regular expression\n";
417 exit 1;
418 }
419 }
420
421 if ($key eq 'fetch' || $key eq 'fetch-re') {
422 my $expr=$value;
423 $expr = wild_re($expr).$ipost if ($key eq 'fetch');
424 $rule_text.=' return 1 if '.$expr.";\n";
425 next;
426 }
427
428 if ($key eq 'ignore' || $key eq 'ignore-re') {
429 my $expr=$value;
430 $expr = wild_re($expr).$ipost if ($key eq 'ignore');
431 $rule_text.=' return 0 if '.$expr.";\n";
432 next;
433 }
434
435
436 if ($key eq 'apply') {
437 unless ($value =~ /^s(.).*\1.*\1[gimosxe]*$/) {
438 print STDERR
439 "w3mfix: '$value' is not a recognized regular expression\n";
440 exit 1;
441 }
442 push(@user_apply,$value) ;
443 $infoloss=1;
444 next;
445 }
446
447 if ($key eq 'options') {
448
449 my($val,$nval);
450 foreach $val (split(/\s*,\s*/,lc $value)) {
451 if ($i==1) {
452 $nval=nway($val,'recurse','no-date-check','only-nonexistent',
453 'list-urls','lowercase','remove','batch','read-urls',
454 'abs');
455 $r=1,next if $nval==0;
456 next if $nval==1;
457 next if $nval==2;
458 $list=1,next if $nval==3;
459 if ($nval==4) {
460 $lc=1;
461 $iinline=($lc?"(?i)":"");
462 $ipost=($lc?"i":"");
463 next ;
464 }
465 next if $nval==5;
466 next if $nval==6;
467 next if $nval==7;
468 $abs=1,next if $nval==8;
469 } else {
470 die "w3mfix: options must be the first directive in the config file.\n";
471 }
472 }
473 }
474
475 if ($key eq 'fixup') {
476
477# chomp($fixrc=`pwd` || '.');
478# $fixrc.="/$file";
479# warn "Fixrc: $fixrc\n";
480# $fixup=1;
481
482 my($val,$nval);
483 foreach $val (split(/\s*,\s*/,lc $value)) {
484 $nval=nway($val,'on','run','noindex');
485 next if $nval==0;
486 next if $nval==1;
487 $doindex=0 if $nval==2;
488 # Ignore everyting else
489 }
490 }
491
492 }
493 close(CFGF);
494}
495
496sub wild_re {
497 # Here we translate unix wildcard subset to to perlre
498 local($_) = shift;
499 s#\*#\.\*#;
500 s#\?#\.#;
501 s#([\/\(\)\\\|\{\}\+)\$\^])#\\$1#g;
502 return $_ = '/'.$_.'/';
503}
504
505
506sub numeric {
507 # Check if argument is numeric?
508 my ( $number ) = @_ ;
509 return oct($number) if ($number =~ /\d+/ || $number =~ /\d+.\d+/);
510 die "Expected a number, got \"$number\"\n";
511}
512
513
514sub boolean {
515 my ( $boolean ) = @_ ;
516
517 $boolean = lc $boolean;
518
519 return 0 if ($boolean eq 'false' || $boolean eq 'off' || $boolean eq '0');
520 return 1 if ($boolean eq 'true' || $boolean eq 'on' || $boolean eq '1');
521 die "Expected a boolean, got \"$boolean\"\n";
522}
523
524
525sub nway {
526 my ( $value ) = shift;
527 my ( @values ) = @_;
528 my ( $val ) = 0;
529
530 $value = lc $value;
531 while (@_) {
532 return $val if $value eq shift;
533 $val++;
534 }
535 die "Expected one of ".join(", ",@values).", got \"$value\"\n";
536}
537
538
539sub stat {
540
541 my $file = shift;
542
543 if (exists($stat{$file})) {
544 print STDERR "++Cache hit: $file\n" if $debug;
545 } else {
546 stat($file);
547 if (-e _) {
548 $stat{$file}=(-d _)?'d':'f';
549 } else {
550 $stat{$file}='n';
551 }
552 print STDERR "--Cache miss: $file\n" if $debug;
553 }
554 return $stat{$file};
555}
556
557# ######################## Read 'state' files ##############################
558
559sub read_state {
560
561 my $reffile='.referers';
562 my $refered;
563 my @referers;
564
565 $reffile="referers" if $win32;
566
567 print STDERR "reading $reffile\n" if $verbose>0;
568 open(REFERERS,"< $reffile") ||
569 die "Could not open $reffile for reading: $!\n";
570
571 while (<REFERERS>) {
572 chomp;
573 ($refered,undef,@referers) = split(/\s+/);
574 $rum_referers{$refered}= [ @referers ];
575 # print STDERR $refered," <- ",join(' and ',@referers),"\n";
576 }
577
578 close(REFERERS);
579
580
581 # Read redirection report
582
583 my $redirfile='.redirs';
584 my $wrong;
585 my $right;
586 my $tmp;
587
588 $redirfile="redirs" if $win32;
589
590 print STDERR "reading $redirfile\n" if $verbose>0;
591 open(REDIRS,"< $redirfile") ||
592 die "Could not open $redirfile for reading: $!\n";
593
594 while (<REDIRS>) {
595 chomp;
596 ($wrong,undef,$right) = split(/\s+/);
597
598 $rum_redirected{$wrong}=$right;
599 }
600
601 close(REDIRS);
602}
603
604# ######################### Process every single tag ########################
605
606sub process_tag {
607 # Process a tag in html file
608 my $lf_referer = shift;
609 my $base_url = shift;
610 my $tag_name = shift;
611 my $url_attrs = shift;
612
613 # Retrun quickly if no URL attributes
614 return unless defined($url_attrs);
615
616 my $attrs = shift;
617
618 # Information loss through apply or processing in this procedure?
619 my $il = $infoloss;
620 my $redirs;
621 my $stat;
622
623 my $rum_url; # The absolute URL
624 my $lf_url; # The local filesystem url
625 my $lf_url_o; # ... and it's object
626 my $key;
627 my $orig_rum_url;
628# my $debug = 1;
629
630# print STDERR "\nProcess Tag: $tag_name, URL attributes: ", join(', ',@{$url_attrs}),"\nOrigin:",$base_url,"\n"; # if $debug>2;
631
632 substr($lf_referer,0,0)='./' unless substr($lf_referer,0,1) eq '/';
633
634 foreach $key (@{$url_attrs}) {
635 if (defined($$attrs{$key})) {
636 $orig_rum_url=$rum_url=$$attrs{$key};
637
638 print STDERR "\n$key = $rum_url\n" if $debug;
639
640 # Apply redirects:
641 $redirs=0;
642 if (exists($rum_redirected{$rum_url})) {
643 $il=1;
644 while (exists($rum_redirected{$rum_url})) {
645 die "Too many redirects in a row\n" if $redirs++>32;
646 print STDERR "$rum_url -> ".$rum_redirected{$rum_url}."\n"
647 if $debug;
648 $rum_url=$rum_redirected{$rum_url};
649 }
650 }
651
652 # Apply program/user apply rules
653 $lf_url=apply($rum_url);
654
655 if (defined($lf_url)) {
656 # Apply directory/file check here
657 $stat=&stat($lf_url);
658
659 if ($stat eq 'f' && $lf_url =~ m~/$~) {
660 # It's a file, remove trailing /
661 print STDERR "****** File / fixup of $lf_url\n" if $debug;
662 $il=1;
663 substr($lf_url,length($lf_url)) = '';
664 } elsif ($stat eq 'd' && !($lf_url =~ m~/$~) ) {
665 # It's a directory, add a trailing /
666 print STDERR "****** Directory / fixup of $lf_url\n" if $debug;
667 $il=1;
668 $lf_url .= '/';
669 }
670
671 substr($lf_url,0,0)='./' unless substr($lf_url,0,1) eq '/';
672
673 $lf_url_o=url $lf_url;
674 my $tmp=$lf_url_o->clone;
675 $tmp->basename($indexname);
676
677 if ( $doindex && $lf_url_o->basename eq "" &&
678 &stat($tmp->unix_path) eq 'f' &&
679 !$htmlop::isdir{$key}) {
680 $lf_url_o=$tmp;
681 $il=1;
682 print STDERR "indexname adjusted to ",$lf_url_o->as_string,"\n"
683 if $debug;
684 }
685
686 # Save new value in the hash, make it a file url to get 'rel' working
687 $lf_url_o->scheme('file');
688 $$attrs{$key} = ($lf_url_o->rel("file:".$lf_referer))->as_string;
689
690 printf STDERR "Saved ".$$attrs{$key}."\n" if $debug;
691
692 # If there is potential information loss save the old value too
693 $$attrs{"W3MIR".$key}=$orig_rum_url if $il;
694
695 } elsif ($redirs>0) {
696 $$attrs{$key}=$rum_url;
697 print STDERR "Saved ".$$attrs{$key}."\n" if $debug;
698 $$attrs{"W3MIR".$key}=$orig_rum_url;
699 }
700 }
701 }
702}
703
704
705# ###################### Edit the URLs in one file... ########################
706
707sub edit_html_file {
708 # Check if it's a html file. I know this tag is in all html
709 # files, because w3mir put it there.
710
711 my($lf_url)=shift;
712 my($rum_url)=shift;
713
714 # Figure out the filename for our local filesystem.
715 $lf_url.=$indexname if $lf_url =~ m~/$~ || $lf_url eq '';
716
717 # Stuff in need of unquoting?
718 $lf_url = (url "file:$lf_url")->local_path if $lf_url =~ /\%\d\d/;
719
720 if (exists($lf_edited{$lf_url})) {
721 print STDERR "Already edited $lf_url\n" if $debug;
722 return ;
723 }
724
725 $lf_edited{$lf_url}=1;
726
727 my $page;
728 my $newpage;
729 my $read;
730 my $atime;
731 my $mtime;
732
733 # dev uno mode nlink uid gid rdev size atime mtime
734 (undef,undef,undef,undef,undef,undef,undef,undef,$atime,$mtime)
735 = stat($lf_url);
736
737 if (!open(TMPF,"< $lf_url\n")) {
738 warn "Cannot read $lf_url: $!\n" if $verbose>=0;
739 return;
740 }
741
742 $read=sysread(TMPF,$page,10240,0);
743 close(TMPF);
744
745 if (! $page =~ /<HTML/i) {
746 print STDERR "$lf_url is not html\n" if $verbose>0;
747 return ;
748 }
749
750 $files++;
751
752 print STDERR "w3mfix: $lf_url" if $verbose>=0;
753
754 print STDERR "$lf_url is a html file\n" if $debug;
755
756 print STDERR " reading" if $verbose>0;
757
758 open(TMPF,$lf_url) ||
759 die "Could not open $lf_url for reading: $!\n";
760 # read the whole file.
761 {
762 local($/)=undef;
763 $page = <TMPF>;
764 }
765 close(TMPF);
766
767 print STDERR " ",length($page)," bytes" if $verbose>0;
768
769 # It's a html document
770
771 print STDERR ", editing" if $verbose>0;
772
773 ($newpage,undef) =
774 &htmlop::process($page, # $htmlop::NODOC,
775 $htmlop::ABS,$rum_url,
776 $htmlop::USESAVED,'W3MIR',
777 $htmlop::TAGCALLBACK,\&process_tag,$lf_url);
778
779 open(TMPF,">$lf_url") ||
780 die "\nCould not open $lf_url for writing: $!\n";
781
782 print STDERR ", saving" if $verbose>0;
783
784 if (length($newpage)) {
785 # This is ODD: close does not seem to flush the buffers. So we
786 # force the issue.
787 local($|)=1;
788 print TMPF $newpage ||
789 die "\nCould not write to $lf_url (disk full?): $!\n";
790 }
791
792 close(TMPF) ||
793 die "\nCould not close $lf_url after writing: $!\n";
794
795 # Set times back to what they were.
796 utime $atime,$mtime,$lf_url;
797
798 print STDERR ".\n" if $verbose>=0;
799}
800
801# ############################### Scope test
802
803sub want_this {
804 # Find out if we want the url passed. Just pass it on to the
805 # generated functions.
806 my($rum_url)=shift;
807
808 # Does scope rule want this?
809 return &$scope_code($rum_url) &&
810 # Does user rule want this too?
811 &$rule_code($rum_url)
812
813}
814
815# ############################### Apply the apply rules
816
817sub user_apply {
818 # Apply the user apply rules
819 return &$user_apply_code(shift);
820}
821
822
823sub internal_apply {
824 # Apply the w3mir generated apply rules
825 return &$apply_code(shift);
826}
827
828
829sub apply {
830 # Apply the user apply rules. Then if URL is wanted return result of
831 # w3mir apply rules. Return the undefined value otherwise.
832
833 my $url = user_apply(shift);
834
835 return undef unless want_this($url);
836
837 internal_apply($url);
838}
839
840# ############################### Decide what URLs to edit
841
842sub edit_as_needed {
843 my $rum_redirected;
844 my $rum_url;
845 my $o_rum_url;
846 my $rum_referer;
847 my $lf_url;
848 my $foo;
849 my $redirs;
850
851 if ($editthis) {
852
853 # Find the URLs that match $edithis
854 foreach $o_rum_url (keys %rum_referers) {
855 # Work on them if they (now) fall within the scope of retrival
856 $redirs=0;
857 $rum_url=$o_rum_url;
858 if (exists($rum_redirected{$o_rum_url})) {
859 while (exists($rum_redirected{$rum_url})) {
860 die "Too many redirects in a row\n" if $redirs++>32;
861 print STDERR "$rum_url -> ".$rum_redirected{$rum_url}."\n"
862 if $debug;
863 $rum_url=$rum_redirected{$rum_url};
864 }
865 }
866 next unless $rum_url =~ /$editthis/io;
867 next unless want_this($rum_url);
868 # Find and edit the documents containing references to $o_rum_url
869 foreach $rum_referer (@{$rum_referers{$o_rum_url}}) {
870 next if $rum_referer eq '(commandline)';
871 $lf_url=apply($rum_referer);
872 next unless defined($lf_url);
873 edit_html_file($lf_url,$rum_referer);
874 }
875 }
876
877 # Don't do anything else when invoked thus
878 return;
879 }
880
881 if ($doindex) {
882 # Edit everything that refers anything with trailing /
883 foreach $rum_url (grep(/\/$/,keys %rum_referers)) {
884 foreach $rum_referer (@{$rum_referers{$rum_url}}) {
885 next if $rum_referer eq '(commandline)';
886 $lf_url=apply($rum_referer);
887 next unless defined($lf_url);
888 edit_html_file($lf_url,$rum_referer);
889 }
890 }
891 }
892
893 # Edit only redirected stuff
894 foreach $rum_redirected (keys %rum_redirected) {
895 # print "Redirected $rum_redirected\n";
896 foreach $rum_url (@{$rum_referers{$rum_redirected}}) {
897 # print "- Found in $rum_url\n";
898 $lf_url=apply($rum_url);
899 next unless defined($lf_url);
900 edit_html_file($lf_url,$rum_url);
901 }
902 }
903}
904
905# ############################### 'main'
906
907&parse_args(@ARGV);
908
909if (!$rc) {
910 $rc='.w3mirc';
911 $rc='w3mir.ini' if $win32;
912}
913
914print STDERR "w3mfix: rc file: $rc\n" if $verbose>0;
915
916&parse_cfg_file($rc);
917
918# $chdirto is ignored, w3mir already did it
919
920# Compile second order code
921
922# - The rum scope tests
923my $full_rules=$scope_prefix.$scope_fetch.$scope_ignore.$scope_postfix;
924eval $full_rules;
925
926# warn "Scope rules:\n-------------\n$full_rules\n---------------\n";
927
928die "Program generated rules did not compile. The code is:\n----\n".
929 $full_rules."\n----\n"
930 if !defined($scope_code);
931
932$full_rules=$rule_prefix.$rule_text.$rule_postfix;
933eval $full_rules;
934
935# warn "User rules:\n-------------\n$full_rules\n---------------\n";
936
937# - The user specified rum tests
938die "Ignore/Fetch rules did not compile. The code is:\n----\n".
939 $full_rules."\n----\n"
940 if !defined($rule_code);
941
942# - The user specified apply rules
943
944my $full_apply=$apply_prefix.($lc?$apply_lc:'').
945 join($ipost.";\n",@user_apply).(($#user_apply>=0)?$ipost:"").";\n".
946 $apply_postfix;
947eval $full_apply;
948
949die "User apply rules did not compile. The code is:
950----
951".$full_apply."
952----\n" if !defined($apply_code);
953
954$user_apply_code=$apply_code;
955
956# - The w3mir generated apply rules
957
958$full_apply=$apply_prefix.($lc?$apply_lc:'').
959 join($ipost.";\n",@internal_apply).(($#internal_apply>=0)?$ipost:"").";\n".
960 $apply_postfix;
961eval $full_apply;
962
963die "Internal apply rules did not compile. The code is:
964----
965".$full_apply."
966----\n" if !defined($apply_code);
967
968&read_state;
969
970&edit_as_needed;
971
972exit 0;
973__END__
974# -*- perl -*- There must be a blank line here:
975
976=head1 NAME
977
978w3mfix - fixup program for w3mir
979
980=head1 SYNOPSIS
981
982B<w3mfix> [B<options>] [B<configuration-file>]
983
984=head1 DESCRIPTION
985
986B<w3mfix> is the companion program to L<w3mir>. It can be used for
987several URL editing operations usefull in different situations.
988
989When starting B<w3mfix> will read it's configuration file. It's name is
990either .w3mirc (w3mir.ini on win32) or specified on the commandline.
991
992B<w3mfix> is controlled by the 'Fixup' directive of the configuration
993file (described in the L<w3mir> documentation). B<w3mfix> is also
994affected by 'Index-name' and the one special commandline option it
995knows, as well as the directives/options controlling verbosity and
996debugging information.
997
998=head1 DESCRIPTION
999
1000B<w3mfix> can rewrite URLs in these ways:
1001
1002=over 4
1003
1004=item * Rewrite URLs that resutled in redirects to point to the place
1005redirected to. This is needed in all cases and will always be fixed
1006by B<w3mfix>.
1007
1008=item * Change URLs ending in .../ into .../index.html (or
1009.../Welcome.html). This is, probably, not needed when the mirror is
1010meant do be used with a web-server. It is usefull for browsing
1011directly from disk or CDROM, but in this case it's, most often,
1012required. To disable this specify the I<noindex> option with the
1013'Fixup' directive.
1014
1015The default is to transform URLs ending in .../ into .../index.html.
1016
1017To controll the name of the index file use the I<Index-name> directive
1018as documented in L<w3mir>
1019
1020=item * Change URL links to documents outside the mirror to point to
1021some local document. Could be usefull if the mirror is destined for a
1022CDROM to be used on a unconnected machine.
1023
1024THIS IS NOT YET IMPLEMENTED
1025
1026=item * Change URL links to documents that L<w3mir> was
1027unable/forbidden to retrive to point to some local document. Pointing
1028these to a nice informative document is probably better than random
1029error messages from the browser.
1030
1031THIS IS NOT YET IMPLEMENTED
1032
1033=item * And, least, but far from last, B<w3mfix> can be used to
1034prepare an established mirror for enlargement.
1035
1036This feature is used thus: Add the new site or subsite to be mirrored
1037on the configuration file (by adding B<Also:> and B<Also-quene:>
1038directives). Then run B<w3mfix> with the B<-editref> option. When
1039the B<-editref> option is specified B<w3mfix> will not perform any
1040other editing tasks.
1041
1042E.g.; To add I<http://www.yahoo.com/Science/Artificial_Life/> to your
1043mirror add something like
1044
1045 Also: http://www.yahoo.com/Science/Artificial_Life/ yahoo
1046
1047to the configuration file, then run w3mfix:
1048
1049 w3mfix -editref www.yahoo.com/Science/Artificial_Life
1050
1051This will cause all references to
1052I<www.yahoo.com/Science/Artificial_Life> (and under) to be edited so they
1053point to within the mirror. After B<w3mfix> has finished you can run
1054L<w3mir> in the normal manner.
1055
1056=back
1057
1058=head1 BUGS
1059
1060Naah.
1061
1062=head1 SEE ALSO
1063
1064L<w3mir>
1065
1066=head1 AUTHORS
1067
1068B<w3mir>s authors can be reached at I<[email protected]>.
1069B<w3mir>s home page is at http://www.math.uio.no/~janl/w3mir/
1070
Note: See TracBrowser for help on using the repository browser.