#!/usr/bin/perl -w # Both this script and its associated process_html.pl were written by # Marcel ?, while a student at Waikato University. Unfortunately he # was very new to perl at the time so the code is neither as clean nor # as fast as it could be (I've cleaned up a few of the more serious # bottlenecks -- it could do with alot more work though). The code # does work though, if a little slowly. It's not ready for primetime # however and is included in the Greenstone source tree mostly so that # I don't lose it. -- Stefan - 24 Jul 2001 # This script rebuilds the static collection by linking all the downloaded html files # back together. # It searches through html files and replaces links it recognizes from the links.txt file # with the apropriate html file name (eg 1.html, 2.html etc) # This script also updates the links for the pictures. # This is where all the dl-ed html files are located, eg. 'temp_html/' my $outputdir = 'temp_html/'; # This is where all the processed files end up (don't want to overwrite originals ;) eg. 'my_static_collection/' my $finaldir = 'envl_collection/'; # If any options where used (such as &u=1) when the html files where dl-ed then please specify them here. my $option = "&u=1"; # Please ensure these two options match the settings used when downloading the collection :) my $dir_entries = 250; my $fix_empty_pages = '&cl=CL1'; #------------------------------------------------------------------------------------------------- # global arrays used to store links, links-index & html-filenames my %filez; my %out_filez; my %linkz; my %short_linkz1; my %short_linkz2; my %short_linkz3; my %linkz_index; #my %remove_these; my $remove_these = ""; sub processfiles { local($start_here) = @_; for my $file($start_here .. $#filez) { if((-e $filez[$file])&&(-s $filez[$file])) { open (FILE, $filez[$file]) or die "can't open ", $filez[$file],": $! \n"; print " $filez[$file] "; undef $/; my $content_of_file = ; $/ = "\n"; close(FILE); #quick & nasty fix for the 'open book' link local $quick_fix1 = "&cl=\""; local $quick_fix2 = "&cl=\'"; $content_of_file =~ s/$quick_fix1/$fix_empty_pages\"/g; $content_of_file =~ s/$quick_fix2/$fix_empty_pages\'/g; for my $link(0 .. $#linkz) { my $new_link = $linkz_index[$link].".html"; if($short_linkz3[$link] ne "") { $content_of_file =~ s/$short_linkz1[$link].*?$short_linkz2[$link].*?${short_linkz3[$link]}[^\"\'\s\>]*/$new_link/g; } else { $content_of_file =~ s/$short_linkz1[$link].*${short_linkz2[$link]}[^\"\'\s\>]*/$new_link/g; } } $content_of_file =~ s/(["'])$remove_these/$1..\//g; open (TEMP, ">temp.html") or die "can't open temp.html: $! \n"; print TEMP $content_of_file; close(TEMP); rename("temp.html", $out_filez[$file]) or die "cannot create", $out_filez[$file],": $! \n"; print " --> $out_filez[$file]"; print "..done\n"; } else { last; # bomb out of loop. Done. } } print " *** Done, cannot find any more files to process ***\n"; } # the switch variable there so that I can create a couple of additional arrays without having to write an entirely new function :-) # 0 = off, 1 = on (puts values into %linkz_index, %short_linkz1 and %short_linkz2) sub sort_array_by_length { local (*foo, $switch) = @_; my $total = $#foo; my %temp_linkz; my $shortest = 999999; my $longest = 0; if ($switch != 0) { print "Processing linkz (chopping, slicing, dicing and sorting :-)..."; } for my $counter(0 .. $total) { if (length($foo[$counter]) < $shortest) { $shortest = length($foo[$counter]); $temp_linkz[$total] = $foo[$counter]; } if (length($foo[$counter]) > $longest) { $longest = length($foo[$counter]); } } $backward = $total; for my $l($shortest .. $longest) { local $numberdir = 0; for my $counter(0 .. $total) { if ($counter % $dir_entries == 0) { $numberdir = $counter; } if(length($foo[$counter]) == $l) { $temp_linkz[$backward] = $foo[$counter]; if ($switch != 0) { $linkz_index[$backward] = "../".$numberdir."/".$counter; my $d_offset = 0; for my $search(0 .. (length($foo[$counter]) - 3)) { if((substr($foo[$counter], $search, 3) eq '?e=')||(substr($foo[$counter], $search, 3) eq '&e=')) { $short_linkz1[$backward] = substr($foo[$counter], 0, $search); } for my $second_search($search .. length($foo[$counter])) { if((substr($foo[$counter], $second_search, 3) eq '?d=')||(substr($foo[$counter], $second_search, 3) eq '&d=')) { $short_linkz3[$backward] = substr($foo[$counter], $second_search); $d_offset = $second_search; last; } } if(substr($foo[$counter], $search, 3) eq '?a=') { $short_linkz1[$backward] = substr($foo[$counter], 0, $search); if($d_offset > 0) { $short_linkz2[$backward] = substr($foo[$counter], $search, $d_offset - $search); } else { $short_linkz2[$backward] = substr($foo[$counter], $search); } } if(substr($foo[$counter], $search, 3) eq '&a=') { if($d_offset > 0) { $short_linkz2[$backward] = substr($foo[$counter], $search, $d_offset - $search); } else { $short_linkz2[$backward] = substr($foo[$counter], $search); } } } } $backward--; } } } # copy the sorted temp_array over the original array (must be a better way of doing this :\ ) for my $counter(0 .. $total) { $foo[$counter] = $temp_linkz[$counter]; } if ($switch != 0) { print "done!\n"; } } sub how_much_to_chop { local($link) = @_; my $bracket_counter = 0; my $chop_offset = 0; for my $search(0 .. length($link)) { if (substr($link, $search, 1) eq '/') { $bracket_counter++; } if ($bracket_counter == 2) { $chop_offset = $search + 1; } } return $chop_offset; } my $start_time = (times)[0]; #----------------------------------------------------------------------------------------------- # No need to start from scratch everytime, we can recover/continue from wherever we left off # simply by checking which html files have been created #----------------------------------------------------------------------------------------------- my $linknumber = 0; my $failed = 0; my $check_file = ""; my $numberdir = 0; if($outputdir ne $finaldir) { while ($failed == 0) { if ($linknumber % $dir_entries == 0) { if (!((-e $finaldir.$linknumber)&&(-d $finaldir.$linknumber))) { $failed++; mkdir($finaldir.$linknumber, 0777) or die " ** Cannot create ",$finaldir.$linknumber, "!: $!\n"; } $numberdir = $linknumber; } $check_file = $finaldir.$numberdir."/".$linknumber.".html"; if ((-e $check_file)&&($failed == 0)) { $linknumber++; } else { $failed++; # I'm subtracting 1 from the starting link, # just in case it only loaded half the page ;^) if($linknumber>0) { $linknumber--; } print " Will start processing at number $linknumber \n"; } } } my $i = 0; my $that = ""; my $offset = 0; #read in old links from links text file open (CHECK, "links.txt") || die " ** Cannot find/open links text file!: $!\n"; while (defined ($that = )) { if ($i == 0) { #chop off the first bit $offset = &how_much_to_chop($that); print " Offset has been set to: ",$offset,"\n"; print " This next bit will be ignored for all links in the links.txt file:\n"; print " -->",substr($that,0,$offset),"<--\n"; } $that = substr($that, $offset); #Wipe-out the EOL character # if (substr($that, -1) eq "\n") { substr($that, -1) = ""; } chomp $that; #this wipes the options # if (length($option) != 0) # { # substr($that, (length($option)) * -1) = ""; # } $that =~ s/$option//; $linkz[$i] = $that; $short_linkz1[$i] = ""; $short_linkz2[$i] = ""; $short_linkz3[$i] = ""; for my $search(0 .. (length($that) - 3)) { if((substr($that, $search, 3) eq '?e=')||(substr($that, $search, 3) eq '&e=')) { $short_linkz1[$i] = substr($that, 0, $search); } if(substr($that, $search, 3) eq '?a=') { $short_linkz1[$i] = substr($that, 0, $search); $short_linkz2[$i] = substr($that, $search); } if(substr($that, $search, 3) eq '&a=') { $short_linkz2[$i] = substr($that, $search); } } $i++; if ($i % $dir_entries == 0) { if (!((-e $finaldir.$i)&&(-d $finaldir.$i))) { mkdir($finaldir.$i, 0777) or die " ** Cannot create ",$finaldir.$i, "!: $!\n"; } } } close(CHECK); print " - I found ",$i, " links in the links text file -\n"; &sort_array_by_length(*linkz, 1); $numberdir = 0; for my $z(0 .. ($i - 1)) { if($z % $dir_entries == 0) { $numberdir = $z; } $filez[$z] = $outputdir.$numberdir."/".$z.".html"; $out_filez[$z] = $finaldir.$numberdir."/".$z.".html"; } # ..and last but not least, load any image_dirs from image_dirs.txt my $imd_that = ""; #my $image_dirs_pointer = 0; my @tmp_arr = (); open (IMAGE_DIR, "image_dirs.txt") || die " ** HEY! Cannot find/open image_dirs.txt file! : $! **\n"; while(defined ($imd_that = )) { chomp $imd_that; push(@tmp_array, $imd_that); } close IMAGE_DIR; $remove_these = "(" . join ("|", sort {length $b <=> length $a} @tmp_array) . ")"; #print " - I found ",($#remove_these + 1)," picture directories in image_dirs.txt -\n"; #&sort_array_by_length(*remove_these, 0); print "-" x 20, "\n"; print " Here we go...\n"; print "-" x 20, "\n"; &processfiles($linknumber); my $end_time = (times)[0]; print "\n\n\n *----------------------------*\n"; print " | Whew! Task completed! :-D |\n"; print " *----------------------------*\n"; printf" Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time; print "\n\n"; print " Now there's a few things left to do...load up ",$finaldir, "0/0.html in your webbrowser and\n"; print " make sure everything works.\n"; print " The grab_collection script will have generated 3 text files that can be removed, namely:\n"; print " - links.txt \n"; print " - images.txt \n"; print " - image_dirs.txt \n\n"; if ($outputdir ne $finaldir) { print "And then finally you can also delete the ",$outputdir," directory.\n\n"; }