1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 |
|
---|
4 | # Both this script and its associated process_html.pl were written by
|
---|
5 | # Marcel ?, while a student at Waikato University. Unfortunately he
|
---|
6 | # was very new to perl at the time so the code is neither as clean nor
|
---|
7 | # as fast as it could be (I've cleaned up a few of the more serious
|
---|
8 | # bottlenecks -- it could do with a lot more work though). The code
|
---|
9 | # does work though, if a little slowly. It's not ready for primetime
|
---|
10 | # however and is included in the Greenstone source tree mostly so that
|
---|
11 | # I don't lose it. -- Stefan - 24 Jul 2001
|
---|
12 |
|
---|
13 |
|
---|
14 | # This script rebuilds the static collection by linking all the downloaded html files
|
---|
15 | # back together.
|
---|
16 | # It searches through html files and replaces links it recognizes from the links.txt file
|
---|
17 | # with the appropriate html file name (eg 1.html, 2.html etc)
|
---|
18 | # This script also updates the links for the pictures.
|
---|
19 |
|
---|
# This is where all the downloaded html files are located, eg. 'temp_html/'
my $outputdir = 'temp_html/';

# This is where all the processed files end up (we don't want to overwrite
# the originals ;) eg. 'my_static_collection/'
my $finaldir = 'envl_collection/';

# If any options were used (such as &u=1) when the html files were
# downloaded then please specify them here.
my $option = "&u=1";

# Please ensure these two options match the settings used when downloading the collection :)
my $dir_entries = 250;
my $fix_empty_pages = '&cl=CL1';

#-------------------------------------------------------------------------------------------------

# Global arrays used to store links, links-index & html-filenames.
#
# These are deliberately package variables declared with 'our', not
# lexicals: the script indexes every one of them as an array, and @linkz is
# handed to sort_array_by_length() as a typeglob (*linkz), which can only
# alias a package variable.
our @filez;
our @out_filez;
our @linkz;
our @short_linkz1;
our @short_linkz2;
our @short_linkz3;
our @linkz_index;

# Regex alternation of image directory prefixes; filled in from
# image_dirs.txt before the files are processed.
my $remove_these = "";
45 |
|
---|
# processfiles($start_here)
#
# Rewrites the downloaded pages listed in @filez, starting at index
# $start_here, and stores each result under the matching @out_filez name.
#
# For every page the routine
#   * stops at the first input file that is missing or empty,
#   * appends the $fix_empty_pages value to bare '&cl=' attributes,
#   * replaces every link pattern built from @short_linkz1/2/3 with the
#     corresponding "<@linkz_index entry>.html" name,
#   * rewrites quoted image directory prefixes ($remove_these) to "../",
#   * writes the page to temp.html and renames it onto the output name.
#
# Dies when a file cannot be opened, written or renamed.
sub processfiles
{
    my ($start_here) = @_;

    for my $file ($start_here .. $#filez)
    {
        # A missing or empty input marks the end of the downloaded set.
        last unless (-e $filez[$file]) && (-s $filez[$file]);

        open(my $in, '<', $filez[$file]) or die "can't open ", $filez[$file], ": $! \n";

        print " $filez[$file] ";

        # Slurp the whole page; $/ is only changed inside this do-block.
        my $content_of_file = do { local $/; <$in> };
        close($in);

        # quick & nasty fix for the 'open book' link
        $content_of_file =~ s/&cl="/${fix_empty_pages}"/g;
        $content_of_file =~ s/&cl='/${fix_empty_pages}'/g;

        # NOTE(review): the link fragments are interpolated as raw regex
        # text, so a leading '?' in a fragment acts as a quantifier on the
        # preceding '.*'.  This is kept as-is because the existing matching
        # behaviour depends on it.
        for my $link (0 .. $#linkz)
        {
            my $new_link = $linkz_index[$link] . ".html";

            if ($short_linkz3[$link] ne "")
            {
                $content_of_file =~ s/$short_linkz1[$link].*?$short_linkz2[$link].*?${short_linkz3[$link]}[^\"\'\s\>]*/$new_link/g;
            }
            else
            {
                $content_of_file =~ s/$short_linkz1[$link].*${short_linkz2[$link]}[^\"\'\s\>]*/$new_link/g;
            }
        }

        # An empty alternation would match after every quote character and
        # prefix all of them with "../", so skip the rewrite in that case.
        if ($remove_these ne "" && $remove_these ne "()")
        {
            $content_of_file =~ s/(["'])$remove_these/$1..\//g;
        }

        open(my $out, '>', 'temp.html') or die "can't open temp.html: $! \n";
        print {$out} $content_of_file;
        # Buffered write errors only surface when the handle is closed.
        close($out) or die "can't write temp.html: $! \n";

        rename("temp.html", $out_filez[$file]) or die "cannot create ", $out_filez[$file], ": $! \n";
        print " --> $out_filez[$file]";
        print "..done\n";
    }
    print " *** Done, cannot find any more files to process ***\n";
}
99 |
|
---|
# _split_link_parts($link)
#
# Scans $link for the '?e='/'&e=', '?a='/'&a=' and '?d='/'&d=' markers and
# returns the list ($prefix, $action, $doc):
#   $prefix - text before the last '?e=', '&e=' or '?a=' marker
#   $action - text from the last '?a='/'&a=' marker up to the most recently
#             seen '?d='/'&d=' marker, or to the end of the link if none
#   $doc    - text from the last '?d='/'&d=' marker to the end of the link
# An element is undef when its marker never occurs.
sub _split_link_parts
{
    my ($link) = @_;
    my ($prefix, $action, $doc);
    my $d_offset = 0;

    for my $pos (0 .. length($link) - 3)
    {
        my $tag = substr($link, $pos, 3);

        if ($tag eq '?e=' || $tag eq '&e=')
        {
            $prefix = substr($link, 0, $pos);
        }

        # Nearest '?d=' / '&d=' marker at or after the current position.
        my $next_d = index($link, '?d=', $pos);
        my $amp_d  = index($link, '&d=', $pos);
        $next_d = $amp_d if $amp_d >= 0 && ($next_d < 0 || $amp_d < $next_d);
        if ($next_d >= 0)
        {
            $doc      = substr($link, $next_d);
            $d_offset = $next_d;
        }

        if ($tag eq '?a=' || $tag eq '&a=')
        {
            # Only a leading '?a=' also terminates the prefix.
            $prefix = substr($link, 0, $pos) if $tag eq '?a=';
            $action = $d_offset > 0
                ? substr($link, $pos, $d_offset - $pos)
                : substr($link, $pos);
        }
    }

    return ($prefix, $action, $doc);
}

# sort_array_by_length(*array, $switch)
#
# Reorders the array in place so that the longest strings come first.
# Entries of equal length end up in reverse order of their original
# position.  The array is normally passed as a typeglob (*linkz); an array
# reference is accepted as well.
#
# The switch variable is there so that a couple of additional arrays can be
# filled without having to write an entirely new function :-)
# 0 = off, 1 = on.  When on, each sorted slot also gets
#   @linkz_index  - "../<directory number>/<original index>"
#   @short_linkz1, @short_linkz2, @short_linkz3 - the pieces returned by
#     _split_link_parts(); a slot is left untouched when its piece is absent
# and progress messages are printed.
sub sort_array_by_length
{
    my ($target, $switch) = @_;
    my $list  = \@{$target};    # works for a typeglob and for an array ref
    my $total = $#{$list};

    if ($switch != 0)
    {
        print "Processing linkz (chopping, slicing, dicing and sorting :-)...";
    }

    # Directory number for every original index: the closest multiple of
    # $dir_entries at or below that index.
    my @dir_of;
    if ($switch != 0)
    {
        my $numberdir = 0;
        for my $counter (0 .. $total)
        {
            $numberdir = $counter if $counter % $dir_entries == 0;
            $dir_of[$counter] = $numberdir;
        }
    }

    # Shortest first, ties by original position; the slots are then filled
    # from the back of the array so the longest entry lands at index 0.
    my @order = sort {
        length($list->[$a]) <=> length($list->[$b]) || $a <=> $b
    } 0 .. $total;

    my @sorted;
    my $backward = $total;
    for my $counter (@order)
    {
        $sorted[$backward] = $list->[$counter];

        if ($switch != 0)
        {
            $linkz_index[$backward] = "../" . $dir_of[$counter] . "/" . $counter;

            my ($prefix, $action, $doc) = _split_link_parts($list->[$counter]);
            $short_linkz1[$backward] = $prefix if defined $prefix;
            $short_linkz2[$backward] = $action if defined $action;
            $short_linkz3[$backward] = $doc    if defined $doc;
        }
        $backward--;
    }

    # Copy the sorted entries over the caller's array.
    @{$list} = @sorted;

    if ($switch != 0)
    {
        print "done!\n";
    }
}
203 |
|
---|
# how_much_to_chop($link)
#
# Returns how many leading characters of $link precede its third '/',
# i.e. the offset of that slash (for "http://host/path" this is the length
# of "http://host").
#
# Returns 0 when $link contains fewer than two slashes.  When there are
# exactly two slashes the whole string is the part to chop, so the length of
# $link is returned; this keeps the result a valid substr() offset.
sub how_much_to_chop
{
    my ($link) = @_;

    # index() returns -1 on failure, so a missing first slash restarts the
    # second search at offset 0 and fails again.
    my $second_slash = index($link, '/', index($link, '/') + 1);
    return 0 if $second_slash < 0;

    my $third_slash = index($link, '/', $second_slash + 1);
    return $third_slash >= 0 ? $third_slash : length($link);
}
223 |
|
---|
my $start_time = (times)[0];

#-----------------------------------------------------------------------------------------------
# Resume support: rather than starting from scratch every time, find out
# how far a previous run got by checking which processed html files already
# exist, and continue from there.
#-----------------------------------------------------------------------------------------------
my $linknumber = 0;
my $failed = 0;
my $check_file = "";
my $numberdir = 0;

if ($outputdir ne $finaldir)
{
    until ($failed)
    {
        # Every $dir_entries-th page starts a new numbered directory.
        if ($linknumber % $dir_entries == 0)
        {
            my $page_dir = $finaldir . $linknumber;
            unless ((-e $page_dir) && (-d $page_dir))
            {
                # No directory means no pages in it either: stop scanning.
                $failed++;
                mkdir($page_dir, 0777) or die " ** Cannot create ", $page_dir, "!: $!\n";
            }
            $numberdir = $linknumber;
        }

        $check_file = $finaldir . $numberdir . "/" . $linknumber . ".html";
        if (!$failed && (-e $check_file))
        {
            $linknumber++;
            next;
        }

        $failed++;
        # Step back one page, just in case the last one was only half
        # written ;^)
        $linknumber-- if $linknumber > 0;
        print " Will start processing at number $linknumber \n";
    }
}
my $i = 0;
my $that = "";
my $offset = 0;

# Read the old links from the links text file, one link per line.  Each link
# is stored in @linkz with its leading part (as measured on the first line
# by how_much_to_chop) and the download option string removed.
open(my $links_fh, '<', 'links.txt') or die " ** Cannot find/open links text file!: $!\n";
while (defined($that = <$links_fh>))
{
    if ($i == 0)
    {
        # The first link decides how much is chopped off every link.
        $offset = how_much_to_chop($that);
        print " Offset has been set to: ",$offset,"\n";
        print " This next bit will be ignored for all links in the links.txt file:\n";
        print " -->",substr($that,0,$offset),"<--\n";
    }

    $that = substr($that, $offset);

    # Wipe out the EOL character
    chomp $that;

    # Wipe the options.  The option text is matched literally, and an empty
    # option is skipped because an empty pattern has special meaning in Perl.
    if (length $option)
    {
        $that =~ s/\Q$option\E//;
    }

    $linkz[$i] = $that;

    $short_linkz1[$i] = "";
    $short_linkz2[$i] = "";
    $short_linkz3[$i] = "";

    for my $search (0 .. (length($that) - 3))
    {
        my $tag = substr($that, $search, 3);

        if ($tag eq '?e=' || $tag eq '&e=')
        {
            $short_linkz1[$i] = substr($that, 0, $search);
        }

        if ($tag eq '?a=')
        {
            $short_linkz1[$i] = substr($that, 0, $search);
            $short_linkz2[$i] = substr($that, $search);
        }
        if ($tag eq '&a=')
        {
            $short_linkz2[$i] = substr($that, $search);
        }
    }
    $i++;

    # Make sure the numbered output directory for the next batch exists.
    if ($i % $dir_entries == 0)
    {
        if (!((-e $finaldir.$i) && (-d $finaldir.$i)))
        {
            mkdir($finaldir.$i, 0777) or die " ** Cannot create ",$finaldir.$i, "!: $!\n";
        }
    }
}
close($links_fh);

print " - I found ",$i, " links in the links text file -\n";

sort_array_by_length(*linkz, 1);
335 |
|
---|
# Work out the input and output file name of every page: page N is kept in
# the directory named after the closest multiple of $dir_entries at or
# below N.
$numberdir = 0;

for my $z (0 .. ($i - 1))
{
    $numberdir = $z if $z % $dir_entries == 0;

    my $relative_name = $numberdir . "/" . $z . ".html";
    $filez[$z]     = $outputdir . $relative_name;
    $out_filez[$z] = $finaldir . $relative_name;
}
347 |
|
---|
# ..and last but not least, load any image_dirs from image_dirs.txt
# (one directory prefix per line) and turn them into the regex alternation
# stored in $remove_these.
my @tmp_array = ();
open(my $image_dirs_fh, '<', 'image_dirs.txt') or die " ** HEY! Cannot find/open image_dirs.txt file! : $! **\n";
while (defined(my $imd_that = <$image_dirs_fh>))
{
    chomp $imd_that;
    # A blank line would add an empty alternative that matches everywhere.
    next if $imd_that eq "";
    push(@tmp_array, $imd_that);
}
close($image_dirs_fh);

# Longest prefixes first so the most specific directory wins; every name is
# escaped so it is matched literally.  With no directories at all, use a
# pattern that can never match instead of an empty group.
$remove_these = @tmp_array
    ? "(" . join("|", map { quotemeta($_) } sort { length($b) <=> length($a) } @tmp_array) . ")"
    : "(?!)";
362 |
|
---|
363 | #print " - I found ",($#remove_these + 1)," picture directories in image_dirs.txt -\n";
|
---|
364 | #&sort_array_by_length(*remove_these, 0);
|
---|
365 |
|
---|
# Announce the start, process every page from the resume point onwards, then
# report the CPU time used and the remaining manual clean-up steps.
my $rule = "-" x 20;
print $rule, "\n",
      " Here we go...\n",
      $rule, "\n";

processfiles($linknumber);

my $end_time = (times)[0];
print "\n\n\n *----------------------------*\n",
      " | Whew! Task completed! :-D |\n",
      " *----------------------------*\n";
printf(" Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time);
print "\n\n",
      " Now there's a few things left to do...load up ", $finaldir, "0/0.html in your webbrowser and\n",
      " make sure everything works.\n",
      " The grab_collection script will have generated 3 text files that can be removed, namely:\n",
      " - links.txt \n",
      " - images.txt \n",
      " - image_dirs.txt \n\n";

# The download directory is only disposable when it is not also the output
# directory.
unless ($outputdir eq $finaldir)
{
    print "And then finally you can also delete the ", $outputdir, " directory.\n\n";
}
388 |
|
---|
389 |
|
---|
390 |
|
---|
391 |
|
---|
392 |
|
---|
393 |
|
---|