source: main/trunk/greenstone2/bin/script/process_html.pl@ 24375

Last change on this file since 24375 was 2668, checked in by sjboddie, 23 years ago

Added Marcel's static collection building scripts to the source tree. These
aren't really expected to be any use to anyone yet, they're included mostly
so that I don't lose them.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1#!/usr/bin/perl -w
2
3
4# Both this script and its associated process_html.pl were written by
5# Marcel ?, while a student at Waikato University. Unfortunately he
6# was very new to perl at the time so the code is neither as clean nor
7# as fast as it could be (I've cleaned up a few of the more serious
8# bottlenecks -- it could do with a lot more work though). The code
9# does work though, if a little slowly. It's not ready for primetime
10# however and is included in the Greenstone source tree mostly so that
11# I don't lose it. -- Stefan - 24 Jul 2001
12
13
14# This script rebuilds the static collection by linking all the downloaded html files
15# back together.
16# It searches through html files and replaces links it recognizes from the links.txt file
17# with the appropriate html file name (eg 1.html, 2.html etc)
18# This script also updates the links for the pictures.
19
# This is where all the dl-ed html files are located, eg. 'temp_html/'
my $outputdir = 'temp_html/';

# This is where all the processed files end up (don't want to overwrite originals ;) eg. 'my_static_collection/'
my $finaldir = 'envl_collection/';

# If any options were used (such as &u=1) when the html files were dl-ed then please specify them here.
my $option = "&u=1";

# Please ensure these two options match the settings used when downloading the collection :)
my $dir_entries = 250;            # number of pages stored per numbered sub-directory
my $fix_empty_pages = '&cl=CL1';  # substituted for an empty "&cl=" argument in the pages

#-------------------------------------------------------------------------------------------------

# Global arrays used to store links, links-index & html-filenames.
# They are package arrays (not lexicals) because the rest of the script
# reaches them by name and passes @linkz around as the typeglob *linkz.
our @filez;         # input page for each link number
our @out_filez;     # output page for each link number
our @linkz;         # links read from links.txt
our @short_linkz1;  # part of each link before its first recognised argument
our @short_linkz2;  # the "a=" (action) part of each link
our @short_linkz3;  # the "d=" (document) part of each link, "" when absent
our @linkz_index;   # replacement target for each link, without ".html"

# Regex alternation of image directories; filled in from image_dirs.txt.
my $remove_these = "";
45
# Rewrite the downloaded pages, starting with page number $start_here.
#
# For every page in @filez (from $start_here on) the page is slurped, each
# link known from @short_linkz1/2/3 is replaced by the matching entry of
# @linkz_index plus ".html", image directory prefixes ($remove_these) are
# replaced by "../", and the result is written to the same slot of
# @out_filez via a temporary file.
#
# Processing stops at the first input page that is missing or empty.
#
# Arguments:
#   $start_here - index into @filez of the first page to process
# Dies when a page cannot be read or the result cannot be written.
sub processfiles
{
    my ($start_here) = @_;

    # Compile one pattern per link up front; the patterns do not depend on
    # the page being processed.  Link text is quoted with \Q..\E so that
    # characters such as "?" and "." are matched literally, and the gaps
    # between the parts may only contain characters that can occur inside
    # a URL, so a match can never run from one link into the next.
    my @link_patterns;
    for my $link (0 .. $#linkz)
    {
        my $head   = defined $short_linkz1[$link] ? $short_linkz1[$link] : "";
        my $action = defined $short_linkz2[$link] ? $short_linkz2[$link] : "";
        my $doc    = defined $short_linkz3[$link] ? $short_linkz3[$link] : "";

        # A link with no recognisable parts would match everything; skip it.
        next if $head eq "" && $action eq "";

        if ($doc ne "")
        {
            $link_patterns[$link] = qr/\Q$head\E[^"'\s>]*?\Q$action\E[^"'\s>]*?\Q$doc\E[^"'\s>]*/;
        }
        else
        {
            $link_patterns[$link] = qr/\Q$head\E[^"'\s>]*?\Q$action\E[^"'\s>]*/;
        }
    }

    # $remove_these is already a regex alternation, so it is not quoted here.
    my $image_dir_pattern = qr/(["'])$remove_these/;

    for my $file ($start_here .. $#filez)
    {
        my $in_name = $filez[$file];

        # bomb out of loop at the first missing or empty page. Done.
        last unless (-e $in_name) && (-s $in_name);

        open(my $in, '<', $in_name) or die "can't open ", $in_name, ": $! \n";

        print " $in_name ";

        # Slurp the whole page; $/ is only changed inside this expression.
        my $content_of_file = do { local $/; <$in> };
        close($in);

        # quick & nasty fix for the 'open book' link
        $content_of_file =~ s/&cl="/$fix_empty_pages"/g;
        $content_of_file =~ s/&cl='/$fix_empty_pages'/g;

        for my $link (0 .. $#linkz)
        {
            my $pattern = $link_patterns[$link];
            next unless defined $pattern;

            my $new_link = $linkz_index[$link] . ".html";
            $content_of_file =~ s/$pattern/$new_link/g;
        }

        $content_of_file =~ s/$image_dir_pattern/$1..\//g;

        # Write to a scratch file first so a half-written page never ends
        # up under its final name.
        open(my $out, '>', "temp.html") or die "can't open temp.html: $! \n";
        print {$out} $content_of_file;
        close($out) or die "can't write temp.html: $! \n";

        rename("temp.html", $out_filez[$file]) or die "cannot create ", $out_filez[$file], ": $! \n";
        print " --> $out_filez[$file]";
        print "..done\n";
    }
    print " *** Done, cannot find any more files to process ***\n";
}
99
# Split one link into the pieces used to recognise it inside a page.
# Returns ($base, $action, $doc); a piece is undef when the link has no
# argument that defines it:
#   $base   - text before the last "?e=", "&e=" or "?a=" argument
#   $action - text from the last "?a=" / "&a=" argument up to the next
#             "d=" argument (or to the end of the link when none follows)
#   $doc    - text from the last "?d=" / "&d=" argument to the end
sub _split_link_parts
{
    my ($url) = @_;
    return unless defined $url;

    my (@marks, @doc_at);
    while ($url =~ /([?&])([ead])=/g)
    {
        my $at = pos($url) - 3;    # start of the three-character marker
        if ($2 eq 'd')
        {
            push @doc_at, $at;
        }
        else
        {
            push @marks, [$at, $1, $2];
        }
    }

    my ($base, $action, $doc);
    $doc = substr($url, $doc_at[-1]) if @doc_at;

    for my $mark (@marks)
    {
        my ($at, $sep, $key) = @$mark;

        # "&a=" is the only marker that does not move the base.
        $base = substr($url, 0, $at) if $key eq 'e' || $sep eq '?';
        next unless $key eq 'a';

        # Only a "d=" argument that follows the action can end it.
        my ($next_doc) = grep { $_ > $at } @doc_at;
        $action = defined $next_doc
            ? substr($url, $at, $next_doc - $at)
            : substr($url, $at);
    }
    return ($base, $action, $doc);
}

# Sort an array of strings in place, longest first.  Strings of equal
# length end up in the reverse of their original order.
#
# Arguments:
#   *foo    - typeglob (eg. *linkz) or array reference naming the array
#   $switch - 0 = only sort; anything else additionally prints progress and,
#             for each sorted position, stores the page the link came from
#             in @linkz_index ("../<directory>/<original index>") and the
#             link pieces in @short_linkz1, @short_linkz2 and @short_linkz3.
#             A piece the link does not have leaves the existing entry alone.
#             NOTE(review): such an untouched entry may still describe a
#             different link from before the sort -- confirm that every
#             link carries an "a=" argument.
sub sort_array_by_length
{
    local *foo = shift;
    my $switch = shift;

    if ($switch != 0)
    {
        print "Processing linkz (chopping, slicing, dicing and sorting :-)...";
    }

    # Original indices ordered by length (descending), ties by index (descending).
    my @order = sort { length($foo[$b]) <=> length($foo[$a]) || $b <=> $a } 0 .. $#foo;

    if ($switch != 0)
    {
        for my $position (0 .. $#order)
        {
            my $counter = $order[$position];

            # Pages are stored $dir_entries to a directory, and each
            # directory is named after the first page it holds.
            my $numberdir = $counter - ($counter % $dir_entries);
            $linkz_index[$position] = "../" . $numberdir . "/" . $counter;

            my ($base, $action, $doc) = _split_link_parts($foo[$counter]);
            $short_linkz1[$position] = $base   if defined $base;
            $short_linkz2[$position] = $action if defined $action;
            $short_linkz3[$position] = $doc    if defined $doc;
        }
    }

    @foo = @foo[@order];

    if ($switch != 0)
    {
        print "done!\n";
    }
}
203
# Work out how many leading characters of a link to ignore.
#
# Returns the offset of the third "/" in $link, ie. the length of a leading
# "scheme://host" prefix.  When $link has only two slashes the whole string
# is the prefix and its length is returned; with fewer than two slashes (or
# an undefined $link) the result is 0.  The result never exceeds
# length($link), so it is always a valid offset for substr().
sub how_much_to_chop
{
    my ($link) = @_;
    return 0 unless defined $link;

    my $slash_counter = 0;
    my $chop_offset = 0;

    for my $search (0 .. length($link) - 1)
    {
        $slash_counter++ if substr($link, $search, 1) eq '/';

        # Nothing changes once the third slash has been seen.
        last if $slash_counter > 2;

        # Between the second and third slash the offset follows the scan.
        $chop_offset = $search + 1 if $slash_counter == 2;
    }
    return $chop_offset;
}
223
my $start_time = (times)[0];

#-----------------------------------------------------------------------------------------------
# Resume support: count how many consecutive pages already exist in the
# final directory so that a re-run continues where the last one stopped.
#-----------------------------------------------------------------------------------------------
my $linknumber = 0;
my $failed = 0;
my $check_file = "";
my $numberdir = 0;

unless ($outputdir eq $finaldir)
{
    until ($failed != 0)
    {
        # Every $dir_entries pages a new numbered directory begins.
        unless ($linknumber % $dir_entries)
        {
            my $bucket = $finaldir.$linknumber;
            unless ((-e $bucket) && (-d $bucket))
            {
                # A missing directory means no page in it exists yet.
                $failed++;
                mkdir($bucket, 0777) or die " ** Cannot create ",$bucket, "!: $!\n";
            }
            $numberdir = $linknumber;
        }

        $check_file = $finaldir.$numberdir."/".$linknumber.".html";
        if (($failed == 0) && (-e $check_file))
        {
            $linknumber++;
            next;
        }

        $failed++;
        # Step back one page, just in case the last one
        # was only half written ;^)
        $linknumber-- if $linknumber > 0;
        print " Will start processing at number $linknumber \n";
    }
}
my $i = 0;         # number of links read so far
my $that = "";     # current line of links.txt
my $offset = 0;    # length of the prefix every link has in common

# Read in the old links from the links text file.  Line number N of the
# file is the link that was saved as page N, so no line may be skipped.
open(my $links_fh, '<', "links.txt") or die " ** Cannot find/open links text file!: $!\n";
while (defined($that = <$links_fh>))
{
    if ($i == 0)
    {
        # The first link decides how much is chopped off all links.
        $offset = how_much_to_chop($that);
        print " Offset has been set to: ",$offset,"\n";
        print " This next bit will be ignored for all links in the links.txt file:\n";
        print " -->",substr($that,0,$offset),"<--\n";
    }

    # Chop off the first bit; a line shorter than the offset becomes empty
    # instead of running substr() outside the string.
    $that = length($that) > $offset ? substr($that, $offset) : "";

    # Wipe out the end-of-line characters (also the CR of a CRLF file).
    $that =~ s/[\r\n]+\z//;

    # This wipes the options.  The option text is matched literally, and an
    # empty option is skipped because an empty pattern would silently reuse
    # the last successful regex.
    $that =~ s/\Q$option\E// if length $option;

    $linkz[$i] = $that;

    $short_linkz1[$i] = "";
    $short_linkz2[$i] = "";
    $short_linkz3[$i] = "";

    # Later markers overwrite earlier ones, so the last one of each kind wins.
    while ($that =~ /([?&])([ea])=/g)
    {
        my $at = pos($that) - 3;    # start of the three-character marker
        $short_linkz1[$i] = substr($that, 0, $at) if $2 eq 'e' || $1 eq '?';
        $short_linkz2[$i] = substr($that, $at)    if $2 eq 'a';
    }
    $i++;

    # Make sure the directory for the next batch of pages exists.
    if ($i % $dir_entries == 0)
    {
        if (!((-e $finaldir.$i) && (-d $finaldir.$i)))
        {
            mkdir($finaldir.$i, 0777) or die " ** Cannot create ",$finaldir.$i, "!: $!\n";
        }
    }
}
close($links_fh);

print " - I found ",$i, " links in the links text file -\n";
333
# Longest links first, and fill in the per-link lookup tables.
sort_array_by_length(*linkz, 1);

# Page number N lives in the directory named after the first page of its
# batch of $dir_entries pages, both for the input and for the output tree.
$numberdir = 0;

for my $z (0 .. ($i - 1))
{
    $numberdir = $z unless $z % $dir_entries;

    my $page = $numberdir."/".$z.".html";
    $filez[$z] = $outputdir.$page;
    $out_filez[$z] = $finaldir.$page;
}
347
# ..and last but not least, load any image_dirs from image_dirs.txt
my $imd_that = "";

my @tmp_array = ();
open(my $image_dir_fh, '<', "image_dirs.txt") or die " ** HEY! Cannot find/open image_dirs.txt file! : $! **\n";
while (defined($imd_that = <$image_dir_fh>))
{
    $imd_that =~ s/[\r\n]+\z//;

    # An empty entry would match after every quote character in a page.
    next if $imd_that eq "";

    push(@tmp_array, $imd_that);
}
close($image_dir_fh);

# Build one alternation, longest directory first so that a directory is
# tried before any shorter directory that is a prefix of it.  Names are
# quoted so that "." and friends are matched literally.  With no
# directories at all the pattern is one that can never match.
if (@tmp_array)
{
    $remove_these = "(" . join("|", map { quotemeta($_) } sort { length($b) <=> length($a) } @tmp_array) . ")";
}
else
{
    $remove_these = "(?!)";
}
365
# Banner, then rewrite every page from the resume point onwards.
my $rule = "-" x 20;
print $rule, "\n", " Here we go...\n", $rule, "\n";

processfiles($linknumber);

# Report the CPU time used and what is left for the user to do.
my $end_time = (times)[0];
print "\n\n\n *----------------------------*\n",
      " | Whew! Task completed! :-D |\n",
      " *----------------------------*\n";
printf " Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time;
print "\n\n",
      " Now there's a few things left to do...load up ", $finaldir, "0/0.html in your webbrowser and\n",
      " make sure everything works.\n",
      " The grab_collection script will have generated 3 text files that can be removed, namely:\n",
      " - links.txt \n",
      " - images.txt \n",
      " - image_dirs.txt \n\n";

# The download directory is only disposable when it is not also the result.
unless ($outputdir eq $finaldir)
{
    print "And then finally you can also delete the ", $outputdir, " directory.\n\n";
}
388
389
390
391
392
393
Note: See TracBrowser for help on using the repository browser.