Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: main/trunk/greenstone2/bin/script/process_html.pl@ 24375

Last change on this file since 24375 was 2668, checked in by sjboddie, 23 years ago
Added Marcel's static collection building scripts to the source tree. These aren't really expected to be any use to anyone yet, they're included mostly so that I don't lose them.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 10.3 KB

Line
1	#!/usr/bin/perl -w
2
3
4	# Both this script and its associated process_html.pl were written by
5	# Marcel ?, while a student at Waikato University. Unfortunately he
6	# was very new to perl at the time so the code is neither as clean nor
7	# as fast as it could be (I've cleaned up a few of the more serious
8	# bottlenecks -- it could do with a lot more work though). The code
9	# does work though, if a little slowly. It's not ready for primetime
10	# however and is included in the Greenstone source tree mostly so that
11	# I don't lose it. -- Stefan - 24 Jul 2001
12
13
14	# This script rebuilds the static collection by linking all the downloaded html files
15	# back together.
16	# It searches through html files and replaces links it recognizes from the links.txt file
17	# with the appropriate html file name (eg 1.html, 2.html etc)
18	# This script also updates the links for the pictures.
19
# This is where all the dl-ed html files are located, eg. 'temp_html/'
my $outputdir = 'temp_html/';

# This is where all the processed files end up (don't want to overwrite originals ;) eg. 'my_static_collection/'
my $finaldir = 'envl_collection/';

# If any options were used (such as &u=1) when the html files were dl-ed then please specify them here.
my $option = "&u=1";

# Please ensure these two options match the settings used when downloading the collection :)
my $dir_entries = 250;
my $fix_empty_pages = '&cl=CL1';

#-------------------------------------------------------------------------------------------------

# Global arrays used to store links, links-index & html-filenames.
# They are deliberately package variables (our), not lexicals:
# sort_array_by_length() receives @linkz through a typeglob (*linkz),
# and a typeglob can only reach package variables.
our @filez;          # input page paths, indexed by page number
our @out_filez;      # output page paths, indexed by page number
our @linkz;          # links read from links.txt
our @short_linkz1;   # per-link text before the first ?e= / &e= / ?a= marker
our @short_linkz2;   # per-link text from the a= marker onwards
our @short_linkz3;   # per-link text from the d= marker onwards ("" when none)
our @linkz_index;    # per-link replacement target, without the ".html" suffix
#my %remove_these;
# Regex source matching the image directory prefixes; filled in by the
# main script from image_dirs.txt.
my $remove_these = "";
45
# Rewrites the downloaded pages, starting at page number $start_here.
#
# For each page listed in @filez (from $start_here upwards) the page is
# read, its links are rewritten and the result is written to the matching
# entry of @out_filez.  Processing stops at the first page that is missing
# or empty.
#
# Reads the file-level variables @filez, @out_filez, @linkz, @linkz_index,
# @short_linkz1..3, $fix_empty_pages and $remove_these.
# Dies if a page cannot be opened, written or moved into place.
sub processfiles
{
    my ($start_here) = @_;

    for my $file ($start_here .. $#filez)
    {
        # A missing or zero-length page ends the run.
        last unless ((-e $filez[$file]) && (-s $filez[$file]));

        open(my $in, '<', $filez[$file]) or die "can't open ", $filez[$file],": $! \n";

        print " $filez[$file] ";

        # Slurp the whole page; local keeps the global $/ untouched for
        # everybody else, even if we die half way through.
        my $content_of_file = do { local $/; <$in> };
        close($in);

        #quick & nasty fix for the 'open book' link
        my $quick_fix1 = "&cl=\"";
        my $quick_fix2 = "&cl=\'";

        $content_of_file =~ s/$quick_fix1/$fix_empty_pages\"/g;
        $content_of_file =~ s/$quick_fix2/$fix_empty_pages\'/g;

        # Replace every known link with the name of its local page.
        # NOTE(review): the link fragments are interpolated as regex source,
        # not as literal text, so '?' and '.' inside them act as regex
        # operators.  The patterns are left exactly as they were because the
        # matching depends on that; confirm before "fixing" with \Q..\E.
        for my $link (0 .. $#linkz)
        {
            my $new_link = $linkz_index[$link].".html";

            if ($short_linkz3[$link] ne "")
            {
                $content_of_file =~ s/$short_linkz1[$link].?$short_linkz2[$link].?${short_linkz3[$link]}[^\"\'\s\>]*/$new_link/g;
            }
            else
            {
                # NOTE(review): unlike the branch above, the trailing
                # character class has no '*' here -- possibly unintended.
                $content_of_file =~ s/$short_linkz1[$link].${short_linkz2[$link]}[^\"\'\s\>]/$new_link/g;
            }
        }

        # Point image references at the parent directory.
        $content_of_file =~ s/(["'])$remove_these/$1..\//g;

        # Write to a scratch file first and only then move it into place.
        # Buffered write errors (eg. a full disk) only show up on print or
        # close, so both are checked before the rename.
        open(my $out, '>', "temp.html") or die "can't open temp.html: $! \n";
        print {$out} $content_of_file or die "can't write temp.html: $! \n";
        close($out) or die "can't write temp.html: $! \n";
        rename("temp.html", $out_filez[$file]) or die "cannot create", $out_filez[$file],": $! \n";
        print " --> $out_filez[$file]";
        print "..done\n";
    }
    print " * Done, cannot find any more files to process *\n";
}
99
# Splits one link into the pieces processfiles() matches on.
#
# Returns ($prefix, $action, $document):
#   $prefix   - text before the last ?e= / &e= / ?a= marker
#   $action   - text from the last ?a= / &a= marker, cut off at the next
#               ?d= / &d= marker when there is one
#   $document - text from the last ?d= / &d= marker to the end
# A piece is undef when the link has no matching marker.
sub _split_link
{
    my ($link) = @_;
    my ($prefix, $action, $document);
    return ($prefix, $action, $document) unless defined $link;

    # Positions of all d= markers, left to right.
    my @d_at;
    while ($link =~ /[?&]d=/g)
    {
        push(@d_at, pos($link) - 3);
    }
    $document = substr($link, $d_at[-1]) if @d_at;

    my $next_d = 0;
    while ($link =~ /([?&])([ea])=/g)
    {
        my ($separator, $key) = ($1, $2);
        my $at = pos($link) - 3;

        if ($key eq 'e')
        {
            $prefix = substr($link, 0, $at);
            next;
        }

        # Only a leading "?a=" marks the end of the prefix; "&a=" does not.
        $prefix = substr($link, 0, $at) if $separator eq '?';

        # The action part ends at the first d= marker after it.  When every
        # d= marker lies before it, the last one is used, as before.
        $next_d++ while ($next_d < @d_at && $d_at[$next_d] < $at);
        my $d_offset = 0;
        if ($next_d < @d_at) { $d_offset = $d_at[$next_d]; }
        elsif (@d_at)        { $d_offset = $d_at[-1]; }

        if ($d_offset > 0)
        {
            $action = substr($link, $at, $d_offset - $at);
        }
        else
        {
            $action = substr($link, $at);
        }
    }

    return ($prefix, $action, $document);
}

# Sorts an array in place, longest string first.  Strings of equal length
# end up in reverse order of their original position.
#
# Arguments:
#   *foo    - typeglob of the array to sort, eg. *linkz
#   $switch - 0 = just sort; anything else = also print progress and, for
#             every element, fill the slot it is sorted into in
#             @linkz_index ("../<dir>/<original index>") and in
#             @short_linkz1, @short_linkz2 and @short_linkz3.
#
# Slots of @short_linkz1..3 are only written when the link contains the
# matching marker; otherwise the old value of the slot is kept.
# Reads the file-level $dir_entries when $switch is on.
sub sort_array_by_length
{
    my ($glob, $switch) = @_;
    local *foo = $glob;     # @foo is now the caller's array

    my $total = $#foo;

    if ($switch != 0)
    {
        print "Processing linkz (chopping, slicing, dicing and sorting :-)...";
    }

    # Original indices ordered shortest first, ties in original order.
    my @lengths = map { defined($_) ? length($_) : 0 } @foo;
    my @shortest_first = sort { $lengths[$a] <=> $lengths[$b] || $a <=> $b } (0 .. $total);

    # Fill the result from the back so the longest string lands at index 0.
    my @sorted;
    my $slot = $total;
    for my $counter (@shortest_first)
    {
        $sorted[$slot] = $foo[$counter];

        if ($switch != 0)
        {
            # Pages are stored in directories named after the first page
            # number they hold, $dir_entries pages per directory.
            my $numberdir = $counter - ($counter % $dir_entries);
            $linkz_index[$slot] = "../".$numberdir."/".$counter;

            my ($prefix, $action, $document) = _split_link($foo[$counter]);
            $short_linkz1[$slot] = $prefix   if defined $prefix;
            $short_linkz2[$slot] = $action   if defined $action;
            $short_linkz3[$slot] = $document if defined $document;
        }
        $slot--;
    }

    @foo = @sorted;

    if ($switch != 0)
    {
        print "done!\n";
    }
}
203
# Works out how many leading characters of a link to throw away.
#
# Returns the position of the third '/' in $link, so that for
# "http://host/cgi-bin/library" everything before "/cgi-bin/library" is
# chopped.  Returns 0 when the link has fewer than two slashes.  When there
# are two slashes but no third, the length of the link is returned (the
# whole link is chopped); this never points past the end of the string.
sub how_much_to_chop
{
    my ($link) = @_;

    # Locate the second slash.
    my $slash = -1;
    for my $nth (1 .. 2)
    {
        $slash = index($link, '/', $slash + 1);
        return 0 if $slash < 0;
    }

    my $third = index($link, '/', $slash + 1);
    return $third if $third >= 0;

    return length($link);
}
223
my $start_time = (times)[0];

#-----------------------------------------------------------------------------------------------
# No need to start from scratch everytime, we can recover/continue from wherever we left off
# simply by checking which html files have been created
#-----------------------------------------------------------------------------------------------
my $linknumber = 0;
my $failed = 0;
my $check_file = "";
my $numberdir = 0;

if ($outputdir ne $finaldir)
{
    # Walk the finished pages in order until one is missing.
    while ($failed == 0)
    {
        if ($linknumber % $dir_entries == 0)
        {
            # Start of a new page directory; if it does not exist yet
            # nothing in it has been processed, so create it and stop.
            if (!((-e $finaldir.$linknumber) && (-d $finaldir.$linknumber)))
            {
                $failed++;
                mkdir($finaldir.$linknumber, 0777) or die " ** Cannot create ",$finaldir.$linknumber, "!: $!\n";
            }
            $numberdir = $linknumber;
        }

        $check_file = $finaldir.$numberdir."/".$linknumber.".html";
        if ((-e $check_file) && ($failed == 0))
        {
            $linknumber++;
        }
        else
        {
            $failed++;
            # I'm subtracting 1 from the starting link,
            # just in case it only loaded half the page ;^)
            if ($linknumber > 0)
            {
                $linknumber--;
            }
            print " Will start processing at number $linknumber \n";
        }
    }
}

my $i = 0;
my $that = "";
my $offset = 0;

#read in old links from links text file
open(my $links_fh, '<', "links.txt") or die " ** Cannot find/open links text file!: $!\n";
while (defined($that = <$links_fh>))
{
    if ($i == 0)
    {
        #chop off the first bit
        $offset = how_much_to_chop($that);
        print " Offset has been set to: ",$offset,"\n";
        print " This next bit will be ignored for all links in the links.txt file:\n";
        print " -->",substr($that,0,$offset),"<--\n";
    }

    $that = substr($that, $offset);

    #Wipe-out the EOL character
    chomp $that;

    #this wipes the options
    # $option is literal text, so it is escaped.  The length check matters:
    # an empty pattern would make perl re-use the last successful regex.
    $that =~ s/\Q$option\E// if length($option);

    $linkz[$i] = $that;

    # First guess at the link pieces, stored under the unsorted index.
    # sort_array_by_length() below overwrites these slots for every link
    # that contains the matching markers.
    $short_linkz1[$i] = "";
    $short_linkz2[$i] = "";
    $short_linkz3[$i] = "";

    for my $search (0 .. (length($that) - 3))
    {
        my $marker = substr($that, $search, 3);

        if (($marker eq '?e=') || ($marker eq '&e='))
        {
            $short_linkz1[$i] = substr($that, 0, $search);
        }

        if ($marker eq '?a=')
        {
            $short_linkz1[$i] = substr($that, 0, $search);
            $short_linkz2[$i] = substr($that, $search);
        }
        if ($marker eq '&a=')
        {
            $short_linkz2[$i] = substr($that, $search);
        }
    }
    $i++;

    # Make sure the output directory for the next batch of pages exists.
    if ($i % $dir_entries == 0)
    {
        if (!((-e $finaldir.$i) && (-d $finaldir.$i)))
        {
            mkdir($finaldir.$i, 0777) or die " ** Cannot create ",$finaldir.$i, "!: $!\n";
        }
    }
}
close($links_fh);

print " - I found ",$i, " links in the links text file -\n";

sort_array_by_length(*linkz, 1);

$numberdir = 0;

# Work out the input and output file name of every page.
for my $z (0 .. ($i - 1))
{
    if ($z % $dir_entries == 0)
    {
        $numberdir = $z;
    }
    $filez[$z] = $outputdir.$numberdir."/".$z.".html";
    $out_filez[$z] = $finaldir.$numberdir."/".$z.".html";
}

# ..and last but not least, load any image_dirs from image_dirs.txt
my $imd_that = "";

my @image_dirs = ();
open(my $image_dirs_fh, '<', "image_dirs.txt") or die " HEY! Cannot find/open image_dirs.txt file! : $! \n";
while (defined($imd_that = <$image_dirs_fh>))
{
    chomp $imd_that;
    # A blank line would become an empty alternative that matches after
    # every single quote character, so skip it.
    push(@image_dirs, $imd_that) if length($imd_that);
}
close($image_dirs_fh);

if (@image_dirs)
{
    # Longest directory first so that the most specific prefix wins; the
    # names are literal text, hence quotemeta.
    $remove_these = "(" . join("|", map { quotemeta($_) } sort { length $b <=> length $a } @image_dirs) . ")";
}
else
{
    # No image directories at all: use a pattern that can never match.
    $remove_these = "(?!)";
}

print "-" x 20, "\n";
print " Here we go...\n";
print "-" x 20, "\n";

processfiles($linknumber);

my $end_time = (times)[0];
print "\n\n\n ----------------------------\n";
print " | Whew! Task completed! :-D |\n";
print " ----------------------------\n";
printf" Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time;
print "\n\n";
print " Now there's a few things left to do...load up ",$finaldir, "0/0.html in your webbrowser and\n";
print " make sure everything works.\n";
print " The grab_collection script will have generated 3 text files that can be removed, namely:\n";
print " - links.txt \n";
print " - images.txt \n";
print " - image_dirs.txt \n\n";
if ($outputdir ne $finaldir)
{
    print "And then finally you can also delete the ",$outputdir," directory.\n\n";
}
388
389
390
391
392
393

Note: See TracBrowser for help on using the repository browser.

Download in other formats: