source: gsdl/trunk/bin/script/grab_collection.pl@ 18470

Last change on this file since 18470 was 2671, checked in by sjboddie, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
1#!/usr/bin/perl -w
2
3
4# Both this script and its associated process_html.pl were written by
5# Marcel ?, while a student at Waikato University. Unfortunately he
6# was very new to perl at the time so the code is neither as clean nor
7# as fast as it could be (I've cleaned up a few of the more serious
8# bottlenecks -- it could do with a lot more work though). The code
9# does work though, if a little slowly. It's not ready for primetime
10# however and is included in the Greenstone source tree mostly so that
11# I don't lose it. -- Stefan - 24 Jul 2001
12
13
14# This script will download an entire collection (and its associated pictures and files)
15# and store them in a temporary directory ($outputdir).
16# A second script (process_html.pl) can then be used to 'rebuild' the collection and link all the
17# downloaded pages and pictures together into a usable static collection.
18
19# This script will generate a number of text files required by the second script
20# and for possible recovery, namely:
21# - links.txt - contains all the http links that were downloaded
22# - images.txt - contains a list of all the images that were downloaded
23# - image_dirs.txt - contains a list of all the image-prefixes that need to be wiped from the html files
24# (otherwise you won't get to see any pictures)
25
26# Both this script and the html processing script have a recovery feature built in: they can continue from wherever
27# they left off, but this only works if $outputdir and $finaldir are set to different values.
28
29# This is where all the downloaded html files end up, eg. 'temp_html/'
30my $outputdir = 'temp_html/';
31# This is where all the processed html files (and the pictures) will end up, eg. 'my_static_collection/'
32my $finaldir = 'envl_collection/';
33
34# This is where we start our mirroring
35$address = 'http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1';
36
37
38# whatever is specified in $option will be attached to the end of each html link before it is downloaded.
39# eg. "&u=1" to disable various features in the greenstone collections that are not needed with a static
40# collection.
41# another example: "&l=nl" to set the entire collection to Dutch (NetherLands :)
42my $option = "&u=1";
43
44# Most OSes have a limit on the maximum number of files per directory (or folder).
45# A static collection can easily contain >3000 html files. Putting all those files
46# into one single directory is just asking for trouble. It's also very unwieldy ;^)
47# Hence...this value sets how many html files will be stored in one directory.
48# These directories themselves will be numbered,
49# so if $dir_entries = 500 then the directories will be "0/", "500/", "1000/", "1500/", "2000/", etc.
50my $dir_entries = 250;
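# (with $dir_entries = 250 the directories become "0/", "250/", "500/", ... and, eg.,
#  page number 612 ends up saved as temp_html/500/612.html)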
51
52# Occasionally a page occurs which contains no data (because &cl is not set). This option fixes that.
53my $fix_empty_pages = "&cl=CL1";
54
55# These are the files that wget will download.
56# more can be added if necessary.
57my @graphic_formats = ('.gif','.jpg','.bmp','.png','.pdf','.mov','.mpeg','.jpeg','.rm');
58
59# ---------------------------------------[ System specific options ]----------------------------------------------------
60
61# The lynx variable specifies the command line for the lynx web browser
62# -- This is what I use under dos/win32
63# my $lynx = 'e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg';
64
65# -- This is what I use under linux
66my $lynx = 'lynx';
67
68# and the same for the wget utility
69my $wget = 'wget';
70
71# NB: There is one other linux-specific command all the way at the end of this script, where I've used 'cp' to copy a file.
72
73# Another NB: When saving the dl-ed html files to disk, I've set lynx to dump the html-source to the standard output,
74# which I then simply redirect to a target file, BUT
75# this does not work under DOS/win32. Redirecting standard output in a script causes it to be displayed on
76# the screen instead. The easiest way I found to get around this was by doing the actual redirection in a simple
77# batch file (say grab.bat), which contains the following line:
78# @e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg -dump -source "%1" > %2
79#
80# Then replace line nr 332 -> 'system ("$kommand > $target");' with 'system("grab.bat $address $target");'
81# Not a very elegant solution, but it works :)
82
83#------------------------------------------------------------------------------------------------------------------------
84
85my @image_list;
86my $image_pointer = 0;
87
88my @linkz_list;
89my @short_linkz_list;
90my $linkz_pointer = 0;
91
92my @image_dirs_list;
93my $image_dirs_pointer = 0;
94
95my $numberdir = 0;
96
97my $start_time = (times)[0];
98
99# check if directories exist and create them if necessary..
100if ((-e $outputdir)&&(-d $outputdir))
101{
102 print " ** ",$outputdir," directory already exists..\n";
103}
104else
105{
106 print " ** Creating ",$outputdir," directory..\n";
107 mkdir($outputdir, 0777) or die " Cannot create output directory: $!\n";
108}
109
110if ((-e $finaldir)&&(-d $finaldir))
111{
112 print " ** ",$finaldir," directory already exists..\n";
113}
114else
115{
116 print " ** Creating ",$finaldir," directory..\n";
117 mkdir($finaldir, 0777) or die " Cannot create final directory: $!\n";
118}
119
120#-----------------------------------------------------------------------------------------------
121# No need to start from scratch every time; we can recover/continue from wherever we left off
122# simply by checking which html files have been created
123#-----------------------------------------------------------------------------------------------
124
125$linknumber = 0; # used to name/number the dl-ed html files
126
127my $failed = 0;
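# scan 0.html, 1.html, 2.html, ... in the numbered subdirectories until we find the first
# one that hasn't been saved yet -- that (minus one, see below) is where we resume downloading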
128while ($failed == 0)
129{
130 if ($linknumber % $dir_entries == 0)
131 {
132 if (!((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber)))
133 {
134 $failed++;
135 mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
136 }
137 $numberdir = $linknumber;
138 }
139
140 $check_file = $outputdir.$numberdir."/".$linknumber.".html";
141 if ((-e $check_file)&&($failed == 0))
142 {
143 $linknumber++;
144 }
145 else
146 {
147 $failed++;
148 # I'm subtracting 1 from the starting link,
149 # just in case it only loaded half the page ;^)
150 if($linknumber>0)
151 {
152 $linknumber--;
153 }
154 print " Will start downloading at number $linknumber \n";
155 }
156}
157
158# if we're starting from scratch, then we might as well nuke the links file
159#if ($linknumber == 0)
160#{
161# print " Starting from scratch - clobbering the old text files...\n";
162# if (-e 'links.txt')
163# {
164# print " Removing links.txt...\n";
165# unlink <links.txt> or print " ** Cannot delete links textfile: $!\n";
166# }
167# if (-e 'images.txt')
168# {
169# print " Removing images.txt...\n";
170# unlink <images.txt> or print " ** Cannot delete images textfile: $!\n";
171# }
172# if (-e 'image_dirs.txt')
173# {
174# print " Removing image_dirs.txt...\n";
175# unlink <image_dirs.txt> or print " ** Cannot delete image_dirs textfile: $!\n";
176# }
177#}
178
179# if we're NOT starting from scratch, then read in old links from links text file
180# and grab the old image-links as well...
181if ($linknumber != 0)
182{
183 # load the old links from links.txt; if it doesn't exist, then give up :(
184 my $this = "";
185 my $that = "";
186 open (CHECK, "links.txt") or die " ** Cannot find/open links.txt file!: $! **\n";
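 # the links are read back one character at a time, keeping the trailing "\n" on each one --
 # new links get a "\n" attached before being saved, so the comparisons in its_a_link() rely on it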
187 while(eof CHECK == 0)
188 {
189 while($this ne "\n")
190 {
191 read CHECK, $this ,1;
192 $that = $that.$this;
193 }
194 $linkz_list[$linkz_pointer] = $that;
195
196 for my $search(0 .. (length($that) - 3))
197 {
198 if((substr($that, $search, 3) eq '?a=')||(substr($that, $search, 3) eq '&a='))
199 {
200 $short_linkz_list[$linkz_pointer] = substr($that, $search);
201 last;
202 }
203 }
204 $linkz_pointer++;
205 $that = ""; $this = "";
206 }
207 close(CHECK);
208 print "- I found ",($#linkz_list + 1)," links in links.txt -\n";
209
210 #make sure that we start dl-ing the correct first page
211 $address = $linkz_list[$linknumber];
212
213 # load the old image links from images.txt (if it doesn't exist, no big deal ;)
214 my $im_this = "";
215 my $im_that = "";
216 open (IMAGES, "images.txt") || print " ** Cannot find/open images.txt file! : $! **\n";
217 while(eof IMAGES == 0)
218 {
219 while($im_this ne "\n")
220 {
221 read IMAGES, $im_this ,1;
222 $im_that = $im_that.$im_this;
223 }
224 $image_list[$image_pointer] = $im_that;
225 $image_pointer++;
226 $im_that = ""; $im_this = "";
227 }
228 close(IMAGES);
229 print "- I found ",($#image_list + 1)," picture-links in images.txt -\n";
230
231 #..and last but not least, load any image_dirs from image_dirs.txt
232 # again, if it's not there, no big deal :)
233 my $imd_this = "";
234 my $imd_that = "";
235 open (IMAGE_DIR, "image_dirs.txt") || print " ** Cannot find/open image_dirs.txt file!: $! **\n";
236 while(eof IMAGE_DIR == 0)
237 {
238 while($imd_this ne "\n")
239 {
240 read IMAGE_DIR, $imd_this ,1;
241 $imd_that = $imd_that.$imd_this;
242 }
243 $image_dirs_list[$image_dirs_pointer] = $imd_that;
244 $image_dirs_pointer++;
245 $imd_that = ""; $imd_this = "";
246 }
247 close(IMAGE_DIR);
248 print "- I found ",($#image_dirs_list + 1)," picture directories in image_dirs.txt -\n";
249}
250
251# Just keep going till we can find no more new links
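# (@linkz_list keeps growing as its_a_link() spots new links in each page we fetch;
#  we stop once $linknumber has caught up with the end of the list)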
252while(($#linkz_list < 0)||($#linkz_list+1 > $linknumber))
253{
254
255 # This line specifies the command line for the lynx web browser
256 my $kommand = $lynx.' -dump -image_links "'.$address.'"';
257
258 # dump page into text-array and find starting-point of the references/links
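 # a lynx -dump ends with a "References" section that lists every link as a numbered entry,
 # eg. "   1. http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1",
 # so we look for the "References" line and skip two lines to reach the list itself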
259 chomp(@data=`$kommand`);
260 for my $i(0 .. $#data)
261 {
262 if ($data[$i] eq "References") {
263 $here = $i;}
264 }
265 $here = $here+2;
266
267 # process references/links
268 for $i($here .. $#data){
269
270 $its_an_image = 0;
271
272 # chop off the ref's leading number & spaces
273 # (eg. the '1. ' in '1. http://www.cs.waikato.ac.nz')
274
275# $temp = substr($data[$i],3);
276# @temp = split(/ /, $temp, 2);
277
278 # check if the link ends in .gif .jpg .png .bmp .pdf .mov .mpeg etc etc
279
280# for my $g(0 .. $#graphic_formats)
281# {
282# if(substr($temp[1],(length($graphic_formats[$g]) * -1)) eq $graphic_formats[$g])
283# {
284# $its_an_image = 1;
285# }
286# }
287
288 $data[$i] =~ s/^\s*\d+\.\s+//;
289 if ($data[$i] =~ /\.(gif|jpe?g|png|bmp|pdf|mov|mpe?g|rm)$/i) {
290 $its_an_image = 1;
291 }
292
293 # ignore mailto urls
294 if ($data[$i] !~ /mailto:/i) {
295
296 #----------- the link is NOT an image ----------------
297 if ($its_an_image == 0)
298 {
299 &its_a_link($data[$i], $outputdir);
300 }
301
302 #----------- the link IS an image ----------------
303 if ($its_an_image != 0)
304 {
305 &its_an_image($data[$i], $finaldir);
306 }
307 }
308 }
309
310 # save the web page to disk (in the appropriate numbered directory)
311 $kommand = $lynx.' -dump -source "'.$address.'"';
312
313 if ($linknumber % $dir_entries == 0)
314 {
315 if ((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber))
316 {
317 print " ** ",$outputdir.$linknumber, " - Directory already exists.\n";
318 }
319 else
320 {
321 mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
322 mkdir($finaldir.$linknumber, 0777) or print " ** Cannot create ",$finaldir.$linknumber, "!: $!\n";
323 }
324 $numberdir = $linknumber;
325 }
326 my $target = $outputdir.$numberdir."/".$linknumber.".html";
327
328 #---------------------------------------------------------------------------------------------------------------
329 # NOTE: This next command will NOT work under win32/dos, as redirecting standard output in a script causes it to
330 # be dumped straight to the screen as opposed to into the target file.
331 #---------------------------------------------------------------------------------------------------------------
332 system ("$kommand > $target");
333 #---------------------------------------------------------------------------------------------------------------
334
335 print " Saved $target\n";
336
337 $linknumber++;
338
339 $address = $linkz_list[$linknumber];
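 # move on to the next link found so far; once $linknumber passes the end of @linkz_list
 # the while condition above fails and the downloading is finished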
340}
341
342my $end_time = (times)[0];
343
344print "\n\n\n *----------------------------*\n";
345print " | Whew! Task completed! :-D |\n";
346print " *----------------------------*\n";
347printf" Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time;
348print "\n\n";
349print " Now execute the process_html.pl script to link the downloaded collection together.\n";
350print " Please do make sure that it is executed with the same options as this script ;-)\n";
351
352sub its_a_link
353{
354 local($found) = @_;
355# local($ok = 0, $kommand);
356 local($kommand);
357 local $short_link = "";
358
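 # skip in-page anchor links (anything ending in #something) straight away -- see the note further down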
359 return if ($found =~ /\#.*$/);
360
361 # attach the custom options
362 $found .= $option;
363
364 #little bit of trickery here - check if there is a &d= option present in the link
365 #if there is, then wipe the &cl= option!
366 #This should cut down multiple copies by 75%!!
367
368 #but, if there is no &d option, and the &cl option is not set, then we have to set the &cl option to something
369 #otherwise we get pages which contain no data :\
370
371 if ($found =~ /[&\?]a=d/) {
372 if ($found =~ /[&\?]d=/) {
373 $found =~ s/[&\?]cl=[^&]*//;
374 } elsif ($found !~ /[&\?]cl=/) {
375 $found .= $fix_empty_pages;
376 }
377 }
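 # (eg. a hypothetical "...library?a=d&cl=CL1.2&d=HASH0143&u=1" becomes
 #  "...library?a=d&d=HASH0143&u=1" -- the &cl= part is wiped because &d= is present)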
378
379 # we also want to sort out any xxx.pr OIDs that we come across
380 $found =~ s/([&\?](cl|d)=.*?)\.\d+\.pr/$1/g;
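 # (eg. a hypothetical "&d=HASH0143.2.pr" becomes "&d=HASH0143")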
381
382 # attach the EOL character.
383 $found = $found."\n";
384
385
386 # the hard way !!!
387# for my $search(0 .. (length($found) - 3))
388# {
389# if((substr($found, $search, 3) eq '?d=')||(substr($found, $search, 3) eq '&d='))
390# {
391# for my $second_search(0 .. (length($found) - 4))
392# {
393# if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
394# {
395# for my $third_search(($second_search + 3) .. (length($found) - 1))
396# {
397# if((substr($found, $third_search, 1)) eq '&')
398# {
399# substr($found, $second_search, $third_search - $second_search) = "";
400# last;
401# }
402# }
403# last;
404# }
405# }
406# last;
407# }
408# else
409# {
410# if( $search == (length($found) - 3))
411# {
412# for my $second_search(0 .. (length($found) - 4))
413# {
414# if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
415# {
416# for my $third_search(($second_search + 3) .. (length($found) - 1))
417# {
418# if((substr($found, $third_search, 1)) eq '&')
419# {
420# if (substr($found, $second_search, $third_search - $second_search) eq '&cl=')
421# {
422# substr($found, $second_search, $third_search - $second_search) = $fix_empty_pages;
423# }
424# last;
425# }
426# }
427# last;
428# }
429# }
430# }
431# }
432# }
433
434 # grab the last part of the link (ignoring the start and the &e option)
435# for my $search(0 .. (length($found) - 3))
436# {
437# if((substr($found, $search, 3) eq '?a=')||(substr($found, $search, 3) eq '&a='))
438# {
439# $short_link = substr($found, $search);
440# last;
441# }
442# }
443
444 ($short_link) = $found =~ /\?(.*)$/;
445 $short_link =~ s/(^|&)e=[^&]*/$1/;
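 # (eg. a hypothetical "...library?e=SESSION&a=p&p=about&c=demo&u=1" gives
 #  $short_link = "&a=p&p=about&c=demo&u=1" -- the &e= argument is dropped when comparing,
 #  so the same page fetched with a different &e= isn't treated as a new link)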
446
447
448 # this filters out multiple copies of, for example, the help page, which has #something at the end of its links
449 # this is now done first, by the regular expression at the top of this sub -- Stefan
450
451# for my $search(0 .. length($found))
452# {
453# if ((substr($found, $search, 1)) eq '#')
454# {
455# $ok++;
456# last;
457# }
458# }
459
460
461
462 # compare the found link to the links we've stored in the arrays (compares both full link and partial link)
463 for my $search(0 .. $#linkz_list)
464 {
465 return if ($found eq $linkz_list[$search]);
466 return if ($short_link eq $short_linkz_list[$search]);
467 }
468
469 # if found link is not in links array, add it
470 open (DUMP, ">>links.txt") or die " ** Can't open links.txt!: $!\n";
471 print DUMP $found;
472 close(DUMP);
473
474 $linkz_list[$linkz_pointer] = $found;
475 $short_linkz_list[$linkz_pointer] = $short_link;
476 $linkz_pointer++;
477}
478
479sub do_image_dirs
480{
481 local($found) = @_;
482 my $count = 0;
483 my @br_index;
484 my $image_dir = "";
485 my $new_dir = 0;
486
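 # walk the image URL backwards, remembering how far from the end each '/' sits;
 # $image_dir ends up as the last directory component of the URL (eg. "/images")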
487 for my $search(1 .. (length($found) - 1 ))
488 {
489 $bracket = substr($found, ($search * - 1), 1);
490 if ($bracket eq '/')
491 {
492 $count++;
493 $br_index[$count] = $search;
494 }
495 if($count == 2)
496 {
497 $image_dir = substr($found, ($br_index[2] * -1) , ($br_index[2] - $br_index[1]));
498 }
499 }
500
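 # $dirs_to_wipe is the path between the hostname and $image_dir (eg. "/gsdl/");
 # it is recorded in image_dirs.txt so that process_html.pl can strip it out of the html later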
501 my $dirs_to_wipe = substr($found, $br_index[$#br_index - 2] * - 1, $br_index[$#br_index - 2] - $br_index[2] + 1)."\n";
502
503 for my $counter(0 .. $#image_dirs_list)
504 {
505 if($dirs_to_wipe eq $image_dirs_list[$counter])
506 {
507 $new_dir++;
508 }
509 }
510
511 if ($new_dir == 0)
512 {
513 open (IMAGE_DIRS, ">>image_dirs.txt") or die " ** Can't open image_dirs.txt!: $!\n";
514 print IMAGE_DIRS $dirs_to_wipe;
515 close(IMAGE_DIRS);
516 $image_dirs_list[$image_dirs_pointer] = $dirs_to_wipe;
517 $image_dirs_pointer++;
518 }
519
520 print " ",substr($finaldir, 0 ,length($finaldir) - 1).$image_dir.substr($found, ($br_index[1] * - 1), length($found) - (length($found) - $br_index[1])),"\n";
521
522 return $image_dir;
523}
524
525sub its_an_image
526{
527 local($found, $outpdir) = @_;
528 local($kommand);
529 my $new = 0;
530
531 my $temp_found = $found . "\n";
532
533 # check if the image is in the list
534 for my $counter(0 .. $#image_list)
535 {
536 if($temp_found eq $image_list[$counter])
537 {
538 $new++;
539 }
540 }
541
542 # only download the image if it's not in the list..
543 if($new == 0)
544 {
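 # (calling &do_image_dirs with no argument list makes perl pass our own @_ along, so it sees $found)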
545 my $image_dir = &do_image_dirs;
546 my $temp_outputdir = $outpdir;
547 if (substr($temp_outputdir, -1, 1) eq "/")
548 {
549 substr($temp_outputdir, -1, 1) = "";
550 }
551
552 # wget is set to 'q - quiet' and 'nc - don't clobber existing files'
553 $kommand = $wget.' -qnc --directory-prefix='.$temp_outputdir.$image_dir.' "'.$found.'"';
554 system ("$kommand");
555
556 open (IMAGES, ">>images.txt") or die " ** Can't open images.txt!: $!\n";
557 print IMAGES $temp_found;
558 close(IMAGES);
559
560 $image_list[$image_pointer] = $temp_found;
561 $image_pointer++;
562
563 # grab corresponding ON pictures for navigation bar if we've just dl-ed the OFF picture
564 if(substr($found , -6) eq "of.gif")
565 {
566 substr($found, -6, 6) = "on.gif";
567 &its_an_image($found, $outpdir);
568 }
569 }
570}