#!/usr/bin/perl -w

use util;

# Both this script and its associated process_html.pl were written by
# Marcel ?, while a student at Waikato University. Unfortunately he
# was very new to perl at the time, so the code is neither as clean nor
# as fast as it could be (I've cleaned up a few of the more serious
# bottlenecks -- it could do with a lot more work though). The code
# does work though, if a little slowly. It's not ready for primetime,
# however, and is included in the Greenstone source tree mostly so that
# I don't lose it. -- Stefan - 24 Jul 2001


# This script will download an entire collection (and its associated pictures and files)
# and store them in a temporary directory ($outputdir).
# A second script (process_html.pl) can then be used to 'rebuild' the collection and link all the
# downloaded pages and pictures together into a usable static collection.

# This script will generate a number of text files required by the second script
# and for possible recovery, namely:
# - links.txt      - contains all the http links that were downloaded
# - images.txt     - contains a list of all the images that were downloaded
# - image_dirs.txt - contains a list of all the image-prefixes that need to be wiped from the html files
#                    (otherwise you won't get to see any pictures)

# Both this script and the html processing script have a recovery feature built in: they can continue from wherever
# they left off, but this only works if $outputdir and $finaldir are set to different values.
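# For reference, each line of links.txt is simply a full URL (with the
# extra options already appended), newline-terminated, eg. the start
# address below would be stored as:
# http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1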

# This is where all the downloaded html files end up, eg. 'temp_html/'
my $outputdir = 'temp_html/';
# This is where all the processed html files (and the pictures) will end up, eg. 'my_static_collection/'
my $finaldir = 'envl_collection/';

# This is where we start our mirroring
$address = 'http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1';


# Whatever is specified in $option will be attached to the end of each html link before it is downloaded,
# eg. "&u=1" to disable various features in the greenstone collections that are not needed with a static
# collection.
# Another example: "&l=nl" to set the entire collection to Dutch (NetherLands :)
my $option = "&u=1";

# Most OSes have a limit on the maximum number of files per directory (or folder).
# A static collection can easily contain >3000 html files. Putting all those files
# into one single directory is just asking for trouble. It's also very unwieldy ;^)
# Hence... this value sets how many html files will be stored in one directory.
# These directories themselves will be numbered,
# so if $dir_entries = 500 then the directories will be "0/", "500/", "1000/", "1500/", "2000/", etc.
my $dir_entries = 250;
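# (In other words, html file number N lands in the directory numbered
#  N - (N % $dir_entries), so with the default of 250 above, 614.html
#  is stored under "500/".)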

# Occasionally a page occurs which contains no data (because &cl is not set). This option fixes that.
my $fix_empty_pages = "&cl=CL1";

# These are the file types that wget will download.
# More can be added if necessary.
my @graphic_formats = ('.gif','.jpg','.bmp','.png','.pdf','.mov','.mpeg','.jpeg','.rm');

# ---------------------------------------[ System specific options ]----------------------------------------------------

# The lynx variable specifies the command line for the lynx web browser
# -- This is what I use under dos/win32
# my $lynx = 'e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg';

# -- This is what I use under linux
my $lynx = 'lynx';

# and the same for the wget utility
my $wget = 'wget';

# NB: There is one other linux-specific command all the way at the end of this script, where I've used 'cp' to copy a file.

# Another NB: When saving the dl-ed html files to disk, I've set lynx to dump the html source to standard output,
# which I then simply redirect to a target file, BUT
# this does not work under DOS/win32. Redirecting standard output in a script causes it to be displayed on
# the screen instead. The easiest way I found to get around this was to do the actual redirection in a simple
# batch file (say grab.bat), which contains the following line:
# @e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg -dump -source "%1" > %2
#
# Then replace the 'system ("$kommand > $target");' line in the main loop below with 'system("grab.bat $address $target");'
# Not a very elegant solution, but it works :)
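#
# A more portable alternative (just a sketch, untested) would be to skip
# the shell redirection entirely and read lynx's output through a pipe:
#
#   open (LYNX, "$kommand |") or die " ** Can't run lynx: $!\n";
#   open (TARGET, ">$target") or die " ** Can't write $target: $!\n";
#   print TARGET $_ while (<LYNX>);
#   close(LYNX); close(TARGET);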

#------------------------------------------------------------------------------------------------------------------------

my @image_list;
my $image_pointer = 0;

my @linkz_list;
my @short_linkz_list;
my $linkz_pointer = 0;

my @image_dirs_list;
my $image_dirs_pointer = 0;

my $numberdir = 0;

my $start_time = (times)[0];

# check if directories exist and create them if necessary..
if ((-e $outputdir)&&(-d $outputdir))
{
    print " ** ",$outputdir," directory already exists..\n";
}
else
{
    print " ** Creating ",$outputdir," directory..\n";
    mkdir($outputdir, 0777) or die " Cannot create output directory: $!\n";
}

if ((-e $finaldir)&&(-d $finaldir))
{
    print " ** ",$finaldir," directory already exists..\n";
}
else
{
    print " ** Creating ",$finaldir," directory..\n";
    mkdir($finaldir, 0777) or die " Cannot create final directory: $!\n";
}

#-----------------------------------------------------------------------------------------------
# No need to start from scratch every time; we can recover/continue from wherever we left off
# simply by checking which html files have already been created
#-----------------------------------------------------------------------------------------------

$linknumber = 0;    # used to name/number the dl-ed html files

my $failed = 0;
while ($failed == 0)
{
    if ($linknumber % $dir_entries == 0)
    {
        if (!((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber)))
        {
            $failed++;
            mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
        }
        $numberdir = $linknumber;
    }

    $check_file = $outputdir.$numberdir."/".$linknumber.".html";
    if ((-e $check_file)&&($failed == 0))
    {
        $linknumber++;
    }
    else
    {
        $failed++;
        # I'm subtracting 1 from the starting link,
        # just in case it only loaded half the page ;^)
        if ($linknumber > 0)
        {
            $linknumber--;
        }
        print " Will start downloading at number $linknumber \n";
    }
}

# if we're starting from scratch, then we might as well nuke the links file
#if ($linknumber == 0)
#{
#    print " Starting from scratch - clobbering the old text files...\n";
#    if (-e 'links.txt')
#    {
#        print " Removing links.txt...\n";
#        unlink <links.txt> or print " ** Cannot delete links textfile: $!\n";
#    }
#    if (-e 'images.txt')
#    {
#        print " Removing images.txt...\n";
#        unlink <images.txt> or print " ** Cannot delete images textfile: $!\n";
#    }
#    if (-e 'image_dirs.txt')
#    {
#        print " Removing image_dirs.txt...\n";
#        unlink <image_dirs.txt> or print " ** Cannot delete image_dirs textfile: $!\n";
#    }
#}

# if we're NOT starting from scratch, then read the old links in from the links text file
# and grab the old image-links as well...
if ($linknumber != 0)
{
    # load the old links from links.txt; if it doesn't exist, then give up :(
    my $this = "";
    my $that = "";
    open (CHECK, "links.txt") or die " ** Cannot find/open links.txt file!: $! **\n";
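    # (The loop below rebuilds each newline-terminated entry one byte at a
    #  time. A simpler way to slurp the raw lines -- just a sketch -- would
    #  be @linkz_list = <CHECK>; though the loop below also extracts the
    #  short '?a='/'&a=' form of each link as it goes.)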
    while (eof CHECK == 0)
    {
        while ($this ne "\n")
        {
            read CHECK, $this, 1;
            $that = $that.$this;
        }
        $linkz_list[$linkz_pointer] = $that;

        for my $search(0 .. (length($that) - 3))
        {
            if ((substr($that, $search, 3) eq '?a=')||(substr($that, $search, 3) eq '&a='))
            {
                $short_linkz_list[$linkz_pointer] = substr($that, $search);
                last;
            }
        }
        $linkz_pointer++;
        $that = ""; $this = "";
    }
    close(CHECK);
    print "- I found ",($#linkz_list + 1)," links in links.txt -\n";

    # make sure that we start dl-ing the correct first page
    $address = $linkz_list[$linknumber];

    # load the old image links from images.txt (if it doesn't exist, no big deal ;)
    my $im_this = "";
    my $im_that = "";
    open (IMAGES, "images.txt") || print " ** Cannot find/open images.txt file! : $! **\n";
    while (eof IMAGES == 0)
    {
        while ($im_this ne "\n")
        {
            read IMAGES, $im_this, 1;
            $im_that = $im_that.$im_this;
        }
        $image_list[$image_pointer] = $im_that;
        $image_pointer++;
        $im_that = ""; $im_this = "";
    }
    close(IMAGES);
    print "- I found ",($#image_list + 1)," picture-links in images.txt -\n";

    # ..and last but not least, load any image dirs from image_dirs.txt
    # again, if it's not there, no big deal :)
    my $imd_this = "";
    my $imd_that = "";
    open (IMAGE_DIR, "image_dirs.txt") || print " ** Cannot find/open image_dirs.txt file!: $! **\n";
    while (eof IMAGE_DIR == 0)
    {
        while ($imd_this ne "\n")
        {
            read IMAGE_DIR, $imd_this, 1;
            $imd_that = $imd_that.$imd_this;
        }
        $image_dirs_list[$image_dirs_pointer] = $imd_that;
        $image_dirs_pointer++;
        $imd_that = ""; $imd_this = "";
    }
    close(IMAGE_DIR);
    print "- I found ",($#image_dirs_list + 1)," picture directories in image_dirs.txt -\n";
}

# Just keep going till we can find no more new links
while (($#linkz_list < 0) || ($#linkz_list + 1 > $linknumber))
{

    # This line specifies the command line for the lynx web browser
    my $kommand = $lynx.' -dump -image_links "'.$address.'"';

    # dump the page into a text array and find the starting point of the references/links
    chomp(@data = `$kommand`);
    for my $i(0 .. $#data)
    {
        if ($data[$i] eq "References") { $here = $i; }
    }
    $here = $here + 2;

    # process the references/links
    for my $i($here .. $#data)
    {

        $its_an_image = 0;

        # chop off the ref's leading number & spaces (ie. the '1. ' in '1. http://www.cs.waikato.ac.nz')

        # $temp = substr($data[$i],3);
        # @temp = split(/ /, $temp, 2);

        # check if the last few characters of the link equal .gif .jpg .png .bmp .pdf .mov .mpeg etc etc

        # for my $g(0 .. $#graphic_formats)
        # {
        #     if(substr($temp[1],(length($graphic_formats[$g]) * -1)) eq $graphic_formats[$g])
        #     {
        #         $its_an_image = 1;
        #     }
        # }

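        # ...which is now done with a pair of regular expressions instead:
        # strip the leading "NN. " reference number, then test the file
        # extension case-insensitively against the formats listed above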
        $data[$i] =~ s/^\s*\d+\.\s+//;
        if ($data[$i] =~ /\.(gif|jpe?g|png|bmp|pdf|mov|mpe?g|rm)$/i) {
            $its_an_image = 1;
        }

        # ignore mailto urls
        if ($data[$i] !~ /mailto:/i) {

            #----------- the link is NOT an image ----------------
            if ($its_an_image == 0)
            {
                &its_a_link($data[$i], $outputdir);
            }

            #----------- the link IS an image --------------------
            if ($its_an_image != 0)
            {
                &its_an_image($data[$i], $finaldir);
            }
        }
    }

    # save the web page to disk (in the appropriate numbered directory)
    $kommand = $lynx.' -dump -source "'.$address.'"';

    if ($linknumber % $dir_entries == 0)
    {
        if ((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber))
        {
            print " ** ",$outputdir.$linknumber, " - Directory already exists.\n";
        }
        else
        {
            mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
            mkdir($finaldir.$linknumber, 0777) or print " ** Cannot create ",$finaldir.$linknumber, "!: $!\n";
        }
        $numberdir = $linknumber;
    }
    my $target = $outputdir.$numberdir."/".$linknumber.".html";

    #---------------------------------------------------------------------------------------------------------------
    # NOTE: This next command will NOT work under win32/dos, as redirecting standard output in a script causes it to
    # be dumped straight to the screen as opposed to into the target file (see the grab.bat workaround above).
    #---------------------------------------------------------------------------------------------------------------
    system ("$kommand > $target");
    #---------------------------------------------------------------------------------------------------------------

    print " Saved $target\n";

    $linknumber++;

    $address = $linkz_list[$linknumber];
}

my $end_time = (times)[0];

print "\n\n\n *----------------------------*\n";
print " | Whew! Task completed! :-D  |\n";
print " *----------------------------*\n";
printf " Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time;
print "\n\n";
print " Now execute the process_html.pl script to link the downloaded collection together.\n";
print " Please do make sure that it is executed with the same options as this script ;-)\n";

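# Takes a link found on the current page, normalises it (custom options,
# the &cl/&d trickery below, .pr OIDs), skips it if we have already seen
# it (or its short '?a=' form), and otherwise appends it to links.txt
# and the in-memory lists.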
sub its_a_link
{
    local($found) = @_;
    # local($ok = 0, $kommand);
    local($kommand);
    local $short_link = "";

    return if ($found =~ /\#.*$/);

    # attach the custom options
    $found .= $option;

    # a little bit of trickery here - check if there is a &d= option present in the link;
    # if there is, then wipe the &cl= option!
    # This should cut down multiple copies by 75%!!

    # But if there is no &d option, and the &cl option is not set, then we have to set the &cl option to something,
    # otherwise we get pages which contain no data :\

    if ($found =~ /[&\?]a=d/) {
        if ($found =~ /[&\?]d=/) {
            $found =~ s/[&\?]cl=[^&]*//;
        } elsif ($found !~ /[&\?]cl=/) {
            $found .= $fix_empty_pages;
        }
    }

    # we also want to sort out any xxx.pr OIDs that we come across
    $found =~ s/([&\?](cl|d)=.*?)\.\d+\.pr/$1/g;
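    # (ie. a '&d=...' or '&cl=...' argument ending in something like '.2.pr'
    #  is truncated back to the plain OID)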

    # attach the EOL character.
    $found = $found."\n";


    # the hard way !!!
    # for my $search(0 .. (length($found) - 3))
    # {
    #     if((substr($found, $search, 3) eq '?d=')||(substr($found, $search, 3) eq '&d='))
    #     {
    #         for my $second_search(0 .. (length($found) - 4))
    #         {
    #             if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
    #             {
    #                 for my $third_search(($second_search + 3) .. (length($found) - 1))
    #                 {
    #                     if((substr($found, $third_search, 1)) eq '&')
    #                     {
    #                         substr($found, $second_search, $third_search - $second_search) = "";
    #                         last;
    #                     }
    #                 }
    #                 last;
    #             }
    #         }
    #         last;
    #     }
    #     else
    #     {
    #         if( $search == (length($found) - 3))
    #         {
    #             for my $second_search(0 .. (length($found) - 4))
    #             {
    #                 if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
    #                 {
    #                     for my $third_search(($second_search + 3) .. (length($found) - 1))
    #                     {
    #                         if((substr($found, $third_search, 1)) eq '&')
    #                         {
    #                             if (substr($found, $second_search, $third_search - $second_search) eq '&cl=')
    #                             {
    #                                 substr($found, $second_search, $third_search - $second_search) = $fix_empty_pages;
    #                             }
    #                             last;
    #                         }
    #                     }
    #                     last;
    #                 }
    #             }
    #         }
    #     }
    # }

    # grab the last part of the link (ignoring the start and the &e option)
    # for my $search(0 .. (length($found) - 3))
    # {
    #     if((substr($found, $search, 3) eq '?a=')||(substr($found, $search, 3) eq '&a='))
    #     {
    #         $short_link = substr($found, $search);
    #         last;
    #     }
    # }

    ($short_link) = $found =~ /\?(.*)$/;
    $short_link =~ s/(^|&)e=[^&]*/$1/;
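    # ie. keep everything after the '?' and drop the &e option, so that the
    # same page compares equal no matter what its &e value happens to be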


    # this filters out multiple copies of, for example, the help page, which has #something at the end of its links
    # (this is now done up front by the regular expression near the top of this sub -- Stefan)

    # for my $search(0 .. length($found))
    # {
    #     if ((substr($found, $search, 1)) eq '#')
    #     {
    #         $ok++;
    #         last;
    #     }
    # }


    # compare the found link to the links we've stored in the arrays (compares both the full link and the partial link)
    for my $search(0 .. $#linkz_list)
    {
        return if ($found eq $linkz_list[$search]);
        return if ($short_link eq $short_linkz_list[$search]);
    }

    # if the found link is not in the links array, add it
    open (DUMP, ">>links.txt") or die " ** Can't open links.txt!: $!\n";
    print DUMP $found;
    close(DUMP);

    $linkz_list[$linkz_pointer] = $found;
    $short_linkz_list[$linkz_pointer] = $short_link;
    $linkz_pointer++;
}

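# Given an image URL, work out (scanning from the right-hand end) the
# directory the image sits in; record the longer directory prefix in
# image_dirs.txt so that process_html.pl can wipe it from the html later,
# and return the single enclosing directory (eg. '/images') to be tacked
# onto wget's --directory-prefix.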
sub do_image_dirs
{
    local($found) = @_;
    my $count = 0;
    my @br_index;
    my $image_dir = "";
    my $new_dir = 0;

    for my $search(1 .. (length($found) - 1))
    {
        $bracket = substr($found, ($search * -1), 1);
        if ($bracket eq '/')
        {
            $count++;
            $br_index[$count] = $search;
        }
        if ($count == 2)
        {
            $image_dir = substr($found, ($br_index[2] * -1), ($br_index[2] - $br_index[1]));
        }
    }

    my $dirs_to_wipe = substr($found, $br_index[$#br_index - 2] * -1, $br_index[$#br_index - 2] - $br_index[2] + 1)."\n";

    for my $counter(0 .. $#image_dirs_list)
    {
        if ($dirs_to_wipe eq $image_dirs_list[$counter])
        {
            $new_dir++;
        }
    }

    if ($new_dir == 0)
    {
        open (IMAGE_DIRS, ">>image_dirs.txt") or die " ** Can't open image_dirs.txt!: $!\n";
        print IMAGE_DIRS $dirs_to_wipe;
        close(IMAGE_DIRS);
        $image_dirs_list[$image_dirs_pointer] = $dirs_to_wipe;
        $image_dirs_pointer++;
    }

    print " ",substr($finaldir, 0, length($finaldir) - 1).$image_dir.substr($found, ($br_index[1] * -1), length($found) - (length($found) - $br_index[1])),"\n";

    return $image_dir;
}

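# Download an image with wget unless it is already in the list, record it
# in images.txt, and if we've just fetched an "off" navigation-bar picture
# (*of.gif), fetch the matching "on" picture as well.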
sub its_an_image
{
    local($found, $outpdir) = @_;
    local($kommand);
    my $new = 0;

    my $temp_found = $found . "\n";

    # check if the image is already in the list
    for my $counter(0 .. $#image_list)
    {
        if ($temp_found eq $image_list[$counter])
        {
            $new++;
        }
    }

    # only download the image if it's not in the list..
    if ($new == 0)
    {
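        # NB: calling &do_image_dirs with no argument list is the old perl
        # idiom for passing this sub's own @_ along (ie. the image URL)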
        my $image_dir = &do_image_dirs;
        my $temp_outputdir = $outpdir;
        if (substr($temp_outputdir, -1, 1) eq "/")
        {
            substr($temp_outputdir, -1, 1) = "";
        }

        # the wget binary depends on the gnomelib env (particularly lib/libiconv2.dylib) being set, especially on Mac Lions (android too?)
        &util::set_gnomelib_env(); # this will set the gnomelib env once per subshell launched, by first checking if GEXTGNOME is not already set

        # wget is set to '-q - quiet' and '-nc - don't clobber existing files'
        $kommand = $wget.' -qnc --directory-prefix='.$temp_outputdir.$image_dir.' "'.$found.'"';
        system ("$kommand");

        open (IMAGES, ">>images.txt") or die " ** Can't open images.txt!: $!\n";
        print IMAGES $temp_found;
        close(IMAGES);

        $image_list[$image_pointer] = $temp_found;
        $image_pointer++;

        # grab the corresponding ON picture for the navigation bar if we've just dl-ed the OFF picture
        if (substr($found, -6) eq "of.gif")
        {
            substr($found, -6, 6) = "on.gif";
            &its_an_image($found, $outpdir);
        }
    }
}