source: main/trunk/greenstone2/bin/script/grab_collection.pl@ 31888

Last change on this file since 31888 was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget, which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib.
  2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.0 KB
1#!/usr/bin/perl -w
2
3use util;
4
5# Both this script and its associated process_html.pl were written by
6# Marcel ?, while a student at Waikato University. Unfortunately he
7# was very new to perl at the time so the code is neither as clean nor
8# as fast as it could be (I've cleaned up a few of the more serious
9# bottlenecks -- it could do with a lot more work though). The code
10# does work though, if a little slowly. It's not ready for primetime
11# however and is included in the Greenstone source tree mostly so that
12# I don't lose it. -- Stefan - 24 Jul 2001
13
14
15# This script will download an entire collection (and its associated pictures and files)
16# and store them in a temporary directory ($outputdir).
18# A second script (process_html.pl) can then be used to 'rebuild' the collection and link all the
18# downloaded pages and pictures together into a usable static collection.
19
20# This script will generate a number of text files required by the second script
21# and for possible recovery, namely:
22# - links.txt - contains all the http links that were downloaded
23# - images.txt - contains a list of all the images that were downloaded
24# - image_dirs.txt - contains a list of all the image-prefixes that need to be wiped from the html files
25# (otherwise you won't get to see any pictures)
26
27# Both this script and the html processing script have a recovery feature built in: they can continue from wherever
28# they left off, but this only works if $outputdir and $finaldir are set to different values.
29
30# This is where all the downloaded html files end up, eg. 'temp_html/'
31my $outputdir = 'temp_html/';
32# This is where all the processed html files (and the pictures) will end up eg. 'my_static_collection/'
33my $finaldir = 'envl_collection/';
34
35# This is where we start our mirroring
36$address = 'http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1';
37
38
39# whatever is specified in $option will be attached to the end of each html link before it is downloaded.
40# eg. "&u=1" to disable various features in the greenstone collections that are not needed with a static
41# collection.
42# another example: "&l=nl" sets the entire collection to Dutch (Netherlands :)
43my $option = "&u=1";
44
45# Most OSes have a limit on the maximum number of files per directory (or folder).
46# A static collection can easily contain >3000 html files. Putting all those files
47# into one single directory is just asking for trouble. It's also very unwieldy ;^)
48# Hence...this value sets how many html files will be stored in one directory.
49# These directories themselves will be numbered,
50# so if $dir_entries = 500 then the directories will be "0/", "500/", "1000/", "1500/", "2000/", etc.
51my $dir_entries = 250;
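# For illustration only (this exact calculation is not used verbatim by the script):
# the numbered directory a given link ends up in works out to
#   int($linknumber / $dir_entries) * $dir_entries
# so with $dir_entries = 250, link number 612 would be saved as "500/612.html".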
52
53# Occasionally a page occurs which contains no data (because &cl is not set). This option fixes that.
54my $fix_empty_pages = "&cl=CL1";
55
56# These are the file types that wget will download.
57# More can be added if necessary (NB: the main download loop below now checks extensions with a hard-coded regex, so any additions here need to be mirrored there).
58my @graphic_formats = ('.gif','.jpg','.bmp','.png','.pdf','.mov','.mpeg','.jpeg','.rm');
59
60# ---------------------------------------[ System specific options ]----------------------------------------------------
61
62# The lynx variable specifies the command line for the lynx web browser
63# -- This is what I use under dos/win32
64# my $lynx = 'e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg';
65
66# -- This is what I use under linux
67my $lynx = 'lynx';
68
69# and the same for the wget utility
70my $wget = 'wget';
71
72# NB: There is one other Linux-specific command all the way at the end of this script, where I've used 'cp' to copy a file.
73
74# Another NB: When saving the dl-ed html files to disk, I've set lynx to dump the html-source to the standard output,
75# which I then simply redirect to a target file, BUT
76# this does not work under DOS/win32. Redirecting standard output in a script causes it to be displayed on
77# the screen instead. The easiest way I found to get around this was to do the actual redirection in a simple
78# batch file (say grab.bat), which contains the following line:
79# @e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg -dump -source "%1" > %2
80#
81# Then replace line nr 326 -> 'system ("$kommand > $target");' with 'system("grab.bat $address $target");'
82# Not a very elegant solution, but it works :)
83
84#------------------------------------------------------------------------------------------------------------------------
85
86my @image_list;
87my $image_pointer = 0;
88
89my @linkz_list;
90my @short_linkz_list;
91my $linkz_pointer = 0;
92
93my @image_dirs_list;
94my $image_dirs_pointer = 0;
95
96my $numberdir = 0;
97
98my $start_time = (times)[0];
99
100# check if directories exist and create them if necessary..
101if ((-e $outputdir)&&(-d $outputdir))
102{
103 print " ** ",$outputdir," directory already exists..\n";
104}
105else
106{
107 print " ** Creating ",$outputdir," directory..\n";
108 mkdir($outputdir, 0777) or die " Cannot create output directory: $!\n";
109}
110
111if ((-e $finaldir)&&(-d $finaldir))
112{
113 print " ** ",$finaldir," directory already exists..\n";
114}
115else
116{
117 print " ** Creating ",$finaldir," directory..\n";
118 mkdir($finaldir, 0777) or die " Cannot create final directory: $!\n";
119}
120
121#-----------------------------------------------------------------------------------------------
122# No need to start from scratch every time, we can recover/continue from wherever we left off
123# simply by checking which html files have been created
124#-----------------------------------------------------------------------------------------------
125
126$linknumber = 0; # used to name/number the dl-ed html files
127
128my $failed = 0;
129while ($failed == 0)
130{
131 if ($linknumber % $dir_entries == 0)
132 {
133 if (!((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber)))
134 {
135 $failed++;
136 mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
137 }
138 $numberdir = $linknumber;
139 }
140
141 $check_file = $outputdir.$numberdir."/".$linknumber.".html";
142 if ((-e $check_file)&&($failed == 0))
143 {
144 $linknumber++;
145 }
146 else
147 {
148 $failed++;
149 # I'm subtracting 1 from the starting link,
150 # just in case it only loaded half the page ;^)
151 if($linknumber>0)
152 {
153 $linknumber--;
154 }
155 print " Will start downloading at number $linknumber \n";
156 }
157}
158
159# if we're starting from scratch, then we might as well nuke the links file
160#if ($linknumber == 0)
161#{
162# print " Starting from scratch - clobbering the old text files...\n";
163# if (-e 'links.txt')
164# {
165# print " Removing links.txt...\n";
166# unlink <links.txt> or print " ** Cannot delete links textfile: $!\n";
167# }
168# if (-e 'images.txt')
169# {
170# print " Removing images.txt...\n";
171# unlink <images.txt> or print " ** Cannot delete images textfile: $!\n";
172# }
173# if (-e 'image_dirs.txt')
174# {
175# print " Removing image_dirs.txt...\n";
176# unlink <image_dirs.txt> or print " ** Cannot delete image_dirs textfile: $!\n";
177# }
178#}
179
180# if we're NOT starting from scratch, then read in old links from links text file
181# and grab the old image-links as well...
182if ($linknumber != 0)
183{
184 # load the old links from links.txt, if it doesn't exist, then give up :(
185 my $this = "";
186 my $that = "";
187 open (CHECK, "links.txt") or die " ** Cannot find/open links.txt file!: $! **\n";
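 # links.txt is read one character at a time: the inner loop below accumulates
 # characters in $that until a newline is reached, giving one complete link per pass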
188 while(eof CHECK == 0)
189 {
190 while($this ne "\n")
191 {
192 read CHECK, $this ,1;
193 $that = $that.$this;
194 }
195 $linkz_list[$linkz_pointer] = $that;
196
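 # also keep a shortened copy of each link, starting at its a= argument;
 # its_a_link() compares against these short forms when weeding out duplicate links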
197 for my $search(0 .. (length($that) - 3))
198 {
199 if((substr($that, $search, 3) eq '?a=')||(substr($that, $search, 3) eq '&a='))
200 {
201 $short_linkz_list[$linkz_pointer] = substr($that, $search);
202 last;
203 }
204 }
205 $linkz_pointer++;
206 $that = ""; $this = "";
207 }
208 close(CHECK);
209 print "- I found ",($#linkz_list + 1)," links in links.txt -\n";
210
211 #make sure that we start dl-ing the correct first page
212 $address = $linkz_list[$linknumber];
213
214 # load the old image links from image.txt (if it doesn't exist, no big deal ;)
215 my $im_this = "";
216 my $im_that = "";
217 open (IMAGES, "images.txt") || print " ** Cannot find/open images.txt file! : $! **\n";
218 while(eof IMAGES == 0)
219 {
220 while($im_this ne "\n")
221 {
222 read IMAGES, $im_this ,1;
223 $im_that = $im_that.$im_this;
224 }
225 $image_list[$image_pointer] = $im_that;
226 $image_pointer++;
227 $im_that = ""; $im_this = "";
228 }
229 close(IMAGES);
230 print "- I found ",($#image_list + 1)," picture-links in images.txt -\n";
231
232 #..and last but not least, load any image_dirs from image_dirs.txt
233 # again, if it's not there, no big deal :)
234 my $imd_this = "";
235 my $imd_that = "";
236 open (IMAGE_DIR, "image_dirs.txt") || print " ** Cannot find/open image_dirs.txt file!: $! **\n";
237 while(eof IMAGE_DIR == 0)
238 {
239 while($imd_this ne "\n")
240 {
241 read IMAGE_DIR, $imd_this ,1;
242 $imd_that = $imd_that.$imd_this;
243 }
244 $image_dirs_list[$image_dirs_pointer] = $imd_that;
245 $image_dirs_pointer++;
246 $imd_that = ""; $imd_this = "";
247 }
248 close(IMAGE_DIR);
249 print "- I found ",($#image_dirs_list + 1)," picture directories in image_dirs.txt -\n";
250}
251
252# Just keep going till we can find no more new links
253while(($#linkz_list < 0)||($#linkz_list+1 > $linknumber))
254{
255
256 # This line specifies the command line for the lynx web browser
257 my $kommand = $lynx.' -dump -image_links "'.$address.'"';
258
259 # dump page into text-array and find starting-point of the references/links
260 chomp(@data=`$kommand`);
261 for my $i(0 .. $#data)
262 {
263 if ($data[$i] eq "References") {
264 $here = $i;}
265 }
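 # in a lynx dump the links are listed under a 'References' heading,
 # starting two lines below it (hence the + 2)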
266 $here = $here+2;
267
268 # process references/links
269 for $i($here .. $#data){
270
271 $its_an_image = 0;
272
273 #chop off the ref's leading number and spaces
274 # (eg. the '1. ' in '1. http://www.cs.waikato.ac.nz')
275
276# $temp = substr($data[$i],3);
277# @temp = split(/ /, $temp, 2);
278
279 #check if the last 4 characters of the link equal .gif .jpg .png .bmp .pdf .mov .mpeg etc etc
280
281# for my $g(0 .. $#graphic_formats)
282# {
283# if(substr($temp[1],(length($graphic_formats[$g]) * -1)) eq $graphic_formats[$g])
284# {
285# $its_an_image = 1;
286# }
287# }
288
289 $data[$i] =~ s/^\s*\d+\.\s+//;
290 if ($data[$i] =~ /\.(gif|jpe?g|png|bmp|pdf|mov|mpe?g|rm)$/i) {
291 $its_an_image = 1;
292 }
293
294 # ignore mailto urls
295 if ($data[$i] !~ /mailto:/i) {
296
297 #----------- the link is NOT an image ----------------
298 if ($its_an_image == 0)
299 {
300 &its_a_link($data[$i], $outputdir);
301 }
302
303 #----------- the link IS an image ----------------
304 if ($its_an_image != 0)
305 {
306 &its_an_image($data[$i], $finaldir);
307 }
308 }
309 }
310
311 # save the web page to disk (in the appropriate numbered directory)
312 $kommand = $lynx.' -dump -source "'.$address.'"';
313
314 if ($linknumber % $dir_entries == 0)
315 {
316 if ((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber))
317 {
318 print " ** ",$outputdir.$linknumber, " - Directory already exists.\n";
319 }
320 else
321 {
322 mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
323 mkdir($finaldir.$linknumber, 0777) or print " ** Cannot create ",$finaldir.$linknumber, "!: $!\n";
324 }
325 $numberdir = $linknumber;
326 }
327 my $target = $outputdir.$numberdir."/".$linknumber.".html";
328
329 #---------------------------------------------------------------------------------------------------------------
330 # NOTE: This next command will NOT work under win32/dos, as redirecting standard output in a script causes it to
331 # be dumped straight to the screen as opposed to into the target file.
332 #---------------------------------------------------------------------------------------------------------------
333 system ("$kommand > $target");
334 #---------------------------------------------------------------------------------------------------------------
335
336 print " Saved $target\n";
337
338 $linknumber++;
339
340 $address = $linkz_list[$linknumber];
341}
342
343my $end_time = (times)[0];
344
345print "\n\n\n *----------------------------*\n";
346print " | Whew! Task completed! :-D |\n";
347print " *----------------------------*\n";
348printf " Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time;
349print "\n\n";
350print " Now execute the process_html.pl script to link the downloaded collection together.\n";
351print " Please do make sure that it is executed with the same options as this script ;-)\n";
352
353sub its_a_link
354{
355 local($found) = @_;
356# local($ok = 0, $kommand);
357 local($kommand);
358 local $short_link = "";
359
360 return if ($found =~ /\#.*$/);
361
362 # attach the custom options
363 $found .= $option;
364
365 #little bit of trickery here - check if there is a &d= option present in the link
366 #if there is, then wipe the &cl= option!
367 #This should cut down multiple copies by 75%!!
368
369 #but, if there is no &d option, and the &cl option is not set, then we have to set the &cl option to something
370 #otherwise we get pages which contain no data :\
371
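 # Illustrative example (with $option = "&u=1"):
 #   'library?a=d&cl=CL1.2&d=HASH0123&u=1'  becomes  'library?a=d&d=HASH0123&u=1'
 # because the &d= argument makes the &cl= argument redundant.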
372 if ($found =~ /[&\?]a=d/) {
373 if ($found =~ /[&\?]d=/) {
374 $found =~ s/[&\?]cl=[^&]*//;
375 } elsif ($found !~ /[&\?]cl=/) {
376 $found .= $fix_empty_pages;
377 }
378 }
379
380 # we also want to sort out any xxx.pr OIDs that we come across
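 # (e.g. '&d=HASH0123.4.pr' becomes '&d=HASH0123')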
381 $found =~ s/([&\?](cl|d)=.*?)\.\d+\.pr/$1/g;
382
383 # attach the EOL character.
384 $found = $found."\n";
385
386
387 # the hard way !!!
388# for my $search(0 .. (length($found) - 3))
389# {
390# if((substr($found, $search, 3) eq '?d=')||(substr($found, $search, 3) eq '&d='))
391# {
392# for my $second_search(0 .. (length($found) - 4))
393# {
394# if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
395# {
396# for my $third_search(($second_search + 3) .. (length($found) - 1))
397# {
398# if((substr($found, $third_search, 1)) eq '&')
399# {
400# substr($found, $second_search, $third_search - $second_search) = "";
401# last;
402# }
403# }
404# last;
405# }
406# }
407# last;
408# }
409# else
410# {
411# if( $search == (length($found) - 3))
412# {
413# for my $second_search(0 .. (length($found) - 4))
414# {
415# if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
416# {
417# for my $third_search(($second_search + 3) .. (length($found) - 1))
418# {
419# if((substr($found, $third_search, 1)) eq '&')
420# {
421# if (substr($found, $second_search, $third_search - $second_search) eq '&cl=')
422# {
423# substr($found, $second_search, $third_search - $second_search) = $fix_empty_pages;
424# }
425# last;
426# }
427# }
428# last;
429# }
430# }
431# }
432# }
433# }
434
435 # grab the last part of the link (ignoring the start and the &e option)
436# for my $search(0 .. (length($found) - 3))
437# {
438# if((substr($found, $search, 3) eq '?a=')||(substr($found, $search, 3) eq '&a='))
439# {
440# $short_link = substr($found, $search);
441# last;
442# }
443# }
444
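 # the 'short' link is everything after the '?', with the e= argument stripped out,
 # so that links differing only in their server address or e= argument compare as equal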
445 $short_link = ($found =~ /\?(.*)$/) ? $1 : ""; # default to empty string if the link has no '?' part (avoids -w warnings)
446 $short_link =~ s/(^|&)e=[^&]*/$1/;
447
448
449 # this filters out multiple copies of, for example, the help page, which has #something at the end of its links
450 # this is now done first by the regular expression at the top of this sub -- Stefan
451
452# for my $search(0 .. length($found))
453# {
454# if ((substr($found, $search, 1)) eq '#')
455# {
456# $ok++;
457# last;
458# }
459# }
460
461
462
463 # compare the found link to the links we've stored in the arrays (compares both full link and partial link)
464 for my $search(0 .. $#linkz_list)
465 {
466 return if ($found eq $linkz_list[$search]);
467 return if ($short_link eq $short_linkz_list[$search]);
468 }
469
470 # if found link is not in links array, add it
471 open (DUMP, ">>links.txt") or die " ** Can't open links.txt!: $!\n";
472 print DUMP $found;
473 close(DUMP);
474
475 $linkz_list[$linkz_pointer] = $found;
476 $short_linkz_list[$linkz_pointer] = $short_link;
477 $linkz_pointer++;
478}
479
480sub do_image_dirs
481{
482 local($found) = @_;
483 my $count = 0;
484 my @br_index;
485 my $image_dir = "";
486 my $new_dir = 0;
487
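 # walk backwards through the URL recording the position of each '/' (measured from
 # the end); the text between the last two slashes is the image's parent directory,
 # e.g. '/images' for 'http://host/gsdl/collect/demo/images/foo.gif'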
488 for my $search(1 .. (length($found) - 1 ))
489 {
490 $bracket = substr($found, ($search * - 1), 1);
491 if ($bracket eq '/')
492 {
493 $count++;
494 $br_index[$count] = $search;
495 }
496 if($count == 2)
497 {
498 $image_dir = substr($found, ($br_index[2] * -1) , ($br_index[2] - $br_index[1]));
499 }
500 }
501
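 # $dirs_to_wipe is the directory prefix sitting between the server part of the URL and
 # the image's parent directory; it is recorded in image_dirs.txt so the second script
 # can wipe these prefixes from the html files (see the comments at the top of this script)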
502 my $dirs_to_wipe = substr($found, $br_index[$#br_index - 2] * - 1, $br_index[$#br_index - 2] - $br_index[2] + 1)."\n";
503
504 for my $counter(0 .. $#image_dirs_list)
505 {
506 if($dirs_to_wipe eq $image_dirs_list[$counter])
507 {
508 $new_dir++;
509 }
510 }
511
512 if ($new_dir == 0)
513 {
514 open (IMAGE_DIRS, ">>image_dirs.txt") or die " ** Can't open image_dirs.txt!: $!\n";
515 print IMAGE_DIRS $dirs_to_wipe;
516 close(IMAGE_DIRS);
517 $image_dirs_list[$image_dirs_pointer] = $dirs_to_wipe;
518 $image_dirs_pointer++;
519 }
520
521 print " ",substr($finaldir, 0 ,length($finaldir) - 1).$image_dir.substr($found, ($br_index[1] * - 1), length($found) - (length($found) - $br_index[1])),"\n";
522
523 return $image_dir;
524}
525
526sub its_an_image
527{
528 local($found, $outpdir) = @_;
529 local($kommand);
530 my $new = 0;
531
532 my $temp_found = $found . "\n";
533
534 # check if the image is in the list
535 for my $counter(0 .. $#image_list)
536 {
537 if($temp_found eq $image_list[$counter])
538 {
539 $new++;
540 }
541 }
542
543 # only download the image if it's not in the list..
544 if($new == 0)
545 {
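 # NB: calling do_image_dirs with a leading '&' and no argument list passes this sub's
 # own @_ along unchanged, so it picks up the same $found (the image URL)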
546 my $image_dir = &do_image_dirs;
547 my $temp_outputdir = $outpdir;
548 if (substr($temp_outputdir, -1, 1) eq "/")
549 {
550 substr($temp_outputdir, -1, 1) = "";
551 }
552
553 # the wget binary depends on the gnomelib_env (particularly lib/libiconv2.dylib) being set, especially on Mac Lions (android too?)
554 &util::set_gnomelib_env(); # this will set the gnomelib env once for each subshell launched, by first checking if GEXTGNOME is not already set
555
556 # wget is set to 'q - quiet' and 'nc - don't clobber existing files'
557 $kommand = $wget.' -qnc --directory-prefix='.$temp_outputdir.$image_dir.' "'.$found.'"';
558 system ("$kommand");
559
560 open (IMAGES, ">>images.txt") or die " ** Can't open images.txt!: $!\n";
561 print IMAGES $temp_found;
562 close(IMAGES);
563
564 $image_list[$image_pointer] = $temp_found;
565 $image_pointer++;
566
567 # grab corresponding ON pictures for navigation bar if we've just dl-ed the OFF picture
568 if(substr($found , -6) eq "of.gif")
569 {
570 substr($found, -6, 6) = "on.gif";
571 &its_an_image($found, $outpdir);
572 }
573 }
574}