source: gsdl/trunk/bin/script/grab_collection.pl@ 18470

Last change on this file since 18470 was 2671, checked in by sjboddie, 23 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
1#!/usr/bin/perl -w
2
3
4# Both this script and its associated process_html.pl were written by
5# Marcel ?, while a student at Waikato University. Unfortunately he
6# was very new to perl at the time so the code is neither as clean nor
7# as fast as it could be (I've cleaned up a few of the more serious
8# bottlenecks -- it could do with a lot more work though). The code
9# does work though, if a little slowly. It's not ready for primetime
10# however and is included in the Greenstone source tree mostly so that
11# I don't lose it. -- Stefan - 24 Jul 2001
12
13
14# This script will download an entire collection (and its associated pictures and files)
15# and store them in a temporary directory ($outputdir).
16# A second script (process_html.pl) can then be used to 'rebuild' the collection and link all the
17# downloaded pages and pictures together into a usable static collection.
18
19# This script will generate a number of text files required by the second script
20# and for possible recovery, namely:
21# - links.txt - contains all the http links that were downloaded
22# - images.txt - contains a list of all the images that were downloaded
23# - image_dirs.txt - contains a list of all the image-prefixes that need to be wiped from the html files
24# (otherwise you won't get to see any pictures)
25
26# Both this script and the html processing script have a recovery feature built in: they can continue from wherever
27# they left off, but this only works if $outputdir and $finaldir are set to different values.
28
29# This is where all the downloaded html files end up, eg. 'temp_html/'
30my $outputdir = 'temp_html/';
31# This is where all the processed html files (and the pictures) will end up, eg. 'my_static_collection/'
32my $finaldir = 'envl_collection/';
33
34# This is where we start our mirroring
35$address = 'http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1';
36
37
38# whatever is specified in $option will be attached to the end of each html link before it is downloaded.
39# eg. "&u=1" to disable various features in the greenstone collections that are not needed with a static
40# collection.
41# another example: "&l=nl" to set the entire collection to Dutch (NetherLands :)
42my $option = "&u=1";
43
44# Most OSes have a limit on the maximum number of files per directory (or folder).
45# A static collection can easily contain >3000 html files. Putting all those files
46# into one single directory is just asking for trouble. It's also very unwieldy ;^)
47# Hence...this value sets how many html files will be stored in one directory.
48# These directories themselves will be numbered,
49# so if $dir_entries = 500 then the directories will be "0/", "500/", "1000/", "1500/", "2000/", etc.
50my $dir_entries = 250;
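# (with $dir_entries = 250 the directories become "0/", "250/", "500/", ... and, eg.,
#  page number 612 ends up saved as temp_html/500/612.html)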
51
52# Occasionally a page occurs which contains no data (because &cl is not set). This option fixes that.
53my $fix_empty_pages = "&cl=CL1";
54
55# These are the files that wget will download.
56# more can be added if necessary.
57my @graphic_formats = ('.gif','.jpg','.bmp','.png','.pdf','.mov','.mpeg','.jpeg','.rm');
58
59# ---------------------------------------[ System specific options ]----------------------------------------------------
60
61# The lynx variable specifies the command line for the lynx web browser
62# -- This is what I use under dos/win32
63# my $lynx = 'e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg';
64
65# -- This is what I use under linux
66my $lynx = 'lynx';
67
68# and the same for the wget utility
69my $wget = 'wget';
70
71# NB: There is one other linux-specific command all the way at the end of this script, where I've used 'cp' to copy a file.
72
73# Another NB: When saving the dl-ed html files to disk, I've set lynx to dump the html-source to the standard output,
74# which I then simply redirect to a target file, BUT
75# this does not work under DOS/win32. Redirecting standard output in a script causes it to be displayed on
76# the screen instead. The easiest way I found to get around this was by doing the actual redirection in a simple
77# batch file (say grab.bat), which contains the following line:
78# @e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg -dump -source "%1" > %2
79#
80# Then replace line nr 332 -> 'system ("$kommand > $target");' with 'system("grab.bat $address $target");'
81# Not a very elegant solution, but it works :)
82
83#------------------------------------------------------------------------------------------------------------------------
84
85my @image_list;
86my $image_pointer = 0;
87
88my @linkz_list;
89my @short_linkz_list;
90my $linkz_pointer = 0;
91
92my @image_dirs_list;
93my $image_dirs_pointer = 0;
94
95my $numberdir = 0;
96
97my $start_time = (times)[0];
98
99# check if directories exist and create them if necessary..
100if ((-e $outputdir)&&(-d $outputdir))
101{
102 print " ** ",$outputdir," directory already exists..\n";
103}
104else
105{
106 print " ** Creating ",$outputdir," directory..\n";
107 mkdir($outputdir, 0777) or die " Cannot create output directory: $!\n";
108}
109
110if ((-e $finaldir)&&(-d $finaldir))
111{
112 print " ** ",$finaldir," directory already exists..\n";
113}
114else
115{
116 print " ** Creating ",$finaldir," directory..\n";
117 mkdir($finaldir, 0777) or die " Cannot create final directory: $!\n";
118}
119
120#-----------------------------------------------------------------------------------------------
121# No need to start from scratch every time; we can recover/continue from wherever we left off
122# simply by checking which html files have been created
123#-----------------------------------------------------------------------------------------------
124
125$linknumber = 0; # used to name/number the dl-ed html files
126
127my $failed = 0;
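# scan 0.html, 1.html, 2.html, ... in the numbered subdirectories until we find the first
# one that hasn't been saved yet -- that (minus one, see below) is where we resume downloading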
128while ($failed == 0)
129{
130 if ($linknumber % $dir_entries == 0)
131 {
132 if (!((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber)))
133 {
134 $failed++;
135 mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
136 }
137 $numberdir = $linknumber;
138 }
139
140 $check_file = $outputdir.$numberdir."/".$linknumber.".html";
141 if ((-e $check_file)&&($failed == 0))
142 {
143 $linknumber++;
144 }
145 else
146 {
147 $failed++;
148 # I'm subtracting 1 from the starting link,
149 # just in case it only loaded half the page ;^)
150 if($linknumber>0)
151 {
152 $linknumber--;
153 }
154 print " Will start downloading at number $linknumber \n";
155 }
156}
157
158# if we're starting from scratch, then we might as well nuke the links file
159#if ($linknumber == 0)
160#{
161# print " Starting from scratch - clobbering the old text files...\n";
162# if (-e 'links.txt')
163# {
164# print " Removing links.txt...\n";
165# unlink <links.txt> or print " ** Cannot delete links textfile: $!\n";
166# }
167# if (-e 'images.txt')
168# {
169# print " Removing images.txt...\n";
170# unlink <images.txt> or print " ** Cannot delete images textfile: $!\n";
171# }
172# if (-e 'image_dirs.txt')
173# {
174# print " Removing image_dirs.txt...\n";
175# unlink <image_dirs.txt> or print " ** Cannot delete image_dirs textfile: $!\n";
176# }
177#}
178
179# if we're NOT starting from scratch, then read in old links from links text file
180# and grab the old image-links as well...
181if ($linknumber != 0)
182{
183 # load the old links from links.txt; if it doesn't exist, then give up :(
184 my $this = "";
185 my $that = "";
186 open (CHECK, "links.txt") or die " ** Cannot find/open links.txt file!: $! **\n";
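 # the links are read back one character at a time, keeping the trailing "\n" on each one --
 # new links get a "\n" attached before being saved, so the comparisons in its_a_link() rely on it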
187 while(eof CHECK == 0)
188 {
189 while($this ne "\n")
190 {
191 read CHECK, $this ,1;
192 $that = $that.$this;
193 }
194 $linkz_list[$linkz_pointer] = $that;
195
196 for my $search(0 .. (length($that) - 3))
197 {
198 if((substr($that, $search, 3) eq '?a=')||(substr($that, $search, 3) eq '&a='))
199 {
200 $short_linkz_list[$linkz_pointer] = substr($that, $search);
201 last;
202 }
203 }
204 $linkz_pointer++;
205 $that = ""; $this = "";
206 }
207 close(CHECK);
208 print "- I found ",($#linkz_list + 1)," links in links.txt -\n";
209
210 #make sure that we start dl-ing the correct first page
211 $address = $linkz_list[$linknumber];
212
213 # load the old image links from images.txt (if it doesn't exist, no big deal ;)
214 my $im_this = "";
215 my $im_that = "";
216 open (IMAGES, "images.txt") || print " ** Cannot find/open images.txt file! : $! **\n";
217 while(eof IMAGES == 0)
218 {
219 while($im_this ne "\n")
220 {
221 read IMAGES, $im_this ,1;
222 $im_that = $im_that.$im_this;
223 }
224 $image_list[$image_pointer] = $im_that;
225 $image_pointer++;
226 $im_that = ""; $im_this = "";
227 }
228 close(IMAGES);
229 print "- I found ",($#image_list + 1)," picture-links in images.txt -\n";
230
231 #..and last but not least, load any image_dirs from image_dirs.txt
232 # again, if it's not there, no big deal :)
233 my $imd_this = "";
234 my $imd_that = "";
235 open (IMAGE_DIR, "image_dirs.txt") || print " ** Cannot find/open image_dirs.txt file!: $! **\n";
236 while(eof IMAGE_DIR == 0)
237 {
238 while($imd_this ne "\n")
239 {
240 read IMAGE_DIR, $imd_this ,1;
241 $imd_that = $imd_that.$imd_this;
242 }
243 $image_dirs_list[$image_dirs_pointer] = $imd_that;
244 $image_dirs_pointer++;
245 $imd_that = ""; $imd_this = "";
246 }
247 close(IMAGE_DIR);
248 print "- I found ",($#image_dirs_list + 1)," picture directories in image_dirs.txt -\n";
249}
250
251# Just keep going till we can find no more new links
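# (@linkz_list keeps growing as its_a_link() spots new links in each page we fetch;
#  we stop once $linknumber has caught up with the end of the list)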
252while(($#linkz_list < 0)||($#linkz_list+1 > $linknumber))
253{
254
255 # This line specifies the command line for the lynx web browser
256 my $kommand = $lynx.' -dump -image_links "'.$address.'"';
257
258 # dump page into text-array and find starting-point of the references/links
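 # a lynx -dump ends with a "References" section that lists every link as a numbered entry,
 # eg. "   1. http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1",
 # so we look for the "References" line and skip two lines to reach the list itself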
259 chomp(@data=`$kommand`);
260 for my $i(0 .. $#data)
261 {
262 if ($data[$i] eq "References") {
263 $here = $i;}
264 }
265 $here = $here+2;
266
267 # process references/links
268 for $i($here .. $#data){
269
270 $its_an_image = 0;
271
272 # chop off the ref's leading number & spaces
273 # (eg. the '1. ' in '1. http://www.cs.waikato.ac.nz')
274
275# $temp = substr($data[$i],3);
276# @temp = split(/ /, $temp, 2);
277
278 # check if the link ends in .gif .jpg .png .bmp .pdf .mov .mpeg etc etc
279
280# for my $g(0 .. $#graphic_formats)
281# {
282# if(substr($temp[1],(length($graphic_formats[$g]) * -1)) eq $graphic_formats[$g])
283# {
284# $its_an_image = 1;
285# }
286# }
287
288 $data[$i] =~ s/^\s*\d+\.\s+//;
289 if ($data[$i] =~ /\.(gif|jpe?g|png|bmp|pdf|mov|mpe?g|rm)$/i) {
290 $its_an_image = 1;
291 }
292
293 # ignore mailto urls
294 if ($data[$i] !~ /mailto:/i) {
295
296 #----------- the link is NOT an image ----------------
297 if ($its_an_image == 0)
298 {
299 &its_a_link($data[$i], $outputdir);
300 }
301
302 #----------- the link IS an image ----------------
303 if ($its_an_image != 0)
304 {
305 &its_an_image($data[$i], $finaldir);
306 }
307 }
308 }
309
310 # save the web page to disk (in the appropriate numbered directory)
311 $kommand = $lynx.' -dump -source "'.$address.'"';
312
313 if ($linknumber % $dir_entries == 0)
314 {
315 if ((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber))
316 {
317 print " ** ",$outputdir.$linknumber, " - Directory already exists.\n";
318 }
319 else
320 {
321 mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
322 mkdir($finaldir.$linknumber, 0777) or print " ** Cannot create ",$finaldir.$linknumber, "!: $!\n";
323 }
324 $numberdir = $linknumber;
325 }
326 my $target = $outputdir.$numberdir."/".$linknumber.".html";
327
328 #---------------------------------------------------------------------------------------------------------------
329 # NOTE: This next command will NOT work under win32/dos, as redirecting standard output in a script causes it to
330 # be dumped straight to the screen as opposed to into the target file.
331 #---------------------------------------------------------------------------------------------------------------
332 system ("$kommand > $target");
333 #---------------------------------------------------------------------------------------------------------------
334
335 print " Saved $target\n";
336
337 $linknumber++;
338
339 $address = $linkz_list[$linknumber];
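 # move on to the next link found so far; once $linknumber passes the end of @linkz_list
 # the while condition above fails and the downloading is finished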
340}
341
342my $end_time = (times)[0];
343
344print "\n\n\n *----------------------------*\n";
345print " | Whew! Task completed! :-D |\n";
346print " *----------------------------*\n";
347printf" Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time;
348print "\n\n";
349print " Now execute the process_html.pl script to link the downloaded collection together.\n";
350print " Please do make sure that it is executed with the same options as this script ;-)\n";
351
352sub its_a_link
353{
354 local($found) = @_;
355# local($ok = 0, $kommand);
356 local($kommand);
357 local $short_link = "";
358
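 # skip in-page anchor links (anything ending in #something) straight away -- see the note further down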
359 return if ($found =~ /\#.*$/);
360
361 # attach the custom options
362 $found .= $option;
363
364 #little bit of trickery here - check if there is a &d= option present in the link
365 #if there is, then wipe the &cl= option!
366 #This should cut down multiple copies by 75%!!
367
368 #but, if there is no &d option, and the &cl option is not set, then we have to set the &cl option to something
369 #otherwise we get pages which contain no data :\
370
371 if ($found =~ /[&\?]a=d/) {
372 if ($found =~ /[&\?]d=/) {
373 $found =~ s/[&\?]cl=[^&]*//;
374 } elsif ($found !~ /[&\?]cl=/) {
375 $found .= $fix_empty_pages;
376 }
377 }
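 # (eg. a hypothetical "...library?a=d&cl=CL1.2&d=HASH0143&u=1" becomes
 #  "...library?a=d&d=HASH0143&u=1" -- the &cl= part is wiped because &d= is present)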
378
379 # we also want to sort out any xxx.pr OIDs that we come across
380 $found =~ s/([&\?](cl|d)=.*?)\.\d+\.pr/$1/g;
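 # (eg. a hypothetical "&d=HASH0143.2.pr" becomes "&d=HASH0143")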
381
382 # attach the EOL character.
383 $found = $found."\n";
384
385
386 # the hard way !!!
387# for my $search(0 .. (length($found) - 3))
388# {
389# if((substr($found, $search, 3) eq '?d=')||(substr($found, $search, 3) eq '&d='))
390# {
391# for my $second_search(0 .. (length($found) - 4))
392# {
393# if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
394# {
395# for my $third_search(($second_search + 3) .. (length($found) - 1))
396# {
397# if((substr($found, $third_search, 1)) eq '&')
398# {
399# substr($found, $second_search, $third_search - $second_search) = "";
400# last;
401# }
402# }
403# last;
404# }
405# }
406# last;
407# }
408# else
409# {
410# if( $search == (length($found) - 3))
411# {
412# for my $second_search(0 .. (length($found) - 4))
413# {
414# if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
415# {
416# for my $third_search(($second_search + 3) .. (length($found) - 1))
417# {
418# if((substr($found, $third_search, 1)) eq '&')
419# {
420# if (substr($found, $second_search, $third_search - $second_search) eq '&cl=')
421# {
422# substr($found, $second_search, $third_search - $second_search) = $fix_empty_pages;
423# }
424# last;
425# }
426# }
427# last;
428# }
429# }
430# }
431# }
432# }
433
434 # grab the last part of the link (ignoring the start and the &e option)
435# for my $search(0 .. (length($found) - 3))
436# {
437# if((substr($found, $search, 3) eq '?a=')||(substr($found, $search, 3) eq '&a='))
438# {
439# $short_link = substr($found, $search);
440# last;
441# }
442# }
443
444 ($short_link) = $found =~ /\?(.*)$/;
445 $short_link =~ s/(^|&)e=[^&]*/$1/;
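 # (eg. a hypothetical "...library?e=SESSION&a=p&p=about&c=demo&u=1" gives
 #  $short_link = "&a=p&p=about&c=demo&u=1" -- the &e= argument is dropped when comparing,
 #  so the same page fetched with a different &e= isn't treated as a new link)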
446
447
448 # this filters out multiple copies of, for example, the help page, which has #something at the end of its links
449 # this is now done first, by the regular expression at the top of this sub -- Stefan
450
451# for my $search(0 .. length($found))
452# {
453# if ((substr($found, $search, 1)) eq '#')
454# {
455# $ok++;
456# last;
457# }
458# }
459
460
461
462 # compare the found link to the links we've stored in the arrays (compares both full link and partial link)
463 for my $search(0 .. $#linkz_list)
464 {
465 return if ($found eq $linkz_list[$search]);
466 return if ($short_link eq $short_linkz_list[$search]);
467 }
468
469 # if found link is not in links array, add it
470 open (DUMP, ">>links.txt") or die " ** Can't open links.txt!: $!\n";
471 print DUMP $found;
472 close(DUMP);
473
474 $linkz_list[$linkz_pointer] = $found;
475 $short_linkz_list[$linkz_pointer] = $short_link;
476 $linkz_pointer++;
477}
478
479sub do_image_dirs
480{
481 local($found) = @_;
482 my $count = 0;
483 my @br_index;
484 my $image_dir = "";
485 my $new_dir = 0;
486
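 # walk the image URL backwards, remembering how far from the end each '/' sits;
 # $image_dir ends up as the last directory component of the URL (eg. "/images")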
487 for my $search(1 .. (length($found) - 1 ))
488 {
489 $bracket = substr($found, ($search * - 1), 1);
490 if ($bracket eq '/')
491 {
492 $count++;
493 $br_index[$count] = $search;
494 }
495 if($count == 2)
496 {
497 $image_dir = substr($found, ($br_index[2] * -1) , ($br_index[2] - $br_index[1]));
498 }
499 }
500
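 # $dirs_to_wipe is the path between the hostname and $image_dir (eg. "/gsdl/");
 # it is recorded in image_dirs.txt so that process_html.pl can strip it out of the html later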
501 my $dirs_to_wipe = substr($found, $br_index[$#br_index - 2] * - 1, $br_index[$#br_index - 2] - $br_index[2] + 1)."\n";
502
503 for my $counter(0 .. $#image_dirs_list)
504 {
505 if($dirs_to_wipe eq $image_dirs_list[$counter])
506 {
507 $new_dir++;
508 }
509 }
510
511 if ($new_dir == 0)
512 {
513 open (IMAGE_DIRS, ">>image_dirs.txt") or die " ** Can't open image_dirs.txt!: $!\n";
514 print IMAGE_DIRS $dirs_to_wipe;
515 close(IMAGE_DIRS);
516 $image_dirs_list[$image_dirs_pointer] = $dirs_to_wipe;
517 $image_dirs_pointer++;
518 }
519
520 print " ",substr($finaldir, 0 ,length($finaldir) - 1).$image_dir.substr($found, ($br_index[1] * - 1), length($found) - (length($found) - $br_index[1])),"\n";
521
522 return $image_dir;
523}
524
525sub its_an_image
526{
527 local($found, $outpdir) = @_;
528 local($kommand);
529 my $new = 0;
530
531 my $temp_found = $found . "\n";
532
533 # check if the image is in the list
534 for my $counter(0 .. $#image_list)
535 {
536 if($temp_found eq $image_list[$counter])
537 {
538 $new++;
539 }
540 }
541
542 # only download the image if it's not in the list..
543 if($new == 0)
544 {
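 # (calling &do_image_dirs with no argument list makes perl pass our own @_ along, so it sees $found)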
545 my $image_dir = &do_image_dirs;
546 my $temp_outputdir = $outpdir;
547 if (substr($temp_outputdir, -1, 1) eq "/")
548 {
549 substr($temp_outputdir, -1, 1) = "";
550 }
551
552 # wget is set to 'q - quiet' and 'nc - don't clobber existing files'
553 $kommand = $wget.' -qnc --directory-prefix='.$temp_outputdir.$image_dir.' "'.$found.'"';
554 system ("$kommand");
555
556 open (IMAGES, ">>images.txt") or die " ** Can't open images.txt!: $!\n";
557 print IMAGES $temp_found;
558 close(IMAGES);
559
560 $image_list[$image_pointer] = $temp_found;
561 $image_pointer++;
562
563 # grab corresponding ON pictures for navigation bar if we've just dl-ed the OFF picture
564 if(substr($found , -6) eq "of.gif")
565 {
566 substr($found, -6, 6) = "on.gif";
567 &its_an_image($found, $outpdir);
568 }
569 }
570}