#!/usr/bin/perl -w

use util;

# Both this script and its associated process_html.pl were written by
# Marcel ?, while a student at Waikato University. Unfortunately he
# was very new to perl at the time, so the code is neither as clean nor
# as fast as it could be (I've cleaned up a few of the more serious
# bottlenecks -- it could do with a lot more work though). The code
# does work though, if a little slowly. It's not ready for primetime,
# however, and is included in the Greenstone source tree mostly so that
# I don't lose it. -- Stefan - 24 Jul 2001


# This script will download an entire collection (and its associated pictures and files)
# and store them in a temporary directory ($outputdir).
# A second script (process_html.pl) can then be used to 'rebuild' the collection and link all the
# downloaded pages and pictures together into a usable static collection.

# This script will generate a number of text files required by the second script
# and for possible recovery, namely:
# - links.txt      - contains all the http links that were downloaded
# - images.txt     - contains a list of all the images that were downloaded
# - image_dirs.txt - contains a list of all the image-prefixes that need to be wiped from the html files
#                    (otherwise you won't get to see any pictures)

# Both this script and the html processing script have a recovery feature built in: they can continue from wherever
# they left off, but this only works if $outputdir and $finaldir are set to different values.
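# For reference, each line of links.txt is simply a full URL (with the
# extra options already appended), newline-terminated, eg. the start
# address below would be stored as:
# http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1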

# This is where all the downloaded html files end up, eg. 'temp_html/'
my $outputdir = 'temp_html/';
# This is where all the processed html files (and the pictures) will end up, eg. 'my_static_collection/'
my $finaldir = 'envl_collection/';

# This is where we start our mirroring
$address = 'http://nowhere.com/cgi-bin/library?a=p&p=about&c=demo&u=1';


# Whatever is specified in $option will be attached to the end of each html link before it is downloaded,
# eg. "&u=1" to disable various features in the greenstone collections that are not needed with a static
# collection.
# Another example: "&l=nl" to set the entire collection to Dutch (NetherLands :)
my $option = "&u=1";

# Most OSes have a limit on the maximum number of files per directory (or folder).
# A static collection can easily contain >3000 html files. Putting all those files
# into one single directory is just asking for trouble. It's also very unwieldy ;^)
# Hence... this value sets how many html files will be stored in one directory.
# These directories themselves will be numbered,
# so if $dir_entries = 500 then the directories will be "0/", "500/", "1000/", "1500/", "2000/", etc.
my $dir_entries = 250;
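# (In other words, html file number N lands in the directory numbered
#  N - (N % $dir_entries), so with the default of 250 above, 614.html
#  is stored under "500/".)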

# Occasionally a page occurs which contains no data (because &cl is not set). This option fixes that.
my $fix_empty_pages = "&cl=CL1";

# These are the file types that wget will download.
# More can be added if necessary.
my @graphic_formats = ('.gif','.jpg','.bmp','.png','.pdf','.mov','.mpeg','.jpeg','.rm');

# ---------------------------------------[ System specific options ]----------------------------------------------------

# The lynx variable specifies the command line for the lynx web browser
# -- This is what I use under dos/win32
# my $lynx = 'e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg';

# -- This is what I use under linux
my $lynx = 'lynx';

# and the same for the wget utility
my $wget = 'wget';

# NB: There is one other linux-specific command all the way at the end of this script, where I've used 'cp' to copy a file.

# Another NB: When saving the dl-ed html files to disk, I've set lynx to dump the html source to standard output,
# which I then simply redirect to a target file, BUT
# this does not work under DOS/win32. Redirecting standard output in a script causes it to be displayed on
# the screen instead. The easiest way I found to get around this was to do the actual redirection in a simple
# batch file (say grab.bat), which contains the following line:
# @e:\lynx_w32\lynx -cfg=e:\lynx_w32\lynx.cfg -dump -source "%1" > %2
#
# Then replace the 'system ("$kommand > $target");' line in the main loop below with 'system("grab.bat $address $target");'
# Not a very elegant solution, but it works :)
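#
# A more portable alternative (just a sketch, untested) would be to skip
# the shell redirection entirely and read lynx's output through a pipe:
#
#   open (LYNX, "$kommand |") or die " ** Can't run lynx: $!\n";
#   open (TARGET, ">$target") or die " ** Can't write $target: $!\n";
#   print TARGET $_ while (<LYNX>);
#   close(LYNX); close(TARGET);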

#------------------------------------------------------------------------------------------------------------------------

my @image_list;
my $image_pointer = 0;

my @linkz_list;
my @short_linkz_list;
my $linkz_pointer = 0;

my @image_dirs_list;
my $image_dirs_pointer = 0;

my $numberdir = 0;

my $start_time = (times)[0];

# check if directories exist and create them if necessary..
if ((-e $outputdir)&&(-d $outputdir))
{
    print " ** ",$outputdir," directory already exists..\n";
}
else
{
    print " ** Creating ",$outputdir," directory..\n";
    mkdir($outputdir, 0777) or die " Cannot create output directory: $!\n";
}

if ((-e $finaldir)&&(-d $finaldir))
{
    print " ** ",$finaldir," directory already exists..\n";
}
else
{
    print " ** Creating ",$finaldir," directory..\n";
    mkdir($finaldir, 0777) or die " Cannot create final directory: $!\n";
}

#-----------------------------------------------------------------------------------------------
# No need to start from scratch every time; we can recover/continue from wherever we left off
# simply by checking which html files have already been created
#-----------------------------------------------------------------------------------------------

$linknumber = 0;    # used to name/number the dl-ed html files

my $failed = 0;
while ($failed == 0)
{
    if ($linknumber % $dir_entries == 0)
    {
        if (!((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber)))
        {
            $failed++;
            mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
        }
        $numberdir = $linknumber;
    }

    $check_file = $outputdir.$numberdir."/".$linknumber.".html";
    if ((-e $check_file)&&($failed == 0))
    {
        $linknumber++;
    }
    else
    {
        $failed++;
        # I'm subtracting 1 from the starting link,
        # just in case it only loaded half the page ;^)
        if ($linknumber > 0)
        {
            $linknumber--;
        }
        print " Will start downloading at number $linknumber \n";
    }
}

# if we're starting from scratch, then we might as well nuke the links file
#if ($linknumber == 0)
#{
#    print " Starting from scratch - clobbering the old text files...\n";
#    if (-e 'links.txt')
#    {
#        print " Removing links.txt...\n";
#        unlink <links.txt> or print " ** Cannot delete links textfile: $!\n";
#    }
#    if (-e 'images.txt')
#    {
#        print " Removing images.txt...\n";
#        unlink <images.txt> or print " ** Cannot delete images textfile: $!\n";
#    }
#    if (-e 'image_dirs.txt')
#    {
#        print " Removing image_dirs.txt...\n";
#        unlink <image_dirs.txt> or print " ** Cannot delete image_dirs textfile: $!\n";
#    }
#}

# if we're NOT starting from scratch, then read the old links in from the links text file
# and grab the old image-links as well...
if ($linknumber != 0)
{
    # load the old links from links.txt; if it doesn't exist, then give up :(
    my $this = "";
    my $that = "";
    open (CHECK, "links.txt") or die " ** Cannot find/open links.txt file!: $! **\n";
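    # (The loop below rebuilds each newline-terminated entry one byte at a
    #  time. A simpler way to slurp the raw lines -- just a sketch -- would
    #  be @linkz_list = <CHECK>; though the loop below also extracts the
    #  short '?a='/'&a=' form of each link as it goes.)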
    while (eof CHECK == 0)
    {
        while ($this ne "\n")
        {
            read CHECK, $this, 1;
            $that = $that.$this;
        }
        $linkz_list[$linkz_pointer] = $that;

        for my $search(0 .. (length($that) - 3))
        {
            if ((substr($that, $search, 3) eq '?a=')||(substr($that, $search, 3) eq '&a='))
            {
                $short_linkz_list[$linkz_pointer] = substr($that, $search);
                last;
            }
        }
        $linkz_pointer++;
        $that = ""; $this = "";
    }
    close(CHECK);
    print "- I found ",($#linkz_list + 1)," links in links.txt -\n";

    # make sure that we start dl-ing the correct first page
    $address = $linkz_list[$linknumber];

    # load the old image links from images.txt (if it doesn't exist, no big deal ;)
    my $im_this = "";
    my $im_that = "";
    open (IMAGES, "images.txt") || print " ** Cannot find/open images.txt file! : $! **\n";
    while (eof IMAGES == 0)
    {
        while ($im_this ne "\n")
        {
            read IMAGES, $im_this, 1;
            $im_that = $im_that.$im_this;
        }
        $image_list[$image_pointer] = $im_that;
        $image_pointer++;
        $im_that = ""; $im_this = "";
    }
    close(IMAGES);
    print "- I found ",($#image_list + 1)," picture-links in images.txt -\n";

    # ..and last but not least, load any image dirs from image_dirs.txt
    # again, if it's not there, no big deal :)
    my $imd_this = "";
    my $imd_that = "";
    open (IMAGE_DIR, "image_dirs.txt") || print " ** Cannot find/open image_dirs.txt file!: $! **\n";
    while (eof IMAGE_DIR == 0)
    {
        while ($imd_this ne "\n")
        {
            read IMAGE_DIR, $imd_this, 1;
            $imd_that = $imd_that.$imd_this;
        }
        $image_dirs_list[$image_dirs_pointer] = $imd_that;
        $image_dirs_pointer++;
        $imd_that = ""; $imd_this = "";
    }
    close(IMAGE_DIR);
    print "- I found ",($#image_dirs_list + 1)," picture directories in image_dirs.txt -\n";
}

# Just keep going till we can find no more new links
while (($#linkz_list < 0) || ($#linkz_list + 1 > $linknumber))
{

    # This line specifies the command line for the lynx web browser
    my $kommand = $lynx.' -dump -image_links "'.$address.'"';

    # dump the page into a text array and find the starting point of the references/links
    chomp(@data = `$kommand`);
    for my $i(0 .. $#data)
    {
        if ($data[$i] eq "References") { $here = $i; }
    }
    $here = $here + 2;

    # process the references/links
    for my $i($here .. $#data)
    {

        $its_an_image = 0;

        # chop off the ref's leading number & spaces (ie. the '1. ' in '1. http://www.cs.waikato.ac.nz')

        # $temp = substr($data[$i],3);
        # @temp = split(/ /, $temp, 2);

        # check if the last few characters of the link equal .gif .jpg .png .bmp .pdf .mov .mpeg etc etc

        # for my $g(0 .. $#graphic_formats)
        # {
        #     if(substr($temp[1],(length($graphic_formats[$g]) * -1)) eq $graphic_formats[$g])
        #     {
        #         $its_an_image = 1;
        #     }
        # }

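        # ...which is now done with a pair of regular expressions instead:
        # strip the leading "NN. " reference number, then test the file
        # extension case-insensitively against the formats listed above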
        $data[$i] =~ s/^\s*\d+\.\s+//;
        if ($data[$i] =~ /\.(gif|jpe?g|png|bmp|pdf|mov|mpe?g|rm)$/i) {
            $its_an_image = 1;
        }

        # ignore mailto urls
        if ($data[$i] !~ /mailto:/i) {

            #----------- the link is NOT an image ----------------
            if ($its_an_image == 0)
            {
                &its_a_link($data[$i], $outputdir);
            }

            #----------- the link IS an image --------------------
            if ($its_an_image != 0)
            {
                &its_an_image($data[$i], $finaldir);
            }
        }
    }

    # save the web page to disk (in the appropriate numbered directory)
    $kommand = $lynx.' -dump -source "'.$address.'"';

    if ($linknumber % $dir_entries == 0)
    {
        if ((-e $outputdir.$linknumber)&&(-d $outputdir.$linknumber))
        {
            print " ** ",$outputdir.$linknumber, " - Directory already exists.\n";
        }
        else
        {
            mkdir($outputdir.$linknumber, 0777) or print " ** Cannot create ",$outputdir.$linknumber, "!: $!\n";
            mkdir($finaldir.$linknumber, 0777) or print " ** Cannot create ",$finaldir.$linknumber, "!: $!\n";
        }
        $numberdir = $linknumber;
    }
    my $target = $outputdir.$numberdir."/".$linknumber.".html";

    #---------------------------------------------------------------------------------------------------------------
    # NOTE: This next command will NOT work under win32/dos, as redirecting standard output in a script causes it to
    # be dumped straight to the screen as opposed to into the target file (see the grab.bat workaround above).
    #---------------------------------------------------------------------------------------------------------------
    system ("$kommand > $target");
    #---------------------------------------------------------------------------------------------------------------

    print " Saved $target\n";

    $linknumber++;

    $address = $linkz_list[$linknumber];
}

my $end_time = (times)[0];

print "\n\n\n *----------------------------*\n";
print " | Whew! Task completed! :-D  |\n";
print " *----------------------------*\n";
printf " Script took %.2f CPU seconds to complete ;^)\n", $end_time - $start_time;
print "\n\n";
print " Now execute the process_html.pl script to link the downloaded collection together.\n";
print " Please do make sure that it is executed with the same options as this script ;-)\n";

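# Takes a link found on the current page, normalises it (custom options,
# the &cl/&d trickery below, .pr OIDs), skips it if we have already seen
# it (or its short '?a=' form), and otherwise appends it to links.txt
# and the in-memory lists.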
sub its_a_link
{
    local($found) = @_;
    # local($ok = 0, $kommand);
    local($kommand);
    local $short_link = "";

    return if ($found =~ /\#.*$/);

    # attach the custom options
    $found .= $option;

    # a little bit of trickery here - check if there is a &d= option present in the link;
    # if there is, then wipe the &cl= option!
    # This should cut down multiple copies by 75%!!

    # But if there is no &d option, and the &cl option is not set, then we have to set the &cl option to something,
    # otherwise we get pages which contain no data :\

    if ($found =~ /[&\?]a=d/) {
        if ($found =~ /[&\?]d=/) {
            $found =~ s/[&\?]cl=[^&]*//;
        } elsif ($found !~ /[&\?]cl=/) {
            $found .= $fix_empty_pages;
        }
    }

    # we also want to sort out any xxx.pr OIDs that we come across
    $found =~ s/([&\?](cl|d)=.*?)\.\d+\.pr/$1/g;
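    # (ie. a '&d=...' or '&cl=...' argument ending in something like '.2.pr'
    #  is truncated back to the plain OID)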

    # attach the EOL character.
    $found = $found."\n";


    # the hard way !!!
    # for my $search(0 .. (length($found) - 3))
    # {
    #     if((substr($found, $search, 3) eq '?d=')||(substr($found, $search, 3) eq '&d='))
    #     {
    #         for my $second_search(0 .. (length($found) - 4))
    #         {
    #             if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
    #             {
    #                 for my $third_search(($second_search + 3) .. (length($found) - 1))
    #                 {
    #                     if((substr($found, $third_search, 1)) eq '&')
    #                     {
    #                         substr($found, $second_search, $third_search - $second_search) = "";
    #                         last;
    #                     }
    #                 }
    #                 last;
    #             }
    #         }
    #         last;
    #     }
    #     else
    #     {
    #         if( $search == (length($found) - 3))
    #         {
    #             for my $second_search(0 .. (length($found) - 4))
    #             {
    #                 if((substr($found, $second_search, 4) eq '?cl=')||(substr($found, $second_search, 4) eq '&cl='))
    #                 {
    #                     for my $third_search(($second_search + 3) .. (length($found) - 1))
    #                     {
    #                         if((substr($found, $third_search, 1)) eq '&')
    #                         {
    #                             if (substr($found, $second_search, $third_search - $second_search) eq '&cl=')
    #                             {
    #                                 substr($found, $second_search, $third_search - $second_search) = $fix_empty_pages;
    #                             }
    #                             last;
    #                         }
    #                     }
    #                     last;
    #                 }
    #             }
    #         }
    #     }
    # }

    # grab the last part of the link (ignoring the start and the &e option)
    # for my $search(0 .. (length($found) - 3))
    # {
    #     if((substr($found, $search, 3) eq '?a=')||(substr($found, $search, 3) eq '&a='))
    #     {
    #         $short_link = substr($found, $search);
    #         last;
    #     }
    # }

    ($short_link) = $found =~ /\?(.*)$/;
    $short_link =~ s/(^|&)e=[^&]*/$1/;
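    # ie. keep everything after the '?' and drop the &e option, so that the
    # same page compares equal no matter what its &e value happens to be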


    # this filters out multiple copies of, for example, the help page, which has #something at the end of its links
    # (this is now done up front by the regular expression near the top of this sub -- Stefan)

    # for my $search(0 .. length($found))
    # {
    #     if ((substr($found, $search, 1)) eq '#')
    #     {
    #         $ok++;
    #         last;
    #     }
    # }


    # compare the found link to the links we've stored in the arrays (compares both the full link and the partial link)
    for my $search(0 .. $#linkz_list)
    {
        return if ($found eq $linkz_list[$search]);
        return if ($short_link eq $short_linkz_list[$search]);
    }

    # if the found link is not in the links array, add it
    open (DUMP, ">>links.txt") or die " ** Can't open links.txt!: $!\n";
    print DUMP $found;
    close(DUMP);

    $linkz_list[$linkz_pointer] = $found;
    $short_linkz_list[$linkz_pointer] = $short_link;
    $linkz_pointer++;
}

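# Given an image URL, work out (scanning from the right-hand end) the
# directory the image sits in; record the longer directory prefix in
# image_dirs.txt so that process_html.pl can wipe it from the html later,
# and return the single enclosing directory (eg. '/images') to be tacked
# onto wget's --directory-prefix.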
sub do_image_dirs
{
    local($found) = @_;
    my $count = 0;
    my @br_index;
    my $image_dir = "";
    my $new_dir = 0;

    for my $search(1 .. (length($found) - 1))
    {
        $bracket = substr($found, ($search * -1), 1);
        if ($bracket eq '/')
        {
            $count++;
            $br_index[$count] = $search;
        }
        if ($count == 2)
        {
            $image_dir = substr($found, ($br_index[2] * -1), ($br_index[2] - $br_index[1]));
        }
    }

    my $dirs_to_wipe = substr($found, $br_index[$#br_index - 2] * -1, $br_index[$#br_index - 2] - $br_index[2] + 1)."\n";

    for my $counter(0 .. $#image_dirs_list)
    {
        if ($dirs_to_wipe eq $image_dirs_list[$counter])
        {
            $new_dir++;
        }
    }

    if ($new_dir == 0)
    {
        open (IMAGE_DIRS, ">>image_dirs.txt") or die " ** Can't open image_dirs.txt!: $!\n";
        print IMAGE_DIRS $dirs_to_wipe;
        close(IMAGE_DIRS);
        $image_dirs_list[$image_dirs_pointer] = $dirs_to_wipe;
        $image_dirs_pointer++;
    }

    print " ",substr($finaldir, 0, length($finaldir) - 1).$image_dir.substr($found, ($br_index[1] * -1), length($found) - (length($found) - $br_index[1])),"\n";

    return $image_dir;
}

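# Download an image with wget unless it is already in the list, record it
# in images.txt, and if we've just fetched an "off" navigation-bar picture
# (*of.gif), fetch the matching "on" picture as well.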
sub its_an_image
{
    local($found, $outpdir) = @_;
    local($kommand);
    my $new = 0;

    my $temp_found = $found . "\n";

    # check if the image is already in the list
    for my $counter(0 .. $#image_list)
    {
        if ($temp_found eq $image_list[$counter])
        {
            $new++;
        }
    }

    # only download the image if it's not in the list..
    if ($new == 0)
    {
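        # NB: calling &do_image_dirs with no argument list is the old perl
        # idiom for passing this sub's own @_ along (ie. the image URL)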
        my $image_dir = &do_image_dirs;
        my $temp_outputdir = $outpdir;
        if (substr($temp_outputdir, -1, 1) eq "/")
        {
            substr($temp_outputdir, -1, 1) = "";
        }

        # the wget binary depends on the gnomelib env (particularly lib/libiconv2.dylib) being set, especially on Mac Lions (android too?)
        &util::set_gnomelib_env(); # this will set the gnomelib env once per subshell launched, by first checking if GEXTGNOME is not already set

        # wget is set to '-q - quiet' and '-nc - don't clobber existing files'
        $kommand = $wget.' -qnc --directory-prefix='.$temp_outputdir.$image_dir.' "'.$found.'"';
        system ("$kommand");

        open (IMAGES, ">>images.txt") or die " ** Can't open images.txt!: $!\n";
        print IMAGES $temp_found;
        close(IMAGES);

        $image_list[$image_pointer] = $temp_found;
        $image_pointer++;

        # grab the corresponding ON picture for the navigation bar if we've just dl-ed the OFF picture
        if (substr($found, -6) eq "of.gif")
        {
            substr($found, -6, 6) = "on.gif";
            &its_an_image($found, $outpdir);
        }
    }
}