source: gsdl/trunk/perllib/downloaders/OAIDownload.pm@ 17549

Last change on this file since 17549 was 17549, checked in by ak19, 16 years ago

Changes to sudden wget download termination when OAIDownload.pm is used: OAIDownload.pm launches wget several times, one after another (after each previous wget instance has terminated). Therefore when the STOP signal is sent from GLI, the OAIDownload.pm script should stop altogether and issue no more calls to wget. This is now accomplished by checking the new member variable self->force_quit which is set to true when WgetDownload receives a STOP signal from GLI.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
42sub BEGIN {
43 @OAIDownload::ISA = ('WgetDownload');
44}
45
46my $arguments =
47 [ { 'name' => "url",
48 'disp' => "{OAIDownload.url_disp}",
49 'desc' => "{OAIDownload.url}",
50 'type' => "string",
51 'reqd' => "yes"},
52 { 'name' => "metadata_prefix",
53 'disp' => "{OAIDownload.metadata_prefix_disp}",
54 'desc' => "{OAIDownload.metadata_prefix}",
55 'type' => "string",
56 'deft' => "oai_dc",
57 'reqd' => "no"},
58 { 'name' => "set",
59 'disp' => "{OAIDownload.set_disp}",
60 'desc' => "{OAIDownload.set}",
61 'type' => "string",
62 'reqd' => "no"},
63 { 'name' => "get_doc",
64 'disp' => "{OAIDownload.get_doc_disp}",
65 'desc' => "{OAIDownload.get_doc}",
66 'type' => "flag",
67 'reqd' => "no"},
68 { 'name' => "get_doc_exts",
69 'disp' => "{OAIDownload.get_doc_exts_disp}",
70 'desc' => "{OAIDownload.get_doc_exts}",
71 'type' => "string",
72 'deft' => "doc,pdf,ppt",
73 'reqd' => "no"},
74 { 'name' => "max_records",
75 'disp' => "{OAIDownload.max_records_disp}",
76 'desc' => "{OAIDownload.max_records}",
77 'type' => "int",
78 'deft' => "500",
79 'range' => "1,",
80 'reqd' => "no"} ];
81
82my $options = { 'name' => "OAIDownload",
83 'desc' => "{OAIDownload.desc}",
84 'abstract' => "no",
85 'inherits' => "yes",
86 'args' => $arguments };
87
88##my $self;
89
90#### my $strWgetOptions="";
91
92sub new
93{
94 my ($class) = shift (@_);
95 my ($getlist,$inputargs,$hashArgOptLists) = @_;
96 push(@$getlist, $class);
97
98 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99 push(@{$hashArgOptLists->{"OptList"}},$options);
100
101 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
102
103 if ($self->{'info_only'}) {
104 # don't worry about any options etc
105 return bless $self, $class;
106 }
107
108 my $parser = new XML::Parser('Style' => 'Stream',
109 'PluginObj' => $self,
110 'Handlers' => {'Char' => \&Char,
111 'Start' => \&OAI_StartTag,
112 'End' => \&OAI_EndTag
113 });
114 $self->{'parser'} = $parser;
115
116 # make sure the tmp directory that we will use later exists
117 my $tmp_dir = "$ENV{GSDLHOME}/tmp";
118 if (! -e $tmp_dir) {
119 &util::mk_dir($tmp_dir);
120 }
121
122
123 # set up hashmap for individual items in get_doc_exts
124 # to make testing for matches easier
125
126 $self->{'lookup_exts'} = {};
127 my $get_doc_exts = $self->{'get_doc_exts'};
128
129 if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
130 my @exts = split(/,\s*/,$get_doc_exts);
131 foreach my $e (@exts) {
132 $self->{'lookup_exts'}->{lc($e)} = 1;
133 }
134 }
135
136
137 return bless $self, $class;
138}
139
140sub download
141{
142 my ($self) = shift (@_);
143 my ($hashGeneralOptions) = @_;
144
145## my $cmdWget = $strWgetOptions;
146
147 my $strOutputDir ="";
148 $strOutputDir = $hashGeneralOptions->{"cache_dir"};
149 my $strBasURL = $self->{'url'};
150 my $intMaxRecords = $self->{'max_records'};
151 my $blnDownloadDoc = $self->{'get_doc'};
152
153 print STDERR "<<Defined Maximum>>\n";
154
155 my $strIDs = $self->getOAIIDs($strBasURL);
156
157 if($strIDs eq "")
158 {
159 print STDERR "Error: No IDs found\n";
160 return 0;
161 }
162
163 my $aryIDs = $self->parseOAIIDs($strIDs);
164 my $intIDs = 0;
165 if($self->{'max_records'} < scalar(@$aryIDs))
166 {
167 $intIDs = $self->{'max_records'};
168 }
169 else
170 {
171 $intIDs = scalar(@$aryIDs);
172 }
173 print STDERR "<<Total number of record(s):$intIDs>>\n";
174
175 $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);
176
177# my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
178# &util::rm($tmp_file);
179
180 return 1;
181}
182
183sub getOAIIDs
184{
185 my ($self,$strBasURL) = @_;
186## my ($cmdWget);
187
188 my $wgetOptions = $self->getWgetOptions();
189
190 my $cmdWget = $wgetOptions;
191
192 print STDERR "Gathering OAI identifiers.....\n";
193
194 my $metadata_prefix = $self->{'metadata_prefix'};
195 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
196
197 # if $set specified, add it in to URL
198 my $set = $self->{'set'};
199 $cmdWget .= "&set=$set" if ($set ne "");
200
201 $cmdWget .= "\" ";
202
203 my $accumulated_strIDs = "";
204 my $strIDs = $self->useWget($cmdWget);
205
206 if (!defined $strIDs or $strIDs eq "" ){
207 print STDERR "Server information is unavailable.\n";
208 print STDERR "<<Finished>>\n";
209 return;
210 }
211 if ($self->{'forced_quit'}) {
212 return $strIDs;
213 }
214
215 print STDERR "<<Download Information>>\n";
216
217 $self->parse_xml($strIDs);
218
219 $accumulated_strIDs = $strIDs;
220
221 while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/) {
222 # top up list with further requests for IDs
223
224 my $resumption_token = $1;
225
226 $cmdWget = $wgetOptions;
227
228 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
229
230 $strIDs = $self->useWget($cmdWget);
231 if ($self->{'forced_quit'}) {
232 return $accumulated_strIDs;
233 }
234
235 $self->parse_xml($strIDs);
236
237 $accumulated_strIDs .= $strIDs;
238
239 my @accumulated_identifiers
240 = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
241
242 my $num_acc_identifiers = scalar(@accumulated_identifiers);
243 if ($num_acc_identifiers > $self->{'max_records'}) {
244 last;
245 }
246 }
247
248 return $accumulated_strIDs;
249}
250
251sub parseOAIIDs
252{
253 my ($self,$strIDs) = @_;
254
255 print STDERR "Parsing OAI identifiers.....\n";
256 $strIDs =~ s/^.*?<identifier>/<identifier>/s;
257 $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
258
259 my @aryIDs = ();
260
261 while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
262 {
263 $strIDs = $2;
264 push(@aryIDs,$1);
265 }
266
267 return \@aryIDs;
268}
269
270sub dirFileSplit
271{
272 my ($self,$strFile) = @_;
273
274 my @aryDirs = split("[/\]",$strFile);
275
276 my $strLocalFile = pop(@aryDirs);
277 my $strSubDirs = join("/",@aryDirs);
278
279 return ($strSubDirs,$strLocalFile);
280}
281
282sub getOAIDoc
283{
284 my ($self,$strRecord, $oai_rec_filename) = @_;
285
286 print STDERR "Gathering source documents.....\n";
287 # look out for identifier tag in metadata section
288
289 if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
290 {
291 my $strMetaTag = $1;
292 my $had_valid_url = 0;
293
294 while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
295 {
296 my $doc_id_url = $2;
297
298 next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
299
300 my $orig_doc_id_url = $doc_id_url;
301 $had_valid_url = 1;
302
303 my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
304 my $faked_ext = 0;
305 my $primary_doc_match = 0;
306
307 my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
308
309 if (defined $id_file_ext) {
310 # cross-check this filename extension with get_doc_exts option
311 # if provided
312 my $lookup_exts = $self->{'lookup_exts'};
313
314 if (defined $lookup_exts->{lc($id_file_ext)}) {
315 # this initial URL matches requirement
316 $primary_doc_match = 1;
317 }
318 }
319 else {
320 $faked_ext = 1;
321 $id_file_ext = "html";
322 }
323
324
325 if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
326 # Download this doc if HTML, scan through it looking for a link
327 # that does match get_doc_exts
328
329
330 # 1. Generate a tmp name
331 my $tmp_filename = &util::get_tmp_filename();
332
333 # 2. Download it
334 my $wget_opts2 = $self->getWgetOptions();
335 my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
336
337 my ($stdout_and_err2,$error2,$follow2) = $self->useWgetMonitored($wget_cmd2);
338 return $strRecord if $self->{'forced_quit'};
339
340 if($error2 ne "")
341 {
342 print STDERR "Error occured while retrieving OAI source documents: $error2\n";
343 exit(-1);
344 }
345
346 if (defined $follow2) {
347 # src url was "redirected" to another place
348 # => pick up on this and make it the new doc_id_url
349 $doc_id_url = $follow2;
350 }
351
352 my $primary_doc_html = "";
353 if (open(HIN,"<$tmp_filename")) {
354 my $line;
355 while (defined ($line = <HIN>)) {
356 $primary_doc_html .= $line;
357 }
358 close(HIN);
359
360 # 3. Scan through it looking for match
361 #
362 # if got match, change $doc_id_url to this new URL and
363 # $id_file_ext to 'match'
364
365 my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
366
367 my $lookup_exts = $self->{'lookup_exts'};
368
369 foreach my $href (@href_links) {
370 my ($ext) = ($href =~ m/\.([^\.]+)$/);
371
372 if ((defined $ext) && (defined $lookup_exts->{$ext})) {
373
374 if ($href !~ m/^(https?|ftp):\/\//) {
375 # link is within current site
376 my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
377
378 $href = "$site_domain$href";
379 }
380
381 $doc_id_url = $href;
382 $id_file_ext = $ext;
383 last;
384 }
385 }
386 }
387 else {
388 print STDERR "Error occurred while retrieving OAI source documents:\n";
389 print STDERR "$!\n";
390 }
391
392 if (-e $tmp_filename) {
393 &util::rm($tmp_filename);
394 }
395 }
396
397 my $download_doc_filename = $oai_rec_filename;
398 $download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
399
400 my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
401
402 my $wget_opts = $self->getWgetOptions();
403 my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$doc_id_url\"";
404
405 my ($stdout_and_err,$errors,$follow) = $self->useWgetMonitored($wget_cmd);
406 return $strRecord if $self->{'forced_quit'};
407
408 if($errors ne "")
409 {
410 print STDERR "Error occured while retriving OAI souce documents:\n";
411 print STDERR "$errors\n";
412 exit(-1);
413 }
414
415
416 $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$orig_doc_id_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<${2}identifier>$orig_doc_id_url<\/${2}identifier>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
417 }
418
419 if (!$had_valid_url)
420 {
421 print STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
422 }
423 }
424 else
425 {
426 print STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
427 }
428
429 return $strRecord;
430}
431
432sub getOAIRecords
433{
434 my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
435
436 my $intDocCounter = 0;
437
438 my $metadata_prefix = $self->{'metadata_prefix'};
439
440 foreach my $strID ( @$aryIDs)
441 {
442 print STDERR "Gathering OAI record with ID $strID.....\n";
443
444 my $wget_opts = $self->getWgetOptions();
445 my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
446
447 my $strRecord = $self->useWget($cmdWget);
448
449 my @fileDirs = split(":",$strID);
450 my $local_id = pop @fileDirs;
451
452 # setup directories
453
454 $strOutputDir =~ s/"//g; #"
455
456 my $host =$self->{'url'};
457
458 $host =~ s/https?:\/\///g;
459
460 $host =~ s/:.*//g;
461
462 my $strFileURL = "$strOutputDir/$host/$local_id.oai";
463
464
465 # prepare subdirectory for record (if needed)
466 my ($strSubDirPath,$unused) = ("", "");
467
468 ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
469
470 &util::mk_all_dir($strSubDirPath);
471
472 my $ds = &util::get_dirsep();
473
474 if($blnDownloadDoc)
475 {
476 $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
477 }
478
479 # save record
480 open (OAIOUT,">$strFileURL")
481 || die "Unable to save oai metadata record: $!\n";
482 print OAIOUT $strRecord;
483 close(OAIOUT);
484
485 print STDERR "Saving records to $strFileURL\n";
486 print STDERR "<<Done>>\n";
487 $intDocCounter ++;
488 last if ($intDocCounter >= $intMaxRecords);
489 }
490
491 ($intDocCounter >= $intMaxRecords) ?
492 print STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
493 print STDERR "Complete download meta record from $strBasURL\n";
494
495 print STDERR "<<Finished>>\n";
496}
497
498sub url_information
499{
500 my ($self) = shift (@_);
501 if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
502
503 my $wgetOptions = $self->getWgetOptions();
504 my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
505
506 my $strIdentify = "verb=Identify";
507 my $strListSets = "verb=ListSets";
508 my $strListMdFormats = "verb=ListMetadataFormats";
509
510 my $strIdentifyCMD = $strBaseCMD;
511 $strIdentifyCMD =~ s/_OPTS_/$strIdentify/;
512
513 my $strIdentifyText = $self->useWget($strIdentifyCMD);
514
515 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
516 print STDERR "Server information is unavailable.\n";
517 print STDERR "<<Finished>>\n";
518 return;
519 }
520
521 print STDERR "General information:\n";
522 $self->parse_xml($strIdentifyText);
523 print STDERR "\n";
524
525 print STDERR "=" x 10, "\n";
526 print STDERR "Metadata Format Information (metadataPrefix):\n";
527 print STDERR "=" x 10, "\n";
528
529 my $strListMdFormatsCMD = $strBaseCMD;
530 $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;
531 my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
532
533 $self->parse_xml($strListMdFormatsText);
534 print STDERR "\n";
535
536 print STDERR "=" x 10, "\n";
537 print STDERR "List Information:\n";
538 print STDERR "=" x 10, "\n";
539
540 my $strListSetCMD = $strBaseCMD;
541 $strListSetCMD =~ s/_OPTS_/$strListSets/;
542 my $strListSetsText = $self->useWget($strListSetCMD);
543
544 $self->parse_xml($strListSetsText);
545}
546
547sub parse_xml
548{
549 my ($self) = shift (@_);
550 my ($xml_text) = @_;
551
552 #### change this to work directly from $xml_text
553
554 #Open a temporary file to store OAI information, and store the information to the temp file
555 my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
556
557 open(*OAIOUT,"> $name");
558
559 print OAIOUT $xml_text;
560 close(OAIOUT);
561
562 $self->{'temp_file_name'} = $name;
563
564## print STDERR "**** xml text = $xml_text\n";
565
566 eval {
567 $self->{'parser'}->parsefile("$name");
568## $self->{'parser'}->parse($xml_text);
569 };
570
571 if ($@) {
572 die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
573## die "OAI: Parsed text is not a well formed XML file ($@)\n";
574 }
575
576 unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
577}
578
579####END
580#{
581# if($self->{'info'})
582# {
583# unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
584# }
585#}
586
587# This Char function overrides the one in XML::Parser::Stream to overcome a
588# problem where $expat->{Text} is treated as the return value, slowing
589# things down significantly in some cases.
590sub Char {
591 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
592 $_[0]->{'Text'} .= $_[1];
593
594 my $self = $_[0]->{'PluginObj'};
595 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
596 $self->{'text'} .= $_[1];
597 $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
598 if($self->{'text'} ne "")
599 {
600 print STDERR " $self->{'subfield'}:($self->{'text'})\n";
601 }
602 }
603 return undef;
604}
605
606sub OAI_StartTag
607{
608 my ($expat, $element, %attr) = @_;
609
610 my $self = $expat->{'PluginObj'};
611 $self->{'subfield'} = $element;
612
613}
614
615sub OAI_EndTag
616{
617 my ($expat, $element) = @_;
618
619 my $self = $expat->{'PluginObj'};
620 $self->{'text'} = "";
621 $self->{'subfield'} = "";
622}
623
624sub error
625{
626 my ($self,$strFunctionName,$strError) = @_;
627 {
628 print "Error occoured in OAIDownload.pm\n".
629 "In Function:".$strFunctionName."\n".
630 "Error Message:".$strError."\n";
631 exit(-1);
632 }
633}
634
635
636
6371;
Note: See TracBrowser for help on using the repository browser.