source: gsdl/trunk/perllib/downloaders/OAIDownload.pm@ 20926

Last change on this file since 20926 was 20926, checked in by kjdon, 14 years ago

if can't download a source doc for some reason, move on to the next one. Don't quit, otherwise won't get any more of the records downloaded. This way, at least one gets the records even if can't get the source docs. doc urls may have ' in them, convert to ' before trying to download

  • Property svn:keywords set to Author Date Id Revision
File size: 16.5 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
42sub BEGIN {
43 @OAIDownload::ISA = ('WgetDownload');
44}
45
46my $arguments =
47 [ { 'name' => "url",
48 'disp' => "{OAIDownload.url_disp}",
49 'desc' => "{OAIDownload.url}",
50 'type' => "string",
51 'reqd' => "yes"},
52 { 'name' => "metadata_prefix",
53 'disp' => "{OAIDownload.metadata_prefix_disp}",
54 'desc' => "{OAIDownload.metadata_prefix}",
55 'type' => "string",
56 'deft' => "oai_dc",
57 'reqd' => "no"},
58 { 'name' => "set",
59 'disp' => "{OAIDownload.set_disp}",
60 'desc' => "{OAIDownload.set}",
61 'type' => "string",
62 'reqd' => "no"},
63 { 'name' => "get_doc",
64 'disp' => "{OAIDownload.get_doc_disp}",
65 'desc' => "{OAIDownload.get_doc}",
66 'type' => "flag",
67 'reqd' => "no"},
68 { 'name' => "get_doc_exts",
69 'disp' => "{OAIDownload.get_doc_exts_disp}",
70 'desc' => "{OAIDownload.get_doc_exts}",
71 'type' => "string",
72 'deft' => "doc,pdf,ppt",
73 'reqd' => "no"},
74 { 'name' => "max_records",
75 'disp' => "{OAIDownload.max_records_disp}",
76 'desc' => "{OAIDownload.max_records}",
77 'type' => "int",
78 'deft' => "500",
79 'range' => "1,",
80 'reqd' => "no"} ];
81
82my $options = { 'name' => "OAIDownload",
83 'desc' => "{OAIDownload.desc}",
84 'abstract' => "no",
85 'inherits' => "yes",
86 'args' => $arguments };
87
88##my $self;
89
90#### my $strWgetOptions="";
91
92sub new
93{
94 my ($class) = shift (@_);
95 my ($getlist,$inputargs,$hashArgOptLists) = @_;
96 push(@$getlist, $class);
97
98 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99 push(@{$hashArgOptLists->{"OptList"}},$options);
100
101 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
102
103 if ($self->{'info_only'}) {
104 # don't worry about any options etc
105 return bless $self, $class;
106 }
107
108 my $parser = new XML::Parser('Style' => 'Stream',
109 'PluginObj' => $self,
110 'Handlers' => {'Char' => \&Char,
111 'Start' => \&OAI_StartTag,
112 'End' => \&OAI_EndTag
113 });
114 $self->{'parser'} = $parser;
115
116 # make sure the tmp directory that we will use later exists
117 my $tmp_dir = "$ENV{GSDLHOME}/tmp";
118 if (! -e $tmp_dir) {
119 &util::mk_dir($tmp_dir);
120 }
121
122
123 # set up hashmap for individual items in get_doc_exts
124 # to make testing for matches easier
125
126 $self->{'lookup_exts'} = {};
127 my $get_doc_exts = $self->{'get_doc_exts'};
128
129 if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
130 my @exts = split(/,\s*/,$get_doc_exts);
131 foreach my $e (@exts) {
132 $self->{'lookup_exts'}->{lc($e)} = 1;
133 }
134 }
135
136
137 return bless $self, $class;
138}
139
140sub download
141{
142 my ($self) = shift (@_);
143 my ($hashGeneralOptions) = @_;
144
145## my $cmdWget = $strWgetOptions;
146
147 my $strOutputDir ="";
148 $strOutputDir = $hashGeneralOptions->{"cache_dir"};
149 my $strBasURL = $self->{'url'};
150 my $intMaxRecords = $self->{'max_records'};
151 my $blnDownloadDoc = $self->{'get_doc'};
152
153 print STDERR "<<Defined Maximum>>\n";
154
155 my $strIDs = $self->getOAIIDs($strBasURL);
156
157 if($strIDs eq "")
158 {
159 print STDERR "Error: No IDs found\n";
160 return 0;
161 }
162
163 my $aryIDs = $self->parseOAIIDs($strIDs);
164 my $intIDs = 0;
165 if($self->{'max_records'} < scalar(@$aryIDs))
166 {
167 $intIDs = $self->{'max_records'};
168 }
169 else
170 {
171 $intIDs = scalar(@$aryIDs);
172 }
173 print STDERR "<<Total number of record(s):$intIDs>>\n";
174
175 $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);
176
177# my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
178# &util::rm($tmp_file);
179
180 return 1;
181}
182
183sub getOAIIDs
184{
185 my ($self,$strBasURL) = @_;
186## my ($cmdWget);
187
188 my $wgetOptions = $self->getWgetOptions();
189
190 my $cmdWget = $wgetOptions;
191
192 print STDERR "Gathering OAI identifiers.....\n";
193
194 my $metadata_prefix = $self->{'metadata_prefix'};
195 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
196
197 # if $set specified, add it in to URL
198 my $set = $self->{'set'};
199 $cmdWget .= "&set=$set" if ($set ne "");
200
201 $cmdWget .= "\" ";
202
203 my $accumulated_strIDs = "";
204 my $strIDs = $self->useWget($cmdWget);
205
206 if (!defined $strIDs or $strIDs eq "" ){
207 print STDERR "Server information is unavailable.\n";
208 print STDERR "<<Finished>>\n";
209 return;
210 }
211 if ($self->{'forced_quit'}) {
212 return $strIDs;
213 }
214
215 print STDERR "<<Download Information>>\n";
216
217 $self->parse_xml($strIDs);
218
219 $accumulated_strIDs = $strIDs;
220
221 while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/) {
222 # top up list with further requests for IDs
223
224 my $resumption_token = $1;
225
226 $cmdWget = $wgetOptions;
227
228 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
229
230 $strIDs = $self->useWget($cmdWget);
231 if ($self->{'forced_quit'}) {
232 return $accumulated_strIDs;
233 }
234
235 $self->parse_xml($strIDs);
236
237 $accumulated_strIDs .= $strIDs;
238
239 my @accumulated_identifiers
240 = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
241
242 my $num_acc_identifiers = scalar(@accumulated_identifiers);
243 if ($num_acc_identifiers > $self->{'max_records'}) {
244 last;
245 }
246 }
247
248 return $accumulated_strIDs;
249}
250
251sub parseOAIIDs
252{
253 my ($self,$strIDs) = @_;
254
255 print STDERR "Parsing OAI identifiers.....\n";
256 $strIDs =~ s/^.*?<identifier>/<identifier>/s;
257 $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
258
259 my @aryIDs = ();
260
261 while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
262 {
263 $strIDs = $2;
264 push(@aryIDs,$1);
265 }
266
267 return \@aryIDs;
268}
269
270sub dirFileSplit
271{
272 my ($self,$strFile) = @_;
273
274 my @aryDirs = split("[/\]",$strFile);
275
276 my $strLocalFile = pop(@aryDirs);
277 my $strSubDirs = join("/",@aryDirs);
278
279 return ($strSubDirs,$strLocalFile);
280}
281
282sub getOAIDoc
283{
284 my ($self,$strRecord, $oai_rec_filename) = @_;
285
286 print STDERR "Gathering source documents.....\n";
287 # look out for identifier tag in metadata section
288
289 if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
290 {
291 my $strMetaTag = $1;
292 my $had_valid_url = 0;
293 my $count = 1;
294 while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
295 {
296 my $doc_id_url = $2;
297 print STDERR "Found doc url: $doc_id_url\n";
298 next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
299
300 my $orig_doc_id_url = $doc_id_url;
301 $had_valid_url = 1;
302
303 my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
304 my $faked_ext = 0;
305 my $primary_doc_match = 0;
306
307 my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
308
309 if (defined $id_file_ext) {
310 # cross-check this filename extension with get_doc_exts option
311 # if provided
312 my $lookup_exts = $self->{'lookup_exts'};
313
314 if (defined $lookup_exts->{lc($id_file_ext)}) {
315 # this initial URL matches requirement
316 $primary_doc_match = 1;
317 }
318 }
319 else {
320 $faked_ext = 1;
321 $id_file_ext = "html";
322 }
323
324
325 if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
326 # Download this doc if HTML, scan through it looking for a link
327 # that does match get_doc_exts
328
329
330 # 1. Generate a tmp name
331 my $tmp_filename = &util::get_tmp_filename();
332
333 # 2. Download it
334 my $wget_opts2 = $self->getWgetOptions();
335 my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
336 my ($stdout_and_err2,$error2,$follow2) = $self->useWgetMonitored($wget_cmd2);
337 return $strRecord if $self->{'forced_quit'};
338
339 if($error2 ne "")
340 {
341 print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
342 #exit(-1);
343 next;
344 }
345
346 if (defined $follow2) {
347 # src url was "redirected" to another place
348 # => pick up on this and make it the new doc_id_url
349 $doc_id_url = $follow2;
350 }
351
352 my $primary_doc_html = "";
353 if (open(HIN,"<$tmp_filename")) {
354 my $line;
355 while (defined ($line = <HIN>)) {
356 $primary_doc_html .= $line;
357 }
358 close(HIN);
359
360 # 3. Scan through it looking for match
361 #
362 # if got match, change $doc_id_url to this new URL and
363 # $id_file_ext to 'match'
364
365 my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
366
367 my $lookup_exts = $self->{'lookup_exts'};
368
369 foreach my $href (@href_links) {
370 my ($ext) = ($href =~ m/\.([^\.]+)$/);
371
372 if ((defined $ext) && (defined $lookup_exts->{$ext})) {
373
374 if ($href !~ m/^(https?|ftp):\/\//) {
375 # link is within current site
376 my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
377
378 $href = "$site_domain$href";
379 }
380
381 $doc_id_url = $href;
382 $id_file_ext = $ext;
383 last;
384 }
385 }
386 }
387 else {
388 print STDERR "Error occurred while retrieving OAI source documents (2):\n";
389 print STDERR "$!\n";
390 }
391
392 if (-e $tmp_filename) {
393 &util::rm($tmp_filename);
394 }
395 }
396
397 my $download_doc_filename = $oai_rec_filename;
398 my $new_extension = "\-$count\.$id_file_ext";
399 $count++;
400 #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
401 $download_doc_filename =~ s/\.oai$/$new_extension/;
402 my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
403
404 # may have &apos; in url - others??
405 my $safe_doc_id_url = $doc_id_url;
406 $safe_doc_id_url =~ s/&apos;/\'/g;
407
408 my $wget_opts = $self->getWgetOptions();
409 my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
410
411 my ($stdout_and_err,$errors,$follow) = $self->useWgetMonitored($wget_cmd);
412 return $strRecord if $self->{'forced_quit'};
413
414 if($errors ne "")
415 {
416 print STDERR "Error occured while retriving OAI souce documents (3):\n";
417 print STDERR "$errors\n";
418 #exit(-1);
419 next;
420 }
421
422
423 $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
424 }
425
426 if (!$had_valid_url)
427 {
428 print STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
429 }
430 }
431 else
432 {
433 print STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
434 }
435
436 return $strRecord;
437}
438
439sub getOAIRecords
440{
441 my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
442
443 my $intDocCounter = 0;
444
445 my $metadata_prefix = $self->{'metadata_prefix'};
446
447 foreach my $strID ( @$aryIDs)
448 {
449 print STDERR "Gathering OAI record with ID $strID.....\n";
450
451 my $wget_opts = $self->getWgetOptions();
452 my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
453
454 my $strRecord = $self->useWget($cmdWget);
455
456 my @fileDirs = split(":",$strID);
457 my $local_id = pop @fileDirs;
458
459 # setup directories
460
461 $strOutputDir =~ s/"//g; #"
462
463 my $host =$self->{'url'};
464
465 $host =~ s/https?:\/\///g;
466
467 $host =~ s/:.*//g;
468
469 my $strFileURL = "$strOutputDir/$host/$local_id.oai";
470
471
472 # prepare subdirectory for record (if needed)
473 my ($strSubDirPath,$unused) = ("", "");
474
475 ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
476
477 &util::mk_all_dir($strSubDirPath);
478
479 my $ds = &util::get_dirsep();
480
481 if($blnDownloadDoc)
482 {
483 $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
484 }
485
486 # save record
487 open (OAIOUT,">$strFileURL")
488 || die "Unable to save oai metadata record: $!\n";
489 print OAIOUT $strRecord;
490 close(OAIOUT);
491
492 print STDERR "Saving records to $strFileURL\n";
493 print STDERR "<<Done>>\n";
494 $intDocCounter ++;
495 last if ($intDocCounter >= $intMaxRecords);
496 }
497
498 ($intDocCounter >= $intMaxRecords) ?
499 print STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
500 print STDERR "Complete download meta record from $strBasURL\n";
501
502 print STDERR "<<Finished>>\n";
503}
504
505sub url_information
506{
507 my ($self) = shift (@_);
508 if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
509
510 my $wgetOptions = $self->getWgetOptions();
511 my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
512
513 my $strIdentify = "verb=Identify";
514 my $strListSets = "verb=ListSets";
515 my $strListMdFormats = "verb=ListMetadataFormats";
516
517 my $strIdentifyCMD = $strBaseCMD;
518 $strIdentifyCMD =~ s/_OPTS_/$strIdentify/;
519
520 my $strIdentifyText = $self->useWget($strIdentifyCMD);
521
522 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
523 print STDERR "Server information is unavailable.\n";
524 print STDERR "<<Finished>>\n";
525 return;
526 }
527
528 print STDERR "General information:\n";
529 $self->parse_xml($strIdentifyText);
530 print STDERR "\n";
531
532 print STDERR "=" x 10, "\n";
533 print STDERR "Metadata Format Information (metadataPrefix):\n";
534 print STDERR "=" x 10, "\n";
535
536 my $strListMdFormatsCMD = $strBaseCMD;
537 $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;
538 my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
539
540 $self->parse_xml($strListMdFormatsText);
541 print STDERR "\n";
542
543 print STDERR "=" x 10, "\n";
544 print STDERR "List Information:\n";
545 print STDERR "=" x 10, "\n";
546
547 my $strListSetCMD = $strBaseCMD;
548 $strListSetCMD =~ s/_OPTS_/$strListSets/;
549 my $strListSetsText = $self->useWget($strListSetCMD);
550
551 $self->parse_xml($strListSetsText);
552}
553
554sub parse_xml
555{
556 my ($self) = shift (@_);
557 my ($xml_text) = @_;
558
559 #### change this to work directly from $xml_text
560
561 #Open a temporary file to store OAI information, and store the information to the temp file
562 my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
563
564 open(*OAIOUT,"> $name");
565
566 print OAIOUT $xml_text;
567 close(OAIOUT);
568
569 $self->{'temp_file_name'} = $name;
570
571## print STDERR "**** xml text = $xml_text\n";
572
573 eval {
574 $self->{'parser'}->parsefile("$name");
575## $self->{'parser'}->parse($xml_text);
576 };
577
578 if ($@) {
579 die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
580## die "OAI: Parsed text is not a well formed XML file ($@)\n";
581 }
582
583 unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
584}
585
586####END
587#{
588# if($self->{'info'})
589# {
590# unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
591# }
592#}
593
594# This Char function overrides the one in XML::Parser::Stream to overcome a
595# problem where $expat->{Text} is treated as the return value, slowing
596# things down significantly in some cases.
597sub Char {
598 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
599 $_[0]->{'Text'} .= $_[1];
600
601 my $self = $_[0]->{'PluginObj'};
602 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
603 $self->{'text'} .= $_[1];
604 $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
605 if($self->{'text'} ne "")
606 {
607 print STDERR " $self->{'subfield'}:($self->{'text'})\n";
608 }
609 }
610 return undef;
611}
612
613sub OAI_StartTag
614{
615 my ($expat, $element, %attr) = @_;
616
617 my $self = $expat->{'PluginObj'};
618 $self->{'subfield'} = $element;
619
620}
621
622sub OAI_EndTag
623{
624 my ($expat, $element) = @_;
625
626 my $self = $expat->{'PluginObj'};
627 $self->{'text'} = "";
628 $self->{'subfield'} = "";
629}
630
631sub error
632{
633 my ($self,$strFunctionName,$strError) = @_;
634 {
635 print "Error occoured in OAIDownload.pm\n".
636 "In Function:".$strFunctionName."\n".
637 "Error Message:".$strError."\n";
638 exit(-1);
639 }
640}
641
642
643
6441;
Note: See TracBrowser for help on using the repository browser.