source: gsdl/trunk/perllib/downloaders/OAIDownload.pm@ 14948

Last change on this file since 14948 was 14948, checked in by davidb, 16 years ago

Added support for resumptionToken

  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
RevLine 
[11783]1###########################################################################
2#
3# OAIDownload.pm -- downloader that fetches records from an OAI repository via wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
# Load the bytes pragma when this perl provides it; a failure is ignored.
eval { require bytes; };

# Drop the "Subroutine ... redefined" warnings that various plugins cause
# under perl 5.6.  Every other warning is passed through to warn() unchanged.
$SIG{__WARN__} = sub {
    my $message = $_[0];
    return if ($message =~ /Subroutine\s+\S+\sredefined/);
    warn($message);
};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
[12465]40use util;
[11783]41
# Establish inheritance at compile time: OAIDownload is a WgetDownload.
BEGIN {
    @OAIDownload::ISA = qw(WgetDownload);
}
45
# Argument specifications for this downloader.  Each entry is a hash with a
# 'name', a 'type', a 'reqd' flag and, where given, a default ('deft') and a
# 'range'.  The 'disp' and 'desc' values are {OAIDownload.*} keys --
# presumably resolved against a string resource elsewhere; not visible here.
my $arguments = [
    {
        'name' => 'url',
        'disp' => '{OAIDownload.url_disp}',
        'desc' => '{OAIDownload.url}',
        'type' => 'string',
        'reqd' => 'yes',
    },
    {
        'name' => 'metadata_prefix',
        'disp' => '{OAIDownload.metadata_prefix_disp}',
        'desc' => '{OAIDownload.metadata_prefix}',
        'type' => 'string',
        'deft' => 'oai_dc',
        'reqd' => 'no',
    },
    {
        'name' => 'set',
        'disp' => '{OAIDownload.set_disp}',
        'desc' => '{OAIDownload.set}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'get_doc',
        'disp' => '{OAIDownload.get_doc_disp}',
        'desc' => '{OAIDownload.get_doc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'  => 'max_records',
        'disp'  => '{OAIDownload.max_records_disp}',
        'desc'  => '{OAIDownload.max_records}',
        'type'  => 'int',
        'deft'  => '500',
        'range' => '1,',
        'reqd'  => 'no',
    },
];

# Description of this module; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => 'OAIDownload',
    'desc'     => '{OAIDownload.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};

# File-scoped reference to the object built by the most recent new() call.
# The XML::Parser callbacks (Char, OAI_StartTag, OAI_EndTag) and the END
# block read it because they are not invoked as methods.
my $self;

# wget option string; assigned in download() and reused by getOAIDoc() and
# getOAIRecords().
my $strWgetOptions = '';
85
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $getlist         - array ref; $class is appended to it
#   $inputargs       - passed through to WgetDownload->new unchanged
#   $hashArgOptLists - hash ref; this module's argument list and option
#                      description are pushed onto its "ArgList" and
#                      "OptList" entries
#
# Returns the object created by WgetDownload->new, re-blessed into $class.
# The object is also stored in the file-scoped $self so that the XML parser
# callbacks can reach it.
sub new
{
    my ($class) = shift (@_);
    my ($getlist, $inputargs, $hashArgOptLists) = @_;
    push(@$getlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Explicit Class->new(...) calls: the indirect form "new Class(...)" is
    # ambiguous inside a package that defines its own sub new.
    $self = (defined $hashArgOptLists)
        ? WgetDownload->new($getlist, $inputargs, $hashArgOptLists)
        : WgetDownload->new($getlist, $inputargs);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    # Stream-style parser whose callbacks print the element text to STDERR.
    my $parser = XML::Parser->new('Style' => 'Stream',
                                  'Handlers' => {'Char'  => \&Char,
                                                 'Start' => \&OAI_StartTag,
                                                 'End'   => \&OAI_EndTag
                                  });
    $self->{'parser'} = $parser;

    # make sure the tmp directory that we will use later exists
    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
    if (! -e $tmp_dir) {
        &util::mk_dir($tmp_dir);
    }

    return bless $self, $class;
}
117
# Download the OAI records of the configured repository into the cache
# directory.
#
# Parameters:
#   $hashGeneralOptions - hash ref; its "cache_dir" entry is used as the
#                         output directory
#
# Returns 1 after the records have been fetched, or 0 when no identifier
# listing could be obtained.
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Stored in the file-scoped variable because getOAIDoc() and
    # getOAIRecords() build their wget commands from it.
    $strWgetOptions = $self->getWgetOptions();

    my $strOutputDir   = $hashGeneralOptions->{"cache_dir"};
    my $strBasURL      = $self->{'url'};
    my $intMaxRecords  = $self->{'max_records'};
    my $blnDownloadDoc = $self->{'get_doc'};

    print STDERR "<<Defined Maximum>>\n";

    my $strIDs = $self->getOAIIDs($strBasURL);

    # getOAIIDs() returns nothing (undef here) when the server gave no
    # answer, so test for definedness before comparing as a string.
    if (!defined $strIDs || $strIDs eq "")
    {
        print STDERR "Error: No ID being found\n";
        return 0;
    }

    my $aryIDs   = $self->parseOAIIDs($strIDs);
    my $intFound = scalar(@$aryIDs);
    my $intIDs   = ($intMaxRecords < $intFound) ? $intMaxRecords : $intFound;
    print STDERR "<<Total number of record(s):$intIDs>>\n";

    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);

    # Remove the scratch file that parse_xml() writes.
    my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";
    &util::rm($tmp_file);

    return 1;
}
162
# Fetch the ListIdentifiers response(s) for a repository.
#
# Parameters:
#   $strBasURL - base URL of the OAI repository
#
# Reads $self->{'metadata_prefix'}, $self->{'set'} and
# $self->{'max_records'}.  Follows resumptionToken elements until the
# server sends an empty token (or none), a follow-up request yields no data,
# or at least max_records identifiers have been gathered.  Every response is
# also passed to parse_xml().
#
# Returns the concatenated raw XML of all responses, or nothing (bare
# return) when the first request yields no data.
sub getOAIIDs
{
    my ($self,$strBasURL) = @_;

    my $wgetOptions = $self->getWgetOptions();

    print STDERR "Gathering OAI identifiers.....\n";

    my $metadata_prefix = $self->{'metadata_prefix'};
    my $cmdWget = $wgetOptions
        . " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";

    # if $set specified, add it in to URL
    my $set = $self->{'set'};
    $cmdWget .= "&set=$set" if (defined $set && $set ne "");

    $cmdWget .= "\" ";

    my $strIDs = $self->useWget($cmdWget);

    if (!defined $strIDs or $strIDs eq "" ){
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "<<Download Information>>\n";

    $self->parse_xml($strIDs);

    my $accumulated_strIDs = $strIDs;

    # Count identifiers page by page instead of re-scanning everything
    # gathered so far on every iteration.
    my @page_identifiers = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
    my $num_acc_identifiers = scalar(@page_identifiers);

    while ($strIDs =~ m/<resumptionToken.*?>(.*?)<\/resumptionToken>/s) {
        # top up list with further requests for IDs

        my $resumption_token = $1;
        $resumption_token =~ s/^\s+|\s+$//g;

        # An empty resumptionToken element marks the end of the list.
        last if ($resumption_token eq "");

        # No need for another request once enough identifiers are in hand.
        last if ($num_acc_identifiers >= $self->{'max_records'});

        # The token was lifted out of XML: undo the predefined entities,
        # then percent-encode it so it is safe as a query-string value.
        $resumption_token =~ s/&lt;/</g;
        $resumption_token =~ s/&gt;/>/g;
        $resumption_token =~ s/&quot;/"/g;
        $resumption_token =~ s/&apos;/'/g;
        $resumption_token =~ s/&amp;/&/g;
        $resumption_token =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

        $cmdWget = $wgetOptions
            . " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";

        $strIDs = $self->useWget($cmdWget);

        # Keep what has been gathered rather than failing on an empty page.
        if (!defined $strIDs or $strIDs eq "") {
            print STDERR "Warning: no response to resumptionToken request, using the identifiers gathered so far.\n";
            last;
        }

        $self->parse_xml($strIDs);

        $accumulated_strIDs .= $strIDs;

        @page_identifiers = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
        $num_acc_identifiers += scalar(@page_identifiers);
    }

    return $accumulated_strIDs;
}
224
# Extract the content of every <identifier>...</identifier> element from
# the raw ListIdentifiers XML.
#
# Parameters:
#   $strIDs - XML text as returned by getOAIIDs() (may be undef)
#
# Returns a reference to an array of identifier strings, in document order;
# the array is empty when no identifier is found.
sub parseOAIIDs
{
    my ($self,$strIDs) = @_;

    print STDERR "Parsing OAI identifiers.....\n";

    return [] if (!defined $strIDs);

    # One global match collects every identifier in a single pass, instead
    # of repeatedly copying the unparsed remainder of the string.
    my @aryIDs = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);

    return \@aryIDs;
}
243
# Split a path or URL into its directory part and its final component.
#
# Parameters:
#   $strFile - path or URL; both "/" and "\" are treated as separators
#
# Returns the list ($strSubDirs, $strLocalFile): the leading components
# re-joined with "/", and the last component.
sub dirFileSplit
{
    my ($self,$strFile) = @_;

    # A regex literal is used here: written as the double-quoted string
    # "[/\]" the backslash is consumed by string interpolation and only "/"
    # would ever be split on.
    my @aryDirs = split(m{[/\\]}, $strFile);

    my $strLocalFile = pop(@aryDirs);
    my $strSubDirs = join("/",@aryDirs);

    return ($strSubDirs,$strLocalFile);
}
255
# Download the source document referenced by an OAI record and rewrite the
# record to point at the local copy.
#
# Parameters:
#   $strRecord     - XML text of one GetRecord response
#   $strSubDirPath - directory of the record file; the document is saved in
#                    its "srcdocs" subdirectory
#
# The first <identifier> / <dc:identifier> inside <metadata> is taken as
# the document URL.  After a successful download that element is replaced
# by <OrigURL>url</OrigURL> plus <identifier>srcdocs/file</identifier>.
#
# Returns the (possibly rewritten) record text.  Exits the process with -1
# when wget reports an error.
#
# NOTE(review): the document URL comes from the remote server and is placed
# unescaped inside a double-quoted shell argument -- confirm how useWget()
# runs the command and consider validating the URL first.
sub getOAIDoc
{
    my ($self,$strRecord, $strSubDirPath) = @_;

    print STDERR "Gathering source documents.....\n";

    # look out for identifier tag in metadata section
    my ($strMetaTag) = ($strRecord =~ m/<metadata>(.*)<\/metadata>/s);
    if (!defined $strMetaTag)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n";
        return $strRecord;
    }

    my ($strDocURL) = ($strMetaTag =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/s);
    if (!defined $strDocURL)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
        return $strRecord;
    }

    my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL);

    my $strSourceDirPath = &util::filename_cat($strSubDirPath,"srcdocs");
    &util::mk_dir($strSourceDirPath) if (!-e "$strSourceDirPath");

    my $strFullDocFilePath = &util::filename_cat($strSourceDirPath,$strDocFile);

    my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strDocURL\"";

    my $strResponse = $self->useWget($wget_cmd,1);

    if (defined $strResponse && $strResponse ne "")
    {
        print STDERR "Error occured while retriving OAI souce documents: $strResponse\n";
        exit(-1);
    }

    # \Q...\E: the URL is matched literally; "." "?" "+" etc. in it must not
    # act as regex operators.
    $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$strDocURL\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$strDocURL<\/OrigURL>\n <identifier>srcdocs\/$strDocFile<\/identifier>$4<\/metadata>/s;

    # Hand the rewritten text back; the substitution only changed this
    # sub's private copy of the record.
    return $strRecord;
}

# Fetch each listed record with GetRecord and save it below the output
# directory.
#
# Parameters:
#   $aryIDs         - array ref of OAI identifiers
#   $strOutputDir   - output directory (any double quotes are stripped)
#   $strBasURL      - base URL of the OAI repository
#   $intMaxRecords  - stop after this many records have been saved
#   $blnDownloadDoc - when true, also fetch each record's source document
#                     through getOAIDoc()
#
# A record with identifier a:b:c is written to
# <output dir>/<host>/a/b/c.oai, where <host> is $self->{'url'} without its
# http(s):// prefix and without anything from the first ":" onwards.
# Dies when a record file cannot be written.
#
# NOTE(review): identifiers come from the remote server and are used
# unescaped in both the wget command and the output path -- confirm they
# are trusted or validate them.
sub getOAIRecords
{
    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;

    my $intDocCounter = 0;

    my $metadata_prefix = $self->{'metadata_prefix'};

    # These do not depend on the record, so compute them once.
    $strOutputDir =~ s/"//g;

    my $host = $self->{'url'};
    # Strip the scheme.  https must be handled here too, otherwise the
    # ":.*" removal below reduces every https URL to the word "https".
    $host =~ s{\Ahttps?://}{}i;
    $host =~ s/:.*//g;

    foreach my $strID ( @$aryIDs)
    {
        print STDERR "Gathering OAI record with ID:$strID.....\n";

        my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";

        my $strRecord = $self->useWget($cmdWget);

        # Do not write an empty record file when nothing came back.
        if (!defined $strRecord || $strRecord eq "")
        {
            print STDERR "Warning: no record returned for ID:$strID, skipping\n";
            next;
        }

        # setup directories
        my $midDir = join ("/", split(":",$strID));
        my $strFileURL = "$strOutputDir/$host/".$midDir.".oai";

        # prepare subdirectory for record (if needed)
        my ($strSubDirPath) = $self->dirFileSplit($strFileURL);
        &util::mk_all_dir($strSubDirPath);

        if($blnDownloadDoc)
        {
            # Save the rewritten record, not the one that was downloaded.
            $strRecord = $self->getOAIDoc($strRecord,$strSubDirPath);
        }

        # save record
        open (my $fhRecord, '>', $strFileURL)
            or die "Unable to save oai metadata record: $!\n";
        print $fhRecord $strRecord;
        close($fhRecord)
            or die "Unable to save oai metadata record: $!\n";

        print STDERR "Saving records to $strFileURL\n";
        print STDERR "<<Done>>\n";
        $intDocCounter ++;
        last if ($intDocCounter >= $intMaxRecords);
    }

    if ($intDocCounter >= $intMaxRecords)
    {
        print STDERR "Reached maximum download records, use -max_records to set the maximum.\n";
    }
    else
    {
        print STDERR "Complete download meta record from $strBasURL\n";
    }

    print STDERR "<<Finished>>\n";
}
369
# Print general information about the repository at $self->{'url'}.
#
# Issues the Identify and ListSets requests through useWget() and hands
# each response to parse_xml(), whose parser callbacks print the content to
# STDERR.  Returns early, after printing a notice, when either request
# yields no data.
#
# Dies when called without an object.
sub url_information
{
    my ($self) = shift (@_);
    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}

    my $wgetOptions = $self->getWgetOptions();

    # Build each command directly; substituting into a placeholder would
    # hit the wrong spot if the options or URL contained the placeholder.
    my $strRequestPrefix = $wgetOptions." -q -O - \"$self->{'url'}?";
    my $strIdentifyCMD   = $strRequestPrefix."verb=Identify\"";
    my $strListSetCMD    = $strRequestPrefix."verb=ListSets\"";

    my $strIdentifyText = $self->useWget($strIdentifyCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq "" ){
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "General information:\n";
    $self->parse_xml($strIdentifyText);

    my $strListSetsText = $self->useWget($strListSetCMD);

    # parse_xml() dies on empty input, so treat a missing ListSets answer
    # the same way as a missing Identify answer.
    if (!defined $strListSetsText or $strListSetsText eq "" ){
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "List Information:\n";
    $self->parse_xml($strListSetsText);
}
403
# Write XML text to the scratch file $ENV{GSDLHOME}/tmp/oai.tmp and run
# $self->{'parser'} over that file.
#
# Parameters:
#   $strOutputText - XML text to parse
#
# Records the scratch file name in $self->{'temp_file_name'}.
# Dies when the scratch file cannot be written or when the parser rejects
# its content.
sub parse_xml
{
    my ($self) = shift (@_);
    my ($strOutputText) = @_;

    #Open a temporary file to store OAI information, and store the information to the temp file
    my $name = "$ENV{GSDLHOME}/tmp/oai.tmp";

    # Check the write: if it failed silently, the parser would read whatever
    # an earlier call left in the file.
    open(my $fhTmp, '>', $name)
        or die "OAI: unable to write temporary file $name: $!\n";
    print $fhTmp $strOutputText;
    close($fhTmp)
        or die "OAI: unable to write temporary file $name: $!\n";

    $self->{'temp_file_name'} = $name;

    eval {
        $self->{'parser'}->parsefile("$name");
    };

    if ($@) {
        die "OAI: $name is not a well formed XML file ($@)\n";
    }
}
427
# At interpreter exit, remove the scratch file written by parse_xml() when
# the object has its 'info' entry set.  Nothing is done when no object was
# created, when parse_xml() never ran (no file name recorded), or when the
# file is already gone; previously those cases made unlink fail and die.
END{
    if (defined $self && $self->{'info'})
    {
        my $tmp_file = $self->{'temp_file_name'};
        if (defined $tmp_file && -e $tmp_file)
        {
            unlink($tmp_file) or die "Could not unlink $tmp_file: $!";
        }
    }
}
434
# Character-data handler installed in place of XML::Parser::Stream's own
# Char, to overcome a problem where $expat->{Text} is treated as the return
# value, slowing things down significantly in some cases.
#
# Arguments (as passed by the parser): the expat object and a chunk of
# character data.  The chunk is appended to the expat object's 'Text'
# entry.  While an element is open (the file-scoped object's 'subfield' is
# non-empty) the chunk is also appended to its 'text' entry, newlines and
# runs of two or more spaces are removed from that entry, and the result is
# printed to STDERR when it is not empty.  Always returns undef.
sub Char {
    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+

    my $chunk = $_[1];
    $_[0]->{'Text'} .= $chunk;

    my $field = $self->{'subfield'};
    if (defined $field && $field ne "") {
        $self->{'text'} .= $chunk;
        $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
        print STDERR " $field:($self->{'text'})\n" if ($self->{'text'} ne "");
    }

    return undef;
}
451
# Start-tag handler: remember the name of the element that was just opened
# in the file-scoped object's 'subfield' entry, where Char() reads it.
# The expat object and the element's attributes are not used.
sub OAI_StartTag
{
    my (undef, $tag_name) = @_;

    $self->{'subfield'} = $tag_name;
}
459
# End-tag handler: clear the file-scoped object's 'subfield' and 'text'
# entries so that Char() stops collecting text until the next start tag.
# Neither argument passed by the parser is used.
sub OAI_EndTag
{
    $self->{'text'} = $self->{'subfield'} = "";
}
466
# Report an error on standard output and terminate the process with exit
# status -1.
#
# Parameters:
#   $strFunctionName - name of the function in which the error occurred
#   $strError        - error message text
sub error
{
    my ($self,$strFunctionName,$strError) = @_;

    print join("",
               "Error occoured in OAIDownload.pm\n",
               "In Function:", $strFunctionName, "\n",
               "Error Message:", $strError, "\n");
    exit(-1);
}
477
478
479
4801;
Note: See TracBrowser for help on using the repository browser.