root/gsdl/trunk/perllib/downloaders/OAIDownload.pm @ 16704

Revision 16704, 12.0 KB (checked in by mdewsnip, 11 years ago)

Fixed two bugs with resumption token support.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIDownload.pm -- downloader that harvests records from an OAI repository
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
# Set up inheritance at compile time: OAIDownload is a WgetDownload.
BEGIN {
    @OAIDownload::ISA = qw(WgetDownload);
}
45
# Descriptions of the arguments this downloader accepts.  new() appends
# them to the "ArgList" entry of the option lists it hands to WgetDownload.
# The '{OAIDownload.*}' values look like keys into a string resource
# table -- TODO confirm where they are resolved.
my $arguments = [
    {
        'name' => 'url',
        'disp' => '{OAIDownload.url_disp}',
        'desc' => '{OAIDownload.url}',
        'type' => 'string',
        'reqd' => 'yes',
    },
    {
        'name' => 'metadata_prefix',
        'disp' => '{OAIDownload.metadata_prefix_disp}',
        'desc' => '{OAIDownload.metadata_prefix}',
        'type' => 'string',
        'deft' => 'oai_dc',
        'reqd' => 'no',
    },
    {
        'name' => 'set',
        'disp' => '{OAIDownload.set_disp}',
        'desc' => '{OAIDownload.set}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {
        'name' => 'get_doc',
        'disp' => '{OAIDownload.get_doc_disp}',
        'desc' => '{OAIDownload.get_doc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {
        'name'  => 'max_records',
        'disp'  => '{OAIDownload.max_records_disp}',
        'desc'  => '{OAIDownload.max_records}',
        'type'  => 'int',
        'deft'  => '500',
        'range' => '1,',
        'reqd'  => 'no',
    },
];

# Description of this downloader itself; new() appends it to "OptList".
my $options = {
    'name'     => 'OAIDownload',
    'desc'     => '{OAIDownload.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};

# The object most recently built by new().  The XML::Parser handlers
# (Char, OAI_StartTag, OAI_EndTag) are plain functions that receive no
# object, so they reach the downloader's state through this variable.
my $self;

# wget option string.  download() refreshes it; getOAIRecords() and
# getOAIDoc() prepend it to the wget commands they build.
my $strWgetOptions = "";
85
# Constructor.
#
#   $getlist         - array ref; the class name is appended to it
#   $inputargs       - passed through to the WgetDownload constructor
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" entries receive
#                      this module's argument and option descriptions
#
# Returns the WgetDownload-built object re-blessed into $class.  The
# object is also stored in the file-level $self, because the XML::Parser
# handlers installed here are plain functions and read their state there.
sub new
{
    my $class = shift(@_);
    my ($getlist, $inputargs, $hashArgOptLists) = @_;
    push(@$getlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options)      if (defined $options);

    if (defined $hashArgOptLists) {
        $self = WgetDownload->new($getlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = WgetDownload->new($getlist, $inputargs);
    }

    # don't worry about any options etc when only information is wanted
    return bless $self, $class if ($self->{'info_only'});

    # Stream-style parser whose callbacks are the handler subs in this file
    my %handlers = (
        'Char'  => \&Char,
        'Start' => \&OAI_StartTag,
        'End'   => \&OAI_EndTag,
    );
    $self->{'parser'} = XML::Parser->new('Style'    => 'Stream',
                                         'Handlers' => \%handlers);

    # make sure the tmp directory that we will use later exists
    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
    &util::mk_dir($tmp_dir) unless (-e $tmp_dir);

    return bless $self, $class;
}
117
# Harvest an OAI repository into the cache directory.
#
#   $hashGeneralOptions - hash ref; its "cache_dir" entry is the directory
#                         the records are written under
#
# Reads 'url', 'max_records' and 'get_doc' from the object.  Progress
# markers ("<<...>>") and messages are printed to STDERR.
#
# Returns 1 after the records have been requested, or 0 when no
# identifier list could be obtained.
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Refresh the file-level option string: getOAIRecords() and
    # getOAIDoc() build their wget commands from it.
    $strWgetOptions = $self->getWgetOptions();

    my $strOutputDir   = $hashGeneralOptions->{"cache_dir"};
    my $strBasURL      = $self->{'url'};
    my $intMaxRecords  = $self->{'max_records'};
    my $blnDownloadDoc = $self->{'get_doc'};

    print STDERR "<<Defined Maximum>>\n";

    # getOAIIDs() returns nothing at all when the server cannot be
    # reached, so test for undef before comparing strings.
    my $strIDs = $self->getOAIIDs($strBasURL);
    if (!defined $strIDs || $strIDs eq "")
    {
        print STDERR "Error: No ID being found\n";
        return 0;
    }

    my $aryIDs = $self->parseOAIIDs($strIDs);

    # never report more records than will actually be fetched
    my $intAvailable = scalar(@$aryIDs);
    my $intIDs = ($intMaxRecords < $intAvailable) ? $intMaxRecords : $intAvailable;
    print STDERR "<<Total number of record(s):$intIDs>>\n";

    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);

    # remove the scratch file written by parse_xml()
    my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";
    &util::rm($tmp_file);

    return 1;
}
162
# Fetch the repository's identifier list with the ListIdentifiers verb,
# following resumption tokens until the list is complete or at least
# 'max_records' identifiers have been collected.
#
#   $strBasURL - base URL of the OAI repository
#
# Reads 'metadata_prefix', 'set' and 'max_records' from the object.
# Every response is also run through parse_xml().
#
# Returns the concatenated XML text of all responses, or nothing (undef
# in scalar context) when the first request yields no data.
sub getOAIIDs
{
    my ($self, $strBasURL) = @_;

    my $wgetOptions     = $self->getWgetOptions();
    my $metadata_prefix = $self->{'metadata_prefix'};
    my $max_records     = $self->{'max_records'};

    print STDERR  "Gathering OAI identifiers.....\n";

    my $cmdWget = $wgetOptions
        . " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";

    # if $set specified, add it in to URL
    my $set = $self->{'set'};
    $cmdWget .= "&set=$set" if (defined $set && $set ne "");

    $cmdWget .= "\" ";

    my $strIDs = $self->useWget($cmdWget);

    if (!defined $strIDs or $strIDs eq "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "<<Download Information>>\n";

    $self->parse_xml($strIDs);

    my $accumulated_strIDs = $strIDs;

    # Running total of identifiers; each page is counted once instead of
    # rescanning everything accumulated so far.
    my $num_acc_identifiers = () = $strIDs =~ m/<identifier>.*?<\/identifier>/sg;

    while ($strIDs =~ m/<resumptionToken[^>]*>\s*(.*?)\s*<\/resumptionToken>/s) {
        # top up list with further requests for IDs
        my $resumption_token = $1;

        # An empty resumptionToken element marks the last page of the list.
        last if ($resumption_token eq "");

        # No point asking for more than will be downloaded.
        last if (defined $max_records && $num_acc_identifiers >= $max_records);

        # The token was read out of XML, so undo the entity escaping ...
        $resumption_token =~ s/&lt;/</g;
        $resumption_token =~ s/&gt;/>/g;
        $resumption_token =~ s/&quot;/"/g;
        $resumption_token =~ s/&apos;/'/g;
        $resumption_token =~ s/&amp;/&/g;
        # ... and percent-encode it so it survives both the URL and the
        # double-quoted shell argument it is placed in.
        $resumption_token =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

        # resumptionToken is an exclusive argument in OAI-PMH: it must be
        # the only argument sent besides the verb.
        $cmdWget = $wgetOptions
            . " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";

        $strIDs = $self->useWget($cmdWget);

        # Keep what has been gathered if a follow-up request fails;
        # parse_xml() would die on an empty document.
        if (!defined $strIDs || $strIDs eq "") {
            print STDERR "Server information is unavailable.\n";
            last;
        }

        $self->parse_xml($strIDs);

        $accumulated_strIDs .= $strIDs;

        my $num_page_identifiers = () = $strIDs =~ m/<identifier>.*?<\/identifier>/sg;
        $num_acc_identifiers += $num_page_identifiers;
    }

    return $accumulated_strIDs;
}
224
# Extract the record identifiers from ListIdentifiers response text.
#
#   $strIDs - XML text (one or more concatenated responses)
#
# Returns a reference to an array holding the content of every
# <identifier>...</identifier> element, in document order; the array is
# empty when there are none.
sub parseOAIIDs
{
    my ($self, $strIDs) = @_;

    print STDERR "Parsing OAI identifiers.....\n";

    # A single global match collects every identifier in one pass,
    # without recopying the remainder of the text for each one.
    my @aryIDs = ();
    if (defined $strIDs)
    {
        @aryIDs = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
    }

    return \@aryIDs;
}
243
# Split a path or URL into its directory part and its final component.
#
#   $strFile - path or URL using "/" and/or "\" as separators
#
# Returns ($strSubDirs, $strLocalFile): the leading components re-joined
# with "/", and the last component.
sub dirFileSplit
{
    my ($self, $strFile) = @_;

    # Use a regex literal here.  Written as the string "[/\]", the
    # backslash is consumed by the double-quoted string and only "/"
    # would be treated as a separator.
    my @aryDirs = split(/[\/\\]/, $strFile);

    my $strLocalFile = pop(@aryDirs);
    my $strSubDirs = join("/", @aryDirs);

    return ($strSubDirs, $strLocalFile);
}
255
# Download the source document that an OAI record points to.
#
#   $strRecord     - XML text of the record
#   $strSubDirPath - directory of the record file; the document is saved
#                    in a "srcdocs" directory below it
#
# The document URL is taken from the first <identifier> or
# <dc:identifier> element inside <metadata>.  After a successful
# download the record is rewritten so that the URL is kept in an
# <OrigURL> element and <identifier> names the local copy.
#
# Returns the (possibly rewritten) record text.  The rewritten text is
# also stored back into the caller's variable through @_, so a caller
# that ignores the return value still saves the updated record.
#
# Exits the program with status -1 when wget reports an error.
sub getOAIDoc
{
    my ($self, $strRecord, $strSubDirPath) = @_;

    print  STDERR "Gathering source documents.....\n";

    # look out for identifier tag in metadata section
    my ($strMetaTag) = (defined $strRecord)
        ? ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
        : ();
    if (!defined $strMetaTag)
    {
        print  STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n";
        return $strRecord;
    }

    my ($strDocURL) = ($strMetaTag =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/s);
    my $strDocFile;
    (undef, $strDocFile) = $self->dirFileSplit($strDocURL) if (defined $strDocURL);
    if (!defined $strDocFile || $strDocFile eq "")
    {
        print  STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
        return $strRecord;
    }

    # SECURITY: the URL comes from the harvested record and is pasted
    # into a double-quoted shell argument.  Neutralise the characters
    # that stay active inside double quotes: percent-encode them in the
    # URL and replace them in the local file name.
    (my $strShellURL = $strDocURL) =~ s/(["`\$\\])/sprintf("%%%02X", ord($1))/ge;
    $strDocFile =~ s/["`\$\\]/_/g;

    my $strSoureDirPath = &util::filename_cat($strSubDirPath, "srcdocs");
    &util::mk_dir($strSoureDirPath)  if (!-e "$strSoureDirPath");

    my $strFullDocFilePath = &util::filename_cat($strSoureDirPath, $strDocFile);

    my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strShellURL\"";

    my $strResponse = $self->useWget($wget_cmd, 1);

    if (defined $strResponse && $strResponse ne "")
    {
        print STDERR "Error occured while retriving OAI souce documents: $strResponse\n";
        exit(-1);
    }

    # \Q...\E: URLs routinely contain regex metacharacters ("?", "+",
    # "."), which would otherwise stop the pattern from matching the
    # record's own text.
    $strRecord =~ s/<metadata>(.*?)<(?:dc:)?identifier>\Q$strDocURL\E<\/(?:dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$strDocURL<\/OrigURL>\n   <identifier>srcdocs\/$strDocFile<\/identifier>$2<\/metadata>/s;

    # Hand the rewritten record back through the argument alias; the eval
    # covers callers that passed a read-only value.
    eval { $_[1] = $strRecord; };

    return $strRecord;
}
304
# Fetch each record with the GetRecord verb and save it below the output
# directory as <output dir>/<host>/<identifier with ":" as "/">.oai
#
#   $aryIDs         - array ref of record identifiers
#   $strOutputDir   - directory to save under (double quotes are removed)
#   $strBasURL      - base URL of the OAI repository
#   $intMaxRecords  - stop after this many records have been saved
#   $blnDownloadDoc - when true, getOAIDoc() is called for every record
#
# Progress markers and messages are printed to STDERR.  Dies when a
# record file cannot be written.
sub getOAIRecords
{
    my ($self, $aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;

    my $intDocCounter = 0;

    my $metadata_prefix = $self->{'metadata_prefix'};

    # These do not depend on the record, so work them out once.
    $strOutputDir =~ s/"//g;

    # Host directory: drop the scheme (http or https), then anything
    # from the first ":" onwards (the port and what follows it).
    my $host = $self->{'url'};
    $host =~ s/^https?:\/\///i;
    $host =~ s/:.*//g;

    foreach my $strID (@$aryIDs)
    {
        print  STDERR "Gathering OAI record with ID:$strID.....\n";

        # The identifier was read out of XML: undo the entity escaping,
        # then percent-encode it so that it is a valid query value and
        # harmless inside the double-quoted shell argument.
        my $strEncodedID = $strID;
        $strEncodedID =~ s/&lt;/</g;
        $strEncodedID =~ s/&gt;/>/g;
        $strEncodedID =~ s/&quot;/"/g;
        $strEncodedID =~ s/&apos;/'/g;
        $strEncodedID =~ s/&amp;/&/g;
        $strEncodedID =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

        my $cmdWget = $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strEncodedID\"";

        my $strRecord = $self->useWget($cmdWget);

        # Do not leave an empty .oai file behind when nothing came back.
        if (!defined $strRecord || $strRecord eq "")
        {
            print STDERR "Error: No record was returned for ID:$strID\n";
            next;
        }

        # setup directories: each ":"-separated part of the identifier
        # becomes one path component
        my $midDir = join("/", split(":", $strID));
        my $strFileURL = "$strOutputDir/$host/".$midDir.".oai";

        # prepare subdirectory for record (if needed)
        my ($strSubDirPath) = $self->dirFileSplit($strFileURL);
        &util::mk_all_dir($strSubDirPath);

        if ($blnDownloadDoc)
        {
            $self->getOAIDoc($strRecord, $strSubDirPath);
        }

        # save record
        open(my $fhRecord, '>', $strFileURL)
            or die "Unable to save oai metadata record: $!\n";
        print {$fhRecord} $strRecord
            or die "Unable to save oai metadata record: $!\n";
        close($fhRecord)
            or die "Unable to save oai metadata record: $!\n";

        print STDERR "Saving records to $strFileURL\n";
        print STDERR "<<Done>>\n";

        $intDocCounter++;
        last if ($intDocCounter >= $intMaxRecords);
    }

    if ($intDocCounter >= $intMaxRecords)
    {
        print  STDERR "Reached maximum download records, use -max_records to set the maximum.\n";
    }
    else
    {
        print  STDERR "Complete download meta record from $strBasURL\n";
    }

    print STDERR "<<Finished>>\n";
}
369
# Report what the repository says about itself: the Identify response,
# followed by the ListSets response.  Each response is passed to
# parse_xml(); section headings are printed to STDERR.
#
# Returns nothing.  Stops early when the Identify request yields no data.
sub url_information
{
    my ($self) = shift (@_);
    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}

    my $wgetOptions = $self->getWgetOptions();

    # _OPTS_ is a placeholder that is replaced by each request's query
    my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";

    my $strIdentify = "verb=Identify";
    my $strListSets = "verb=ListSets";

    my $strIdentifyCMD = $strBaseCMD;
    $strIdentifyCMD =~ s/_OPTS_/$strIdentify/;

    my $strIdentifyText = $self->useWget($strIdentifyCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "General information:\n";
    $self->parse_xml($strIdentifyText);

    my $strListSetCMD = $strBaseCMD;
    $strListSetCMD =~ s/_OPTS_/$strListSets/;
    my $strListSetsText = $self->useWget($strListSetCMD);

    print STDERR "List Information:\n";

    # Same guard as for Identify: parse_xml() dies on an empty document,
    # which would throw away the information already reported.
    if (!defined $strListSetsText or $strListSetsText eq "") {
        print STDERR "Set information is unavailable.\n";
        return;
    }

    $self->parse_xml($strListSetsText);
    return;
}
403
# Run XML text through the object's parser.
#
#   $strOutputText - XML text to parse
#
# The text is first written to $GSDLHOME/tmp/oai.tmp, because the parser
# is driven with parsefile(); the file name is remembered in
# $self->{'temp_file_name'}.
#
# Dies when the scratch file cannot be written, or when the parser
# rejects the text.
sub parse_xml
{
    my ($self) = shift (@_);
    my ($strOutputText) = @_;

    # Open a temporary file to store OAI information, and store the
    # information to the temp file
    my $name = "$ENV{GSDLHOME}/tmp/oai.tmp";

    # Check every step: an unwritten or half-written file would otherwise
    # be reported further down as malformed XML, or a stale file from an
    # earlier call would be parsed instead.
    open(my $fhTmp, '>', $name)
        or die "OAI: Unable to open $name for writing: $!\n";
    print {$fhTmp} (defined $strOutputText ? $strOutputText : "")
        or die "OAI: Unable to write to $name: $!\n";
    close($fhTmp)
        or die "OAI: Unable to finish writing $name: $!\n";

    $self->{'temp_file_name'} = $name;

    eval {
        $self->{'parser'}->parsefile("$name");
    };

    if ($@) {
        die "OAI: $name is not a well formed XML file ($@)\n";
    }
}
427
# At program exit, remove the scratch file recorded by parse_xml() when
# the object's 'info' entry is set.
END {
    if (defined $self && $self->{'info'})
    {
        # parse_xml() may never have run, or the file may already have
        # been removed; neither case is an error worth dying for at exit.
        my $strTempFile = $self->{'temp_file_name'};
        if (defined $strTempFile && -e $strTempFile)
        {
            unlink($strTempFile) or die "Could not unlink $strTempFile: $!";
        }
    }
}
434
435# This Char function overrides the one in XML::Parser::Stream to overcome a
436# problem where $expat->{Text} is treated as the return value, slowing
437# things down significantly in some cases.
# Character-data handler for the stream parser.
#
#   $_[0] - the expat object; the data is appended to its 'Text' entry
#   $_[1] - the chunk of character data
#
# While an element is open (the file-level $self has a non-empty
# 'subfield'), the data is also collected in $self->{'text'} with
# newlines and runs of two or more spaces removed, and what has been
# collected so far is printed to STDERR as " element:(text)".
#
# Always returns undef.
sub Char {
    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
    $_[0]->{'Text'} .= $_[1];

    # nothing more to do outside of an element
    my $strField = $self->{'subfield'};
    return undef unless (defined $strField && $strField ne "");

    $self->{'text'} .= $_[1];
    $self->{'text'} =~ s/[\n]|([ ]{2,})//g;

    print STDERR " $strField:($self->{'text'})\n" if ($self->{'text'} ne "");

    return undef;
}
451
# Start-tag handler for the stream parser: remember the name of the
# element that has just been opened, so that Char() can label its text.
#
#   @_ - ($expat, $element, %attr); only the element name is used
sub OAI_StartTag
{
    my (undef, $element) = @_;

    $self->{'subfield'} = $element;
}
459
# End-tag handler for the stream parser: forget the current element name
# and the text collected for it.  The arguments ($expat, $element) are
# not used.
sub OAI_EndTag
{
    $self->{'text'} = $self->{'subfield'} = "";
}
466
# Report a fatal error on standard output and terminate the program
# with exit status -1.
#
#   $strFunctionName - name of the function the error occurred in
#   $strError        - description of the error
sub error
{
    my ($self, $strFunctionName, $strError) = @_;

    my $strReport = join("",
                         "Error occoured in OAIDownload.pm\n",
                         "In Function:", $strFunctionName, "\n",
                         "Error Message:", $strError, "\n");
    print $strReport;

    exit(-1);
}
477
478
479
4801;
Note: See TracBrowser for help on using the browser.