root/main/trunk/greenstone2/perllib/downloaders/OAIDownload.pm @ 25224

Revision 25224, 16.7 KB (checked in by kjdon, 9 years ago)

removed default value for max_records option. If not specified, now this will download all records. Previously the only way to download all records was to set max_records to a bigger number than the number of records available. Also fixed a bug where if you didn't specify a place to download into, it would try to download into /path-based-on-url.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
42sub BEGIN {
43    @OAIDownload::ISA = ('WgetDownload');
44}
45
46my $arguments =
47    [ { 'name' => "url",
48    'disp' => "{OAIDownload.url_disp}",
49    'desc' => "{OAIDownload.url}",
50    'type' => "string",
51    'reqd' => "yes"},
52      { 'name' => "metadata_prefix",
53    'disp' => "{OAIDownload.metadata_prefix_disp}",
54    'desc' => "{OAIDownload.metadata_prefix}",
55    'type' => "string",
56    'deft' => "oai_dc",
57    'reqd' => "no"},
58      { 'name' => "set",
59    'disp' => "{OAIDownload.set_disp}",
60    'desc' => "{OAIDownload.set}",
61    'type' => "string",
62    'reqd' => "no"},
63      { 'name' => "get_doc",
64    'disp' => "{OAIDownload.get_doc_disp}",
65    'desc' => "{OAIDownload.get_doc}",
66    'type' => "flag",
67    'reqd' => "no"},
68      { 'name' => "get_doc_exts",
69    'disp' => "{OAIDownload.get_doc_exts_disp}",
70    'desc' => "{OAIDownload.get_doc_exts}",
71    'type' => "string",
72    'deft' => "doc,pdf,ppt",
73    'reqd' => "no"},
74      { 'name' => "max_records",
75    'disp' => "{OAIDownload.max_records_disp}",
76    'desc' => "{OAIDownload.max_records}",
77    'type' => "int",
78    'range' => "1,",
79    'reqd' => "no"} ];
80
81my $options = { 'name'     => "OAIDownload",
82        'desc'     => "{OAIDownload.desc}",
83        'abstract' => "no",
84        'inherits' => "yes",
85        'args'     => $arguments };
86
87sub new
88{
89    my ($class) = shift (@_);
90    my ($getlist,$inputargs,$hashArgOptLists) = @_;
91    push(@$getlist, $class);
92
93    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
94    push(@{$hashArgOptLists->{"OptList"}},$options);
95
96    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
97
98    if ($self->{'info_only'}) {
99    # don't worry about any options etc
100    return bless $self, $class;
101    }
102
103    my $parser = new XML::Parser('Style' => 'Stream',
104                                 'PluginObj' => $self,
105                 'Handlers' => {'Char' => \&Char,
106                        'Start' => \&OAI_StartTag,
107                        'End' => \&OAI_EndTag
108                        });
109    $self->{'parser'} = $parser;
110   
111    # make sure the tmp directory that we will use later exists
112    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
113    if (! -e $tmp_dir) {
114    &util::mk_dir($tmp_dir);
115    }
116
117    # if max_records not specified, parsing will have set it to ""
118    undef $self->{'max_records'} if $self->{'max_records'} eq "";
119
120    # set up hashmap for individual items in get_doc_exts
121    # to make testing for matches easier
122
123    $self->{'lookup_exts'} = {};
124    my $get_doc_exts = $self->{'get_doc_exts'};
125   
126    if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
127    my @exts = split(/,\s*/,$get_doc_exts);
128    foreach my $e (@exts) {
129        $self->{'lookup_exts'}->{lc($e)} = 1;
130    }
131    }
132
133
134    return bless $self, $class;
135}
136
137sub download
138{
139    my ($self) = shift (@_);
140    my ($hashGeneralOptions) = @_;
141
142##    my $cmdWget = $strWgetOptions;
143 
144    my $strOutputDir ="";
145    $strOutputDir = $hashGeneralOptions->{"cache_dir"};
146    my $strBasURL = $self->{'url'};
147    my $blnDownloadDoc = $self->{'get_doc'};
148
149    print STDERR "<<Defined Maximum>>\n";
150
151    my $strIDs = $self->getOAIIDs($strBasURL);
152 
153    if($strIDs eq "")
154    {
155    print STDERR "Error: No IDs found\n";
156    return 0;
157    }
158
159    my $aryIDs = $self->parseOAIIDs($strIDs);
160    my $intIDs = 0;
161    if(defined $self->{'max_records'} && $self->{'max_records'} < scalar(@$aryIDs))
162    {
163    $intIDs = $self->{'max_records'};
164    }
165    else
166    {
167    $intIDs = scalar(@$aryIDs);
168    }
169    print STDERR "<<Total number of record(s):$intIDs>>\n";
170
171    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $self->{'max_records'}, $blnDownloadDoc);
172
173#    my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
174#    &util::rm($tmp_file);
175
176    return 1;
177}
178
179sub getOAIIDs
180{
181    my ($self,$strBasURL) = @_;
182##    my ($cmdWget);
183     
184    my $wgetOptions = $self->getWgetOptions();
185
186    my $cmdWget = $wgetOptions;
187 
188    print STDERR  "Gathering OAI identifiers.....\n";
189   
190    my $metadata_prefix = $self->{'metadata_prefix'};
191    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
192
193    # if $set specified, add it in to URL
194    my $set = $self->{'set'};
195    $cmdWget .= "&set=$set" if ($set ne "");
196
197    $cmdWget .= "\" ";
198
199    my $accumulated_strIDs = "";
200    my $strIDs =  $self->useWget($cmdWget);
201
202    if (!defined $strIDs or $strIDs eq ""  ){
203    print STDERR "Server information is unavailable.\n";
204    print STDERR "<<Finished>>\n";
205        return; 
206    }
207    if ($self->{'forced_quit'}) {
208    return $strIDs;
209    }
210
211    print STDERR "<<Download Information>>\n";
212   
213    $self->parse_xml($strIDs);
214
215    $accumulated_strIDs = $strIDs;
216    my $max_recs = $self->{'max_records'};
217    while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/s) {
218    # top up list with further requests for IDs
219
220    my $resumption_token = $1;
221
222    $cmdWget = $wgetOptions;
223
224    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
225
226    $strIDs =  $self->useWget($cmdWget);
227    if ($self->{'forced_quit'}) {
228        return $accumulated_strIDs;
229    }
230
231    $self->parse_xml($strIDs);
232
233    $accumulated_strIDs .= $strIDs;
234
235    my @accumulated_identifiers
236        = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
237
238    my $num_acc_identifiers = scalar(@accumulated_identifiers);
239    if (defined  $max_recs && $num_acc_identifiers > $max_recs ) {
240        last;
241    }
242    }
243
244    return $accumulated_strIDs;
245}
246
247sub parseOAIIDs
248{   
249    my ($self,$strIDs) = @_;
250
251    print STDERR "Parsing OAI identifiers.....\n";
252    $strIDs =~ s/^.*?<identifier>/<identifier>/s;
253    $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
254
255    my @aryIDs = ();
256
257    while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
258    {
259    $strIDs = $2;
260    push(@aryIDs,$1);
261    }
262   
263    return \@aryIDs;
264}
265
266sub dirFileSplit
267{
268    my ($self,$strFile) = @_;
269
270    my @aryDirs = split("[/\]",$strFile);
271   
272    my $strLocalFile = pop(@aryDirs);
273    my $strSubDirs = join("/",@aryDirs);
274
275    return ($strSubDirs,$strLocalFile);
276}
277
278sub getOAIDoc
279{
280    my ($self,$strRecord, $oai_rec_filename) = @_;
281 
282    print  STDERR "Gathering source documents.....\n";
283    # look out for identifier tag in metadata section
284   
285    if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
286    {
287    my $strMetaTag = $1;
288    my $had_valid_url = 0;
289    my $count = 1;
290    while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
291    {
292        my $doc_id_url = $2;
293        print STDERR "Found doc url: $doc_id_url\n";
294        next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
295
296        my $orig_doc_id_url = $doc_id_url;
297        $had_valid_url = 1;
298
299        my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
300        my $faked_ext = 0;
301        my $primary_doc_match = 0;
302
303        my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
304
305        if (defined $id_file_ext) {
306        # cross-check this filename extension with get_doc_exts option
307        # if provided
308        my $lookup_exts = $self->{'lookup_exts'};
309
310        if (defined $lookup_exts->{lc($id_file_ext)}) {
311            # this initial URL matches requirement
312            $primary_doc_match = 1;
313        }
314        }
315        else {
316        $faked_ext = 1;
317        $id_file_ext = "html";
318        }
319
320       
321        if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
322        # Download this doc if HTML, scan through it looking for a link
323        # that does match get_doc_exts
324       
325
326        # 1. Generate a tmp name
327        my $tmp_filename = &util::get_tmp_filename();
328
329        # 2. Download it
330        my $wget_opts2 = $self->getWgetOptions();
331        my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
332        my ($stdout_and_err2,$error2,$follow2) =  $self->useWgetMonitored($wget_cmd2);
333        return $strRecord if $self->{'forced_quit'};
334
335        if($error2 ne "")
336        {
337            print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
338            #exit(-1);
339            next;
340        }
341
342        if (defined $follow2) {
343            # src url was "redirected" to another place
344            # => pick up on this and make it the new doc_id_url
345            $doc_id_url = $follow2;
346        }
347
348        my $primary_doc_html = "";
349        if (open(HIN,"<$tmp_filename")) {
350            my $line;
351            while (defined ($line = <HIN>)) {
352            $primary_doc_html .= $line;
353            }
354            close(HIN);
355
356            # 3. Scan through it looking for match
357            #
358            # if got match, change $doc_id_url to this new URL and
359            # $id_file_ext to 'match'
360           
361            my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
362
363            my $lookup_exts = $self->{'lookup_exts'};
364
365            foreach my $href (@href_links) {
366            my ($ext) = ($href =~ m/\.([^\.]+)$/);
367
368            if ((defined $ext) && (defined $lookup_exts->{$ext})) {
369
370                if ($href !~ m/^(https?|ftp):\/\//) {
371                # link is within current site
372                my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
373
374                $href = "$site_domain$href";
375                }
376
377                $doc_id_url = $href;
378                $id_file_ext = $ext;
379                last;
380            }
381            }
382        }
383        else {
384            print STDERR "Error occurred while retrieving OAI source documents (2):\n";
385            print STDERR "$!\n";
386        }
387
388        if (-e $tmp_filename) {
389            &util::rm($tmp_filename);
390        }
391        }
392
393        my $download_doc_filename = $oai_rec_filename;
394        my $new_extension = "\-$count\.$id_file_ext";
395        $count++;
396        #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
397        $download_doc_filename =~ s/\.oai$/$new_extension/;
398        my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
399
400        # may have &apos; in url - others??
401        my $safe_doc_id_url = $doc_id_url;
402        $safe_doc_id_url =~ s/&apos;/\'/g;
403
404        my $wget_opts = $self->getWgetOptions();
405        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
406       
407        my ($stdout_and_err,$errors,$follow) =  $self->useWgetMonitored($wget_cmd);
408        return $strRecord if $self->{'forced_quit'};
409
410        if($errors ne "")
411        {
412        print STDERR "Error occured while retriving OAI souce documents (3):\n";
413        print STDERR "$errors\n";
414        #exit(-1);
415        next;
416        }
417
418       
419        $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
420    }
421
422    if (!$had_valid_url)
423    {
424        print  STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
425    }
426    }
427    else
428    {
429    print  STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
430    }
431
432    return $strRecord;
433}
434
435sub getOAIRecords
436{
437    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
438    my $intDocCounter = 0;
439
440    my $metadata_prefix = $self->{'metadata_prefix'};
441
442    foreach my $strID ( @$aryIDs)
443    {
444    print  STDERR "Gathering OAI record with ID $strID.....\n";
445       
446    my $wget_opts = $self->getWgetOptions();
447    my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
448   
449    my $strRecord =  $self->useWget($cmdWget);
450
451    my @fileDirs = split(":",$strID); 
452    my $local_id = pop @fileDirs;
453
454    # setup directories
455
456        $strOutputDir  =~ s/"//g; #"
457
458        my $host =$self->{'url'};
459 
460        $host =~ s@https?:\/\/@@g;
461
462        $host =~ s/:.*//g;
463
464    my $strFileURL = "";
465    if ($strOutputDir ne "") {
466        $strFileURL = "$strOutputDir/";
467    }
468    $strFileURL .= "$host/$local_id.oai";
469
470    # prepare subdirectory for record (if needed)
471    my ($strSubDirPath,$unused) = ("", "");
472
473        ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
474   
475    &util::mk_all_dir($strSubDirPath);
476
477    my $ds = &util::get_dirsep();
478   
479    if($blnDownloadDoc)
480    {
481        $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
482    }
483
484    # save record
485    open (OAIOUT,">$strFileURL")
486        || die "Unable to save oai metadata record: $!\n";
487    print OAIOUT $strRecord;
488    close(OAIOUT);
489
490        print STDERR "Saving record to $strFileURL\n";
491        print STDERR "<<Done>>\n";
492    $intDocCounter ++; 
493    last if (defined $intMaxRecords && $intDocCounter >= $intMaxRecords);
494    }
495
496    (defined $intMaxRecords && $intDocCounter >= $intMaxRecords) ?
497    print  STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
498    print  STDERR "Complete download meta record from $strBasURL\n";
499
500       print STDERR "<<Finished>>\n";
501}
502
503sub url_information
504{
505    my ($self) = shift (@_);
506    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
507   
508    my $wgetOptions = $self->getWgetOptions();
509    my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
510 
511    my $strIdentify = "verb=Identify";
512    my $strListSets = "verb=ListSets";
513    my $strListMdFormats = "verb=ListMetadataFormats";
514
515    my $strIdentifyCMD = $strBaseCMD;
516    $strIdentifyCMD =~ s/_OPTS_/$strIdentify/; 
517
518    my $strIdentifyText = $self->useWget($strIdentifyCMD);
519
520     if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
521    print STDERR "Server information is unavailable.\n";
522    print STDERR "<<Finished>>\n";
523        return; 
524    }
525
526    print STDERR "General information:\n";
527    $self->parse_xml($strIdentifyText);
528    print STDERR "\n";
529
530    print STDERR "=" x 10, "\n";
531    print STDERR "Metadata Format Information (metadataPrefix):\n";
532    print STDERR "=" x 10, "\n";
533
534    my $strListMdFormatsCMD = $strBaseCMD;
535    $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;   
536    my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
537
538    $self->parse_xml($strListMdFormatsText);
539    print STDERR "\n";
540
541    print STDERR "=" x 10, "\n";
542    print STDERR "List Information:\n";
543    print STDERR "=" x 10, "\n";
544
545    my $strListSetCMD = $strBaseCMD;
546    $strListSetCMD =~ s/_OPTS_/$strListSets/;   
547    my $strListSetsText = $self->useWget($strListSetCMD);
548
549    $self->parse_xml($strListSetsText);
550}
551
552sub parse_xml
553{   
554    my ($self) = shift (@_);
555    my ($xml_text) = @_;
556   
557    #### change this to work directly from $xml_text
558
559    #Open a temporary file to store OAI information, and store the information to the temp file
560    my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
561
562    open(*OAIOUT,"> $name");
563   
564    print OAIOUT $xml_text;
565    close(OAIOUT);
566
567    $self->{'temp_file_name'} = $name;
568
569##    print STDERR "**** xml text = $xml_text\n";
570
571    eval {
572    $self->{'parser'}->parsefile("$name");
573##  $self->{'parser'}->parse($xml_text);
574    };
575
576    if ($@) {
577    die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
578##  die "OAI: Parsed text is not a well formed XML file ($@)\n";
579    }
580
581    unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
582}
583
584####END
585#{
586#    if($self->{'info'})
587#    {
588#   unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
589#    }
590#}
591
592# This Char function overrides the one in XML::Parser::Stream to overcome a
593# problem where $expat->{Text} is treated as the return value, slowing
594# things down significantly in some cases.
595sub Char {   
596    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
597    $_[0]->{'Text'} .= $_[1];
598
599    my $self = $_[0]->{'PluginObj'};
600    if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
601    $self->{'text'} .= $_[1];
602    $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
603    if($self->{'text'} ne "")
604    {       
605        print STDERR " $self->{'subfield'}:($self->{'text'})\n";
606    }
607    }
608    return undef;
609}
610
611sub OAI_StartTag
612{
613    my ($expat, $element, %attr) = @_;
614
615    my $self = $expat->{'PluginObj'};
616    $self->{'subfield'} = $element;
617   
618}
619
620sub OAI_EndTag
621{
622    my ($expat, $element) = @_;
623
624    my $self = $expat->{'PluginObj'};
625    $self->{'text'} = "";
626    $self->{'subfield'} = "";
627}
628
629sub error
630{
631    my ($self,$strFunctionName,$strError) = @_;
632    {
633    print "Error occoured in OAIDownload.pm\n".
634        "In Function:".$strFunctionName."\n".
635        "Error Message:".$strError."\n";
636    exit(-1);
637    }
638}
639
640
641
6421;
Note: See TracBrowser for help on using the browser.