root/main/trunk/greenstone2/perllib/downloaders/OAIDownload.pm @ 25199

Revision 25199, 16.5 KB (checked in by ak19, 9 years ago)

DLConsulting fixed a bug in a regex that prevented OAIDownload.pm from working properly.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
42sub BEGIN {
43    @OAIDownload::ISA = ('WgetDownload');
44}
45
46my $arguments =
47    [ { 'name' => "url",
48    'disp' => "{OAIDownload.url_disp}",
49    'desc' => "{OAIDownload.url}",
50    'type' => "string",
51    'reqd' => "yes"},
52      { 'name' => "metadata_prefix",
53    'disp' => "{OAIDownload.metadata_prefix_disp}",
54    'desc' => "{OAIDownload.metadata_prefix}",
55    'type' => "string",
56    'deft' => "oai_dc",
57    'reqd' => "no"},
58      { 'name' => "set",
59    'disp' => "{OAIDownload.set_disp}",
60    'desc' => "{OAIDownload.set}",
61    'type' => "string",
62    'reqd' => "no"},
63      { 'name' => "get_doc",
64    'disp' => "{OAIDownload.get_doc_disp}",
65    'desc' => "{OAIDownload.get_doc}",
66    'type' => "flag",
67    'reqd' => "no"},
68      { 'name' => "get_doc_exts",
69    'disp' => "{OAIDownload.get_doc_exts_disp}",
70    'desc' => "{OAIDownload.get_doc_exts}",
71    'type' => "string",
72    'deft' => "doc,pdf,ppt",
73    'reqd' => "no"},
74      { 'name' => "max_records",
75    'disp' => "{OAIDownload.max_records_disp}",
76    'desc' => "{OAIDownload.max_records}",
77    'type' => "int",
78    'deft' => "500",
79    'range' => "1,",
80    'reqd' => "no"} ];
81
82my $options = { 'name'     => "OAIDownload",
83        'desc'     => "{OAIDownload.desc}",
84        'abstract' => "no",
85        'inherits' => "yes",
86        'args'     => $arguments };
87
88##my $self;
89
90#### my $strWgetOptions="";
91
92sub new
93{
94    my ($class) = shift (@_);
95    my ($getlist,$inputargs,$hashArgOptLists) = @_;
96    push(@$getlist, $class);
97
98    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
99    push(@{$hashArgOptLists->{"OptList"}},$options);
100
101    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
102
103    if ($self->{'info_only'}) {
104    # don't worry about any options etc
105    return bless $self, $class;
106    }
107
108    my $parser = new XML::Parser('Style' => 'Stream',
109                                 'PluginObj' => $self,
110                 'Handlers' => {'Char' => \&Char,
111                        'Start' => \&OAI_StartTag,
112                        'End' => \&OAI_EndTag
113                        });
114    $self->{'parser'} = $parser;
115   
116    # make sure the tmp directory that we will use later exists
117    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
118    if (! -e $tmp_dir) {
119    &util::mk_dir($tmp_dir);
120    }
121   
122
123    # set up hashmap for individual items in get_doc_exts
124    # to make testing for matches easier
125
126    $self->{'lookup_exts'} = {};
127    my $get_doc_exts = $self->{'get_doc_exts'};
128
129    if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
130    my @exts = split(/,\s*/,$get_doc_exts);
131    foreach my $e (@exts) {
132        $self->{'lookup_exts'}->{lc($e)} = 1;
133    }
134    }
135
136
137    return bless $self, $class;
138}
139
140sub download
141{
142    my ($self) = shift (@_);
143    my ($hashGeneralOptions) = @_;
144
145##    my $cmdWget = $strWgetOptions;
146 
147    my $strOutputDir ="";
148    $strOutputDir = $hashGeneralOptions->{"cache_dir"};
149    my $strBasURL = $self->{'url'};
150    my $intMaxRecords = $self->{'max_records'};
151    my $blnDownloadDoc = $self->{'get_doc'};
152
153    print STDERR "<<Defined Maximum>>\n";
154
155    my $strIDs = $self->getOAIIDs($strBasURL);
156 
157    if($strIDs eq "")
158    {
159    print STDERR "Error: No IDs found\n";
160    return 0;
161    }
162
163    my $aryIDs = $self->parseOAIIDs($strIDs);
164    my $intIDs = 0;
165    if($self->{'max_records'} < scalar(@$aryIDs))
166    {
167    $intIDs = $self->{'max_records'};
168    }
169    else
170    {
171    $intIDs = scalar(@$aryIDs);
172    }
173    print STDERR "<<Total number of record(s):$intIDs>>\n";
174
175    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);
176
177#    my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
178#    &util::rm($tmp_file);
179
180    return 1;
181}
182
183sub getOAIIDs
184{
185    my ($self,$strBasURL) = @_;
186##    my ($cmdWget);
187     
188    my $wgetOptions = $self->getWgetOptions();
189
190    my $cmdWget = $wgetOptions;
191 
192    print STDERR  "Gathering OAI identifiers.....\n";
193
194    my $metadata_prefix = $self->{'metadata_prefix'};
195    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
196
197    # if $set specified, add it in to URL
198    my $set = $self->{'set'};
199    $cmdWget .= "&set=$set" if ($set ne "");
200
201    $cmdWget .= "\" ";
202
203    my $accumulated_strIDs = "";
204    my $strIDs =  $self->useWget($cmdWget);
205
206    if (!defined $strIDs or $strIDs eq ""  ){
207    print STDERR "Server information is unavailable.\n";
208    print STDERR "<<Finished>>\n";
209        return; 
210    }
211    if ($self->{'forced_quit'}) {
212    return $strIDs;
213    }
214
215    print STDERR "<<Download Information>>\n";
216   
217    $self->parse_xml($strIDs);
218
219    $accumulated_strIDs = $strIDs;
220
221    while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/s) {
222    # top up list with further requests for IDs
223
224    my $resumption_token = $1;
225
226    $cmdWget = $wgetOptions;
227
228    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
229
230    $strIDs =  $self->useWget($cmdWget);
231    if ($self->{'forced_quit'}) {
232        return $accumulated_strIDs;
233    }
234
235    $self->parse_xml($strIDs);
236
237    $accumulated_strIDs .= $strIDs;
238
239    my @accumulated_identifiers
240        = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
241
242    my $num_acc_identifiers = scalar(@accumulated_identifiers);
243    if ($num_acc_identifiers > $self->{'max_records'}) {
244        last;
245    }
246    }
247
248    return $accumulated_strIDs;
249}
250
251sub parseOAIIDs
252{   
253    my ($self,$strIDs) = @_;
254
255    print STDERR "Parsing OAI identifiers.....\n";
256    $strIDs =~ s/^.*?<identifier>/<identifier>/s;
257    $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
258
259    my @aryIDs = ();
260
261    while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
262    {
263    $strIDs = $2;
264    push(@aryIDs,$1);
265    }
266   
267    return \@aryIDs;
268}
269
270sub dirFileSplit
271{
272    my ($self,$strFile) = @_;
273
274    my @aryDirs = split("[/\]",$strFile);
275   
276    my $strLocalFile = pop(@aryDirs);
277    my $strSubDirs = join("/",@aryDirs);
278
279    return ($strSubDirs,$strLocalFile);
280}
281
282sub getOAIDoc
283{
284    my ($self,$strRecord, $oai_rec_filename) = @_;
285 
286    print  STDERR "Gathering source documents.....\n";
287    # look out for identifier tag in metadata section
288   
289    if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
290    {
291    my $strMetaTag = $1;
292    my $had_valid_url = 0;
293    my $count = 1;
294    while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
295    {
296        my $doc_id_url = $2;
297        print STDERR "Found doc url: $doc_id_url\n";
298        next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
299
300        my $orig_doc_id_url = $doc_id_url;
301        $had_valid_url = 1;
302
303        my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
304        my $faked_ext = 0;
305        my $primary_doc_match = 0;
306
307        my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
308
309        if (defined $id_file_ext) {
310        # cross-check this filename extension with get_doc_exts option
311        # if provided
312        my $lookup_exts = $self->{'lookup_exts'};
313
314        if (defined $lookup_exts->{lc($id_file_ext)}) {
315            # this initial URL matches requirement
316            $primary_doc_match = 1;
317        }
318        }
319        else {
320        $faked_ext = 1;
321        $id_file_ext = "html";
322        }
323
324       
325        if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
326        # Download this doc if HTML, scan through it looking for a link
327        # that does match get_doc_exts
328       
329
330        # 1. Generate a tmp name
331        my $tmp_filename = &util::get_tmp_filename();
332
333        # 2. Download it
334        my $wget_opts2 = $self->getWgetOptions();
335        my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
336        my ($stdout_and_err2,$error2,$follow2) =  $self->useWgetMonitored($wget_cmd2);
337        return $strRecord if $self->{'forced_quit'};
338
339        if($error2 ne "")
340        {
341            print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
342            #exit(-1);
343            next;
344        }
345
346        if (defined $follow2) {
347            # src url was "redirected" to another place
348            # => pick up on this and make it the new doc_id_url
349            $doc_id_url = $follow2;
350        }
351
352        my $primary_doc_html = "";
353        if (open(HIN,"<$tmp_filename")) {
354            my $line;
355            while (defined ($line = <HIN>)) {
356            $primary_doc_html .= $line;
357            }
358            close(HIN);
359
360            # 3. Scan through it looking for match
361            #
362            # if got match, change $doc_id_url to this new URL and
363            # $id_file_ext to 'match'
364           
365            my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
366
367            my $lookup_exts = $self->{'lookup_exts'};
368
369            foreach my $href (@href_links) {
370            my ($ext) = ($href =~ m/\.([^\.]+)$/);
371
372            if ((defined $ext) && (defined $lookup_exts->{$ext})) {
373
374                if ($href !~ m/^(https?|ftp):\/\//) {
375                # link is within current site
376                my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
377
378                $href = "$site_domain$href";
379                }
380
381                $doc_id_url = $href;
382                $id_file_ext = $ext;
383                last;
384            }
385            }
386        }
387        else {
388            print STDERR "Error occurred while retrieving OAI source documents (2):\n";
389            print STDERR "$!\n";
390        }
391
392        if (-e $tmp_filename) {
393            &util::rm($tmp_filename);
394        }
395        }
396
397        my $download_doc_filename = $oai_rec_filename;
398        my $new_extension = "\-$count\.$id_file_ext";
399        $count++;
400        #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
401        $download_doc_filename =~ s/\.oai$/$new_extension/;
402        my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
403
404        # may have &apos; in url - others??
405        my $safe_doc_id_url = $doc_id_url;
406        $safe_doc_id_url =~ s/&apos;/\'/g;
407
408        my $wget_opts = $self->getWgetOptions();
409        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
410       
411        my ($stdout_and_err,$errors,$follow) =  $self->useWgetMonitored($wget_cmd);
412        return $strRecord if $self->{'forced_quit'};
413
414        if($errors ne "")
415        {
416        print STDERR "Error occured while retriving OAI souce documents (3):\n";
417        print STDERR "$errors\n";
418        #exit(-1);
419        next;
420        }
421
422       
423        $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
424    }
425
426    if (!$had_valid_url)
427    {
428        print  STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
429    }
430    }
431    else
432    {
433    print  STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
434    }
435
436    return $strRecord;
437}
438
439sub getOAIRecords
440{
441    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
442
443    my $intDocCounter = 0;
444
445    my $metadata_prefix = $self->{'metadata_prefix'};
446
447    foreach my $strID ( @$aryIDs)
448    {
449    print  STDERR "Gathering OAI record with ID $strID.....\n";
450       
451    my $wget_opts = $self->getWgetOptions();
452    my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
453   
454    my $strRecord =  $self->useWget($cmdWget);
455
456    my @fileDirs = split(":",$strID); 
457    my $local_id = pop @fileDirs;
458
459    # setup directories
460
461        $strOutputDir  =~ s/"//g; #"
462
463        my $host =$self->{'url'};
464 
465        $host =~ s/https?:\/\///g;
466
467        $host =~ s/:.*//g;
468
469    my $strFileURL = "$strOutputDir/$host/$local_id.oai";
470
471
472    # prepare subdirectory for record (if needed)
473    my ($strSubDirPath,$unused) = ("", "");
474
475        ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
476   
477    &util::mk_all_dir($strSubDirPath);
478
479    my $ds = &util::get_dirsep();
480   
481    if($blnDownloadDoc)
482    {
483        $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
484    }
485
486    # save record
487    open (OAIOUT,">$strFileURL")
488        || die "Unable to save oai metadata record: $!\n";
489    print OAIOUT $strRecord;
490    close(OAIOUT);
491
492        print STDERR "Saving records to $strFileURL\n";
493        print STDERR "<<Done>>\n";
494    $intDocCounter ++; 
495    last if ($intDocCounter >= $intMaxRecords);
496    }
497
498    ($intDocCounter >= $intMaxRecords) ?
499    print  STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
500    print  STDERR "Complete download meta record from $strBasURL\n";
501
502       print STDERR "<<Finished>>\n";
503}
504
505sub url_information
506{
507    my ($self) = shift (@_);
508    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
509   
510    my $wgetOptions = $self->getWgetOptions();
511    my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
512 
513    my $strIdentify = "verb=Identify";
514    my $strListSets = "verb=ListSets";
515    my $strListMdFormats = "verb=ListMetadataFormats";
516
517    my $strIdentifyCMD = $strBaseCMD;
518    $strIdentifyCMD =~ s/_OPTS_/$strIdentify/; 
519
520    my $strIdentifyText = $self->useWget($strIdentifyCMD);
521
522     if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
523    print STDERR "Server information is unavailable.\n";
524    print STDERR "<<Finished>>\n";
525        return; 
526    }
527
528    print STDERR "General information:\n";
529    $self->parse_xml($strIdentifyText);
530    print STDERR "\n";
531
532    print STDERR "=" x 10, "\n";
533    print STDERR "Metadata Format Information (metadataPrefix):\n";
534    print STDERR "=" x 10, "\n";
535
536    my $strListMdFormatsCMD = $strBaseCMD;
537    $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;   
538    my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
539
540    $self->parse_xml($strListMdFormatsText);
541    print STDERR "\n";
542
543    print STDERR "=" x 10, "\n";
544    print STDERR "List Information:\n";
545    print STDERR "=" x 10, "\n";
546
547    my $strListSetCMD = $strBaseCMD;
548    $strListSetCMD =~ s/_OPTS_/$strListSets/;   
549    my $strListSetsText = $self->useWget($strListSetCMD);
550
551    $self->parse_xml($strListSetsText);
552}
553
554sub parse_xml
555{   
556    my ($self) = shift (@_);
557    my ($xml_text) = @_;
558   
559    #### change this to work directly from $xml_text
560
561    #Open a temporary file to store OAI information, and store the information to the temp file
562    my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
563
564    open(*OAIOUT,"> $name");
565   
566    print OAIOUT $xml_text;
567    close(OAIOUT);
568
569    $self->{'temp_file_name'} = $name;
570
571##    print STDERR "**** xml text = $xml_text\n";
572
573    eval {
574    $self->{'parser'}->parsefile("$name");
575##  $self->{'parser'}->parse($xml_text);
576    };
577   
578    if ($@) {
579    die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
580##  die "OAI: Parsed text is not a well formed XML file ($@)\n";
581    }
582
583    unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
584}
585
586####END
587#{
588#    if($self->{'info'})
589#    {
590#   unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
591#    }
592#}
593
594# This Char function overrides the one in XML::Parser::Stream to overcome a
595# problem where $expat->{Text} is treated as the return value, slowing
596# things down significantly in some cases.
597sub Char {   
598    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
599    $_[0]->{'Text'} .= $_[1];
600
601    my $self = $_[0]->{'PluginObj'};
602    if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
603    $self->{'text'} .= $_[1];
604    $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
605    if($self->{'text'} ne "")
606    {       
607        print STDERR " $self->{'subfield'}:($self->{'text'})\n";
608    }
609    }
610    return undef;
611}
612
613sub OAI_StartTag
614{
615    my ($expat, $element, %attr) = @_;
616
617    my $self = $expat->{'PluginObj'};
618    $self->{'subfield'} = $element;
619   
620}
621
622sub OAI_EndTag
623{
624    my ($expat, $element) = @_;
625
626    my $self = $expat->{'PluginObj'};
627    $self->{'text'} = "";
628    $self->{'subfield'} = "";
629}
630
631sub error
632{
633    my ($self,$strFunctionName,$strError) = @_;
634    {
635    print "Error occoured in OAIDownload.pm\n".
636        "In Function:".$strFunctionName."\n".
637        "Error Message:".$strError."\n";
638    exit(-1);
639    }
640}
641
642
643
6441;
Note: See TracBrowser for help on using the browser.