root/main/trunk/greenstone2/perllib/downloaders/OAIDownload.pm @ 32113

Revision 32113, 17.5 KB (checked in by kjdon, 3 years ago)

tmpnam isn't being used anyway, and it is deprecated in perl 5.24, removed in 5.26. so have removed it.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use util;
40
41sub BEGIN {
42    @OAIDownload::ISA = ('WgetDownload');
43}
44
45my $arguments =
46    [ { 'name' => "url",
47    'disp' => "{OAIDownload.url_disp}",
48    'desc' => "{OAIDownload.url}",
49    'type' => "string",
50    'reqd' => "yes"},
51      { 'name' => "metadata_prefix",
52    'disp' => "{OAIDownload.metadata_prefix_disp}",
53    'desc' => "{OAIDownload.metadata_prefix}",
54    'type' => "string",
55    'deft' => "oai_dc",
56    'reqd' => "no"},
57      { 'name' => "set",
58    'disp' => "{OAIDownload.set_disp}",
59    'desc' => "{OAIDownload.set}",
60    'type' => "string",
61    'reqd' => "no"},
62      { 'name' => "get_doc",
63    'disp' => "{OAIDownload.get_doc_disp}",
64    'desc' => "{OAIDownload.get_doc}",
65    'type' => "flag",
66    'reqd' => "no"},
67      { 'name' => "get_doc_exts",
68    'disp' => "{OAIDownload.get_doc_exts_disp}",
69    'desc' => "{OAIDownload.get_doc_exts}",
70    'type' => "string",
71    'deft' => "doc,pdf,ppt",
72    'reqd' => "no"},
73      { 'name' => "max_records",
74    'disp' => "{OAIDownload.max_records_disp}",
75    'desc' => "{OAIDownload.max_records}",
76    'type' => "int",
77    'range' => "1,",
78    'reqd' => "no"} ];
79
80my $options = { 'name'     => "OAIDownload",
81        'desc'     => "{OAIDownload.desc}",
82        'abstract' => "no",
83        'inherits' => "yes",
84        'args'     => $arguments };
85
86sub new
87{
88    my ($class) = shift (@_);
89    my ($getlist,$inputargs,$hashArgOptLists) = @_;
90    push(@$getlist, $class);
91
92    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
93    push(@{$hashArgOptLists->{"OptList"}},$options);
94
95    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
96
97    if ($self->{'info_only'}) {
98    # don't worry about any options etc
99    return bless $self, $class;
100    }
101
102    my $parser = new XML::Parser('Style' => 'Stream',
103                                 'PluginObj' => $self,
104                 'Handlers' => {'Char' => \&Char,
105                        'Start' => \&OAI_StartTag,
106                        'End' => \&OAI_EndTag
107                        });
108    $self->{'parser'} = $parser;
109   
110    # make sure the tmp directory that we will use later exists
111    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
112    if (! -e $tmp_dir) {
113    &FileUtils::makeDirectory($tmp_dir);
114    }
115
116    # if max_records not specified, parsing will have set it to ""
117    undef $self->{'max_records'} if $self->{'max_records'} eq "";
118
119    # set up hashmap for individual items in get_doc_exts
120    # to make testing for matches easier
121
122    $self->{'lookup_exts'} = {};
123    my $get_doc_exts = $self->{'get_doc_exts'};
124   
125    if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
126    my @exts = split(/,\s*/,$get_doc_exts);
127    foreach my $e (@exts) {
128        $self->{'lookup_exts'}->{lc($e)} = 1;
129    }
130    }
131
132
133    return bless $self, $class;
134}
135
136sub download
137{
138    my ($self) = shift (@_);
139    my ($hashGeneralOptions) = @_;
140
141##    my $cmdWget = $strWgetOptions;
142 
143    my $strOutputDir ="";
144    $strOutputDir = $hashGeneralOptions->{"cache_dir"};
145    my $strBasURL = $self->{'url'};
146    my $blnDownloadDoc = $self->{'get_doc'};
147
148    print STDERR "<<Defined Maximum>>\n";
149
150    my $strIDs = $self->getOAIIDs($strBasURL);
151 
152    if($strIDs eq "")
153    {
154    print STDERR "Error: No IDs found\n";
155    return 0;
156    }
157
158    my $aryIDs = $self->parseOAIIDs($strIDs);
159    my $intIDs = 0;
160    if(defined $self->{'max_records'} && $self->{'max_records'} < scalar(@$aryIDs))
161    {
162    $intIDs = $self->{'max_records'};
163    }
164    else
165    {
166    $intIDs = scalar(@$aryIDs);
167    }
168    print STDERR "<<Total number of record(s):$intIDs>>\n";
169
170    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $self->{'max_records'}, $blnDownloadDoc);
171
172#    my $tmp_file = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"tmp","oai.tmp");
173#    &FileUtils::removeFiles($tmp_file);
174
175    return 1;
176}
177
178sub getOAIIDs
179{
180    my ($self,$strBasURL) = @_;
181##    my ($cmdWget);
182     
183    my $wgetOptions = $self->getWgetOptions();
184
185    my $cmdWget = $wgetOptions;
186 
187    print STDERR  "Gathering OAI identifiers.....\n";
188   
189    my $metadata_prefix = $self->{'metadata_prefix'};
190    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
191
192    # if $set specified, add it in to URL
193    my $set = $self->{'set'};
194    $cmdWget .= "&set=$set" if ($set ne "");
195
196    $cmdWget .= "\" ";
197
198    my $accumulated_strIDs = "";
199    my $strIDs =  $self->useWget($cmdWget);
200
201    if (!defined $strIDs or $strIDs eq ""  ){
202    print STDERR "Server information is unavailable.\n";
203    print STDERR "<<Finished>>\n";
204        return; 
205    }
206    if ($self->{'forced_quit'}) {
207    return $strIDs;
208    }
209
210    print STDERR "<<Download Information>>\n";
211   
212    $self->parse_xml($strIDs);
213
214    $accumulated_strIDs = $strIDs;
215    my $max_recs = $self->{'max_records'};
216    while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/s) {
217    # top up list with further requests for IDs
218
219    my $resumption_token = $1;
220
221    $cmdWget = $wgetOptions;
222
223    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
224
225    $strIDs =  $self->useWget($cmdWget);
226    if ($self->{'forced_quit'}) {
227        return $accumulated_strIDs;
228    }
229
230    $self->parse_xml($strIDs);
231
232    $accumulated_strIDs .= $strIDs;
233
234    my @accumulated_identifiers
235        = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
236
237    my $num_acc_identifiers = scalar(@accumulated_identifiers);
238    if (defined  $max_recs && $num_acc_identifiers > $max_recs ) {
239        last;
240    }
241    }
242
243    return $accumulated_strIDs;
244}
245
246sub parseOAIIDs
247{   
248    my ($self,$strIDs) = @_;
249
250    print STDERR "Parsing OAI identifiers.....\n";
251    $strIDs =~ s/^.*?<identifier>/<identifier>/s;
252    $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
253
254    my @aryIDs = ();
255
256    while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
257    {
258    $strIDs = $2;
259    push(@aryIDs,$1);
260    }
261   
262    return \@aryIDs;
263}
264
265sub dirFileSplit
266{
267    my ($self,$strFile) = @_;
268
269    my @aryDirs = split("[/\]",$strFile);
270   
271    my $strLocalFile = pop(@aryDirs);
272    my $strSubDirs = join("/",@aryDirs);
273
274    return ($strSubDirs,$strLocalFile);
275}
276
277sub getOAIDoc
278{
279    my ($self,$strRecord, $oai_rec_filename) = @_;
280 
281    print  STDERR "Gathering source documents.....\n";
282    # look out for identifier tag in metadata section
283   
284    if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
285    {
286    my $strMetaTag = $1;
287    my $had_valid_url = 0;
288    my $count = 1;
289    while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
290    {
291        my $doc_id_url = $2;
292        print STDERR "Found doc url: $doc_id_url\n";
293        next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
294
295        my $orig_doc_id_url = $doc_id_url;
296        $had_valid_url = 1;
297
298        my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
299        my $faked_ext = 0;
300        my $primary_doc_match = 0;
301
302        my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
303
304        if (defined $id_file_ext) {
305        # cross-check this filename extension with get_doc_exts option
306        # if provided
307        my $lookup_exts = $self->{'lookup_exts'};
308
309        if (defined $lookup_exts->{lc($id_file_ext)}) {
310            # this initial URL matches requirement
311            $primary_doc_match = 1;
312        }
313        }
314        else {
315        $faked_ext = 1;
316        $id_file_ext = "html";
317        }
318
319
320        my $is_page_html = 0;
321        if($id_file_ext =~ m/^html?$/i) {
322            $is_page_html = 1;
323        } elsif ($doc_id_url) { # important: if no doc id has currently been typed into the url field, skip this block
324       
325            # get the page and check the header's content-type and see if this is text/html
326            # if so, $is_page_html is true
327            # See http://superuser.com/questions/197009/wget-head-request
328           
329            my $wget_opts3 = $self->getWgetOptions();
330            my $wget_header_cmd = "$wget_opts3 -S --spider \"$doc_id_url\"";
331            my $page_content = $self->useWget($wget_header_cmd);           
332           
333            if($page_content && $page_content =~ m@Content-Type:\s*text/html@i) {               
334                $is_page_html = 1;
335            }
336        }
337       
338        if (!$primary_doc_match && $is_page_html) {     
339       
340        # Download this doc if HTML, scan through it looking for a link
341        # that does match get_doc_exts
342       
343
344        # 1. Generate a tmp name
345        my $tmp_filename = &util::get_tmp_filename();
346
347        # 2. Download it
348        my $wget_opts2 = $self->getWgetOptions();
349        my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
350        my ($stdout_and_err2,$error2,$follow2) =  $self->useWgetMonitored($wget_cmd2);
351        return $strRecord if $self->{'forced_quit'};
352
353        if($error2 ne "")
354        {
355            print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
356            #exit(-1);
357            next;
358        }
359
360        if (defined $follow2) {
361            # src url was "redirected" to another place
362            # => pick up on this and make it the new doc_id_url
363            $doc_id_url = $follow2;
364        }
365
366        my $primary_doc_html = "";
367        if (open(HIN,"<$tmp_filename")) {
368            my $line;
369            while (defined ($line = <HIN>)) {
370            $primary_doc_html .= $line;
371            }
372            close(HIN);
373
374            # 3. Scan through it looking for match
375            #
376            # if got match, change $doc_id_url to this new URL and
377            # $id_file_ext to 'match'
378           
379            my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
380
381            my $lookup_exts = $self->{'lookup_exts'};
382
383            foreach my $href (@href_links) {
384            my ($ext) = ($href =~ m/\.([^\.]+)$/);
385
386            if ((defined $ext) && (defined $lookup_exts->{$ext})) {
387
388                if ($href !~ m/^(https?|ftp):\/\//) {
389                # link is within current site
390                my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
391
392                $href = "$site_domain$href";
393                }
394
395                $doc_id_url = $href;
396                $id_file_ext = $ext;
397                last;
398            }
399            }
400        }
401        else {
402            print STDERR "Error occurred while retrieving OAI source documents (2):\n";
403            print STDERR "$!\n";
404        }
405
406        if (-e $tmp_filename) {
407            &FileUtils::removeFiles($tmp_filename);
408        }
409        }
410
411        my $download_doc_filename = $oai_rec_filename;
412        # As requested by John Rose, don't suffix -$count for the very first document (so no "-1" suffix):
413        my $new_extension = ($count > 1) ? "\-$count\.$id_file_ext" : "\.$id_file_ext";
414        $count++;
415        #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
416        $download_doc_filename =~ s/\.oai$/$new_extension/;
417        my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
418
419        # may have &apos; in url - others??
420        my $safe_doc_id_url = $doc_id_url;
421        $safe_doc_id_url =~ s/&apos;/\'/g;
422
423        my $wget_opts = $self->getWgetOptions();
424        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
425       
426        my ($stdout_and_err,$errors,$follow) =  $self->useWgetMonitored($wget_cmd);
427        return $strRecord if $self->{'forced_quit'};
428
429        if($errors ne "")
430        {
431        print STDERR "Error occured while retriving OAI souce documents (3):\n";
432        print STDERR "$errors\n";
433        #exit(-1);
434        next;
435        }
436
437       
438        $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
439    }
440
441    if (!$had_valid_url)
442    {
443        print  STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
444    }
445    }
446    else
447    {
448    print  STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
449    }
450
451    return $strRecord;
452}
453
454sub getOAIRecords
455{
456    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
457    my $intDocCounter = 0;
458
459    my $metadata_prefix = $self->{'metadata_prefix'};
460
461    foreach my $strID ( @$aryIDs)
462    {
463    print  STDERR "Gathering OAI record with ID $strID.....\n";
464       
465    my $wget_opts = $self->getWgetOptions();
466    my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
467   
468    my $strRecord =  $self->useWget($cmdWget);
469
470    my @fileDirs = split(":",$strID); 
471    my $local_id = pop @fileDirs;
472
473    # setup directories
474
475        $strOutputDir  =~ s/"//g; #"
476
477        my $host =$self->{'url'};
478 
479        $host =~ s@https?:\/\/@@g;
480
481        $host =~ s/:.*//g;
482
483    my $strFileURL = "";
484    if ($strOutputDir ne "") {
485        $strFileURL = "$strOutputDir/";
486    }
487    $strFileURL .= "$host/$local_id.oai";
488
489    # prepare subdirectory for record (if needed)
490    my ($strSubDirPath,$unused) = ("", "");
491
492        ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
493   
494    &FileUtils::makeAllDirectories($strSubDirPath);
495
496    my $ds = &util::get_dirsep();
497   
498    if($blnDownloadDoc)
499    {
500        $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
501    }
502
503    # save record
504    open (OAIOUT,">$strFileURL")
505        || die "Unable to save oai metadata record: $!\n";
506    print OAIOUT $strRecord;
507    close(OAIOUT);
508
509        print STDERR "Saving record to $strFileURL\n";
510        print STDERR "<<Done>>\n";
511    $intDocCounter ++; 
512    last if (defined $intMaxRecords && $intDocCounter >= $intMaxRecords);
513    }
514
515    (defined $intMaxRecords && $intDocCounter >= $intMaxRecords) ?
516    print  STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
517    print  STDERR "Complete download meta record from $strBasURL\n";
518
519       print STDERR "<<Finished>>\n";
520}
521
522sub url_information
523{
524    my ($self) = shift (@_);
525    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
526   
527    my $wgetOptions = $self->getWgetOptions();
528    my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
529 
530    my $strIdentify = "verb=Identify";
531    my $strListSets = "verb=ListSets";
532    my $strListMdFormats = "verb=ListMetadataFormats";
533
534    my $strIdentifyCMD = $strBaseCMD;
535    $strIdentifyCMD =~ s/_OPTS_/$strIdentify/; 
536
537    my $strIdentifyText = $self->useWget($strIdentifyCMD);
538
539     if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
540    print STDERR "Server information is unavailable.\n";
541    print STDERR "<<Finished>>\n";
542        return; 
543    }
544
545    print STDERR "General information:\n";
546    $self->parse_xml($strIdentifyText);
547    print STDERR "\n";
548
549    print STDERR "=" x 10, "\n";
550    print STDERR "Metadata Format Information (metadataPrefix):\n";
551    print STDERR "=" x 10, "\n";
552
553    my $strListMdFormatsCMD = $strBaseCMD;
554    $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;   
555    my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
556
557    $self->parse_xml($strListMdFormatsText);
558    print STDERR "\n";
559
560    print STDERR "=" x 10, "\n";
561    print STDERR "List Information:\n";
562    print STDERR "=" x 10, "\n";
563
564    my $strListSetCMD = $strBaseCMD;
565    $strListSetCMD =~ s/_OPTS_/$strListSets/;   
566    my $strListSetsText = $self->useWget($strListSetCMD);
567
568    $self->parse_xml($strListSetsText);
569}
570
571sub parse_xml
572{   
573    my ($self) = shift (@_);
574    my ($xml_text) = @_;
575   
576    #### change this to work directly from $xml_text
577
578    #Open a temporary file to store OAI information, and store the information to the temp file
579    my $name = &FileUtils::filenameConcatenate($ENV{GSDLHOME},"tmp","oai.tmp");
580
581    open(*OAIOUT,"> $name");
582   
583    print OAIOUT $xml_text;
584    close(OAIOUT);
585
586    $self->{'temp_file_name'} = $name;
587
588##    print STDERR "**** xml text = $xml_text\n";
589
590    eval {
591    $self->{'parser'}->parsefile("$name");
592##  $self->{'parser'}->parse($xml_text);
593    };
594
595    if ($@) {
596    die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
597##  die "OAI: Parsed text is not a well formed XML file ($@)\n";
598    }
599
600    unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
601}
602
603####END
604#{
605#    if($self->{'info'})
606#    {
607#   unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
608#    }
609#}
610
611# This Char function overrides the one in XML::Parser::Stream to overcome a
612# problem where $expat->{Text} is treated as the return value, slowing
613# things down significantly in some cases.
614sub Char {   
615    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
616    $_[0]->{'Text'} .= $_[1];
617
618    my $self = $_[0]->{'PluginObj'};
619    if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
620    $self->{'text'} .= $_[1];
621    $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
622    if($self->{'text'} ne "")
623    {       
624        print STDERR " $self->{'subfield'}:($self->{'text'})\n";
625    }
626    }
627    return undef;
628}
629
630sub OAI_StartTag
631{
632    my ($expat, $element, %attr) = @_;
633
634    my $self = $expat->{'PluginObj'};
635    $self->{'subfield'} = $element;
636   
637}
638
639sub OAI_EndTag
640{
641    my ($expat, $element) = @_;
642
643    my $self = $expat->{'PluginObj'};
644    $self->{'text'} = "";
645    $self->{'subfield'} = "";
646}
647
648sub error
649{
650    my ($self,$strFunctionName,$strError) = @_;
651    {
652    print "Error occoured in OAIDownload.pm\n".
653        "In Function:".$strFunctionName."\n".
654        "Error Message:".$strError."\n";
655    exit(-1);
656    }
657}
658
659
660
6611;
Note: See TracBrowser for help on using the browser.