root/main/trunk/greenstone2/perllib/downloaders/OAIDownload.pm @ 28250

Revision 28250, 17.6 KB (checked in by ak19, 7 years ago)

1. Jenny and I fixed an oversight in OAIDownload, thanks to Kathy's suggestion, where an html page's contents used to be inspected for file types specified to be downloaded with GetDocument? ONLY if the file extension was htm(l). Now the test is whether the header specifies Content-Type text/html. 2. Deprecated utils functions replaced with their FileUtils? equivalents.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
42sub BEGIN {
43    @OAIDownload::ISA = ('WgetDownload');
44}
45
46my $arguments =
47    [ { 'name' => "url",
48    'disp' => "{OAIDownload.url_disp}",
49    'desc' => "{OAIDownload.url}",
50    'type' => "string",
51    'reqd' => "yes"},
52      { 'name' => "metadata_prefix",
53    'disp' => "{OAIDownload.metadata_prefix_disp}",
54    'desc' => "{OAIDownload.metadata_prefix}",
55    'type' => "string",
56    'deft' => "oai_dc",
57    'reqd' => "no"},
58      { 'name' => "set",
59    'disp' => "{OAIDownload.set_disp}",
60    'desc' => "{OAIDownload.set}",
61    'type' => "string",
62    'reqd' => "no"},
63      { 'name' => "get_doc",
64    'disp' => "{OAIDownload.get_doc_disp}",
65    'desc' => "{OAIDownload.get_doc}",
66    'type' => "flag",
67    'reqd' => "no"},
68      { 'name' => "get_doc_exts",
69    'disp' => "{OAIDownload.get_doc_exts_disp}",
70    'desc' => "{OAIDownload.get_doc_exts}",
71    'type' => "string",
72    'deft' => "doc,pdf,ppt",
73    'reqd' => "no"},
74      { 'name' => "max_records",
75    'disp' => "{OAIDownload.max_records_disp}",
76    'desc' => "{OAIDownload.max_records}",
77    'type' => "int",
78    'range' => "1,",
79    'reqd' => "no"} ];
80
81my $options = { 'name'     => "OAIDownload",
82        'desc'     => "{OAIDownload.desc}",
83        'abstract' => "no",
84        'inherits' => "yes",
85        'args'     => $arguments };
86
87sub new
88{
89    my ($class) = shift (@_);
90    my ($getlist,$inputargs,$hashArgOptLists) = @_;
91    push(@$getlist, $class);
92
93    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
94    push(@{$hashArgOptLists->{"OptList"}},$options);
95
96    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
97
98    if ($self->{'info_only'}) {
99    # don't worry about any options etc
100    return bless $self, $class;
101    }
102
103    my $parser = new XML::Parser('Style' => 'Stream',
104                                 'PluginObj' => $self,
105                 'Handlers' => {'Char' => \&Char,
106                        'Start' => \&OAI_StartTag,
107                        'End' => \&OAI_EndTag
108                        });
109    $self->{'parser'} = $parser;
110   
111    # make sure the tmp directory that we will use later exists
112    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
113    if (! -e $tmp_dir) {
114    &FileUtils::makeDirectory($tmp_dir);
115    }
116
117    # if max_records not specified, parsing will have set it to ""
118    undef $self->{'max_records'} if $self->{'max_records'} eq "";
119
120    # set up hashmap for individual items in get_doc_exts
121    # to make testing for matches easier
122
123    $self->{'lookup_exts'} = {};
124    my $get_doc_exts = $self->{'get_doc_exts'};
125   
126    if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
127    my @exts = split(/,\s*/,$get_doc_exts);
128    foreach my $e (@exts) {
129        $self->{'lookup_exts'}->{lc($e)} = 1;
130    }
131    }
132
133
134    return bless $self, $class;
135}
136
137sub download
138{
139    my ($self) = shift (@_);
140    my ($hashGeneralOptions) = @_;
141
142##    my $cmdWget = $strWgetOptions;
143 
144    my $strOutputDir ="";
145    $strOutputDir = $hashGeneralOptions->{"cache_dir"};
146    my $strBasURL = $self->{'url'};
147    my $blnDownloadDoc = $self->{'get_doc'};
148
149    print STDERR "<<Defined Maximum>>\n";
150
151    my $strIDs = $self->getOAIIDs($strBasURL);
152 
153    if($strIDs eq "")
154    {
155    print STDERR "Error: No IDs found\n";
156    return 0;
157    }
158
159    my $aryIDs = $self->parseOAIIDs($strIDs);
160    my $intIDs = 0;
161    if(defined $self->{'max_records'} && $self->{'max_records'} < scalar(@$aryIDs))
162    {
163    $intIDs = $self->{'max_records'};
164    }
165    else
166    {
167    $intIDs = scalar(@$aryIDs);
168    }
169    print STDERR "<<Total number of record(s):$intIDs>>\n";
170
171    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $self->{'max_records'}, $blnDownloadDoc);
172
173#    my $tmp_file = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"tmp","oai.tmp");
174#    &FileUtils::removeFiles($tmp_file);
175
176    return 1;
177}
178
179sub getOAIIDs
180{
181    my ($self,$strBasURL) = @_;
182##    my ($cmdWget);
183     
184    my $wgetOptions = $self->getWgetOptions();
185
186    my $cmdWget = $wgetOptions;
187 
188    print STDERR  "Gathering OAI identifiers.....\n";
189   
190    my $metadata_prefix = $self->{'metadata_prefix'};
191    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
192
193    # if $set specified, add it in to URL
194    my $set = $self->{'set'};
195    $cmdWget .= "&set=$set" if ($set ne "");
196
197    $cmdWget .= "\" ";
198
199    my $accumulated_strIDs = "";
200    my $strIDs =  $self->useWget($cmdWget);
201
202    if (!defined $strIDs or $strIDs eq ""  ){
203    print STDERR "Server information is unavailable.\n";
204    print STDERR "<<Finished>>\n";
205        return; 
206    }
207    if ($self->{'forced_quit'}) {
208    return $strIDs;
209    }
210
211    print STDERR "<<Download Information>>\n";
212   
213    $self->parse_xml($strIDs);
214
215    $accumulated_strIDs = $strIDs;
216    my $max_recs = $self->{'max_records'};
217    while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/s) {
218    # top up list with further requests for IDs
219
220    my $resumption_token = $1;
221
222    $cmdWget = $wgetOptions;
223
224    $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
225
226    $strIDs =  $self->useWget($cmdWget);
227    if ($self->{'forced_quit'}) {
228        return $accumulated_strIDs;
229    }
230
231    $self->parse_xml($strIDs);
232
233    $accumulated_strIDs .= $strIDs;
234
235    my @accumulated_identifiers
236        = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
237
238    my $num_acc_identifiers = scalar(@accumulated_identifiers);
239    if (defined  $max_recs && $num_acc_identifiers > $max_recs ) {
240        last;
241    }
242    }
243
244    return $accumulated_strIDs;
245}
246
247sub parseOAIIDs
248{   
249    my ($self,$strIDs) = @_;
250
251    print STDERR "Parsing OAI identifiers.....\n";
252    $strIDs =~ s/^.*?<identifier>/<identifier>/s;
253    $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
254
255    my @aryIDs = ();
256
257    while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
258    {
259    $strIDs = $2;
260    push(@aryIDs,$1);
261    }
262   
263    return \@aryIDs;
264}
265
266sub dirFileSplit
267{
268    my ($self,$strFile) = @_;
269
270    my @aryDirs = split("[/\]",$strFile);
271   
272    my $strLocalFile = pop(@aryDirs);
273    my $strSubDirs = join("/",@aryDirs);
274
275    return ($strSubDirs,$strLocalFile);
276}
277
278sub getOAIDoc
279{
280    my ($self,$strRecord, $oai_rec_filename) = @_;
281 
282    print  STDERR "Gathering source documents.....\n";
283    # look out for identifier tag in metadata section
284   
285    if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
286    {
287    my $strMetaTag = $1;
288    my $had_valid_url = 0;
289    my $count = 1;
290    while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
291    {
292        my $doc_id_url = $2;
293        print STDERR "Found doc url: $doc_id_url\n";
294        next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
295
296        my $orig_doc_id_url = $doc_id_url;
297        $had_valid_url = 1;
298
299        my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
300        my $faked_ext = 0;
301        my $primary_doc_match = 0;
302
303        my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
304
305        if (defined $id_file_ext) {
306        # cross-check this filename extension with get_doc_exts option
307        # if provided
308        my $lookup_exts = $self->{'lookup_exts'};
309
310        if (defined $lookup_exts->{lc($id_file_ext)}) {
311            # this initial URL matches requirement
312            $primary_doc_match = 1;
313        }
314        }
315        else {
316        $faked_ext = 1;
317        $id_file_ext = "html";
318        }
319
320
321        my $is_page_html = 0;
322        if($id_file_ext =~ m/^html?$/i) {
323            $is_page_html = 1;
324        } elsif ($doc_id_url) { # important: if no doc id has currently been typed into the url field, skip this block
325       
326            # get the page and check the header's content-type and see if this is text/html
327            # if so, $is_page_html is true
328            # See http://superuser.com/questions/197009/wget-head-request
329           
330            my $wget_opts3 = $self->getWgetOptions();
331            my $wget_header_cmd = "$wget_opts3 -S --spider \"$doc_id_url\"";
332            my $page_content = $self->useWget($wget_header_cmd);           
333           
334            if($page_content && $page_content =~ m@Content-Type:\s*text/html@i) {               
335                $is_page_html = 1;
336            }
337        }
338       
339        if (!$primary_doc_match && $is_page_html) {     
340       
341        # Download this doc if HTML, scan through it looking for a link
342        # that does match get_doc_exts
343       
344
345        # 1. Generate a tmp name
346        my $tmp_filename = &util::get_tmp_filename();
347
348        # 2. Download it
349        my $wget_opts2 = $self->getWgetOptions();
350        my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
351        my ($stdout_and_err2,$error2,$follow2) =  $self->useWgetMonitored($wget_cmd2);
352        return $strRecord if $self->{'forced_quit'};
353
354        if($error2 ne "")
355        {
356            print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
357            #exit(-1);
358            next;
359        }
360
361        if (defined $follow2) {
362            # src url was "redirected" to another place
363            # => pick up on this and make it the new doc_id_url
364            $doc_id_url = $follow2;
365        }
366
367        my $primary_doc_html = "";
368        if (open(HIN,"<$tmp_filename")) {
369            my $line;
370            while (defined ($line = <HIN>)) {
371            $primary_doc_html .= $line;
372            }
373            close(HIN);
374
375            # 3. Scan through it looking for match
376            #
377            # if got match, change $doc_id_url to this new URL and
378            # $id_file_ext to 'match'
379           
380            my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
381
382            my $lookup_exts = $self->{'lookup_exts'};
383
384            foreach my $href (@href_links) {
385            my ($ext) = ($href =~ m/\.([^\.]+)$/);
386
387            if ((defined $ext) && (defined $lookup_exts->{$ext})) {
388
389                if ($href !~ m/^(https?|ftp):\/\//) {
390                # link is within current site
391                my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
392
393                $href = "$site_domain$href";
394                }
395
396                $doc_id_url = $href;
397                $id_file_ext = $ext;
398                last;
399            }
400            }
401        }
402        else {
403            print STDERR "Error occurred while retrieving OAI source documents (2):\n";
404            print STDERR "$!\n";
405        }
406
407        if (-e $tmp_filename) {
408            &FileUtils::removeFiles($tmp_filename);
409        }
410        }
411
412        my $download_doc_filename = $oai_rec_filename;
413        # As requested by John Rose, don't suffix -$count for the very first document (so no "-1" suffix):
414        my $new_extension = ($count > 1) ? "\-$count\.$id_file_ext" : "\.$id_file_ext";
415        $count++;
416        #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
417        $download_doc_filename =~ s/\.oai$/$new_extension/;
418        my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
419
420        # may have &apos; in url - others??
421        my $safe_doc_id_url = $doc_id_url;
422        $safe_doc_id_url =~ s/&apos;/\'/g;
423
424        my $wget_opts = $self->getWgetOptions();
425        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
426       
427        my ($stdout_and_err,$errors,$follow) =  $self->useWgetMonitored($wget_cmd);
428        return $strRecord if $self->{'forced_quit'};
429
430        if($errors ne "")
431        {
432        print STDERR "Error occured while retriving OAI souce documents (3):\n";
433        print STDERR "$errors\n";
434        #exit(-1);
435        next;
436        }
437
438       
439        $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
440    }
441
442    if (!$had_valid_url)
443    {
444        print  STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
445    }
446    }
447    else
448    {
449    print  STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
450    }
451
452    return $strRecord;
453}
454
455sub getOAIRecords
456{
457    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
458    my $intDocCounter = 0;
459
460    my $metadata_prefix = $self->{'metadata_prefix'};
461
462    foreach my $strID ( @$aryIDs)
463    {
464    print  STDERR "Gathering OAI record with ID $strID.....\n";
465       
466    my $wget_opts = $self->getWgetOptions();
467    my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
468   
469    my $strRecord =  $self->useWget($cmdWget);
470
471    my @fileDirs = split(":",$strID); 
472    my $local_id = pop @fileDirs;
473
474    # setup directories
475
476        $strOutputDir  =~ s/"//g; #"
477
478        my $host =$self->{'url'};
479 
480        $host =~ s@https?:\/\/@@g;
481
482        $host =~ s/:.*//g;
483
484    my $strFileURL = "";
485    if ($strOutputDir ne "") {
486        $strFileURL = "$strOutputDir/";
487    }
488    $strFileURL .= "$host/$local_id.oai";
489
490    # prepare subdirectory for record (if needed)
491    my ($strSubDirPath,$unused) = ("", "");
492
493        ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
494   
495    &FileUtils::makeAllDirectories($strSubDirPath);
496
497    my $ds = &util::get_dirsep();
498   
499    if($blnDownloadDoc)
500    {
501        $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
502    }
503
504    # save record
505    open (OAIOUT,">$strFileURL")
506        || die "Unable to save oai metadata record: $!\n";
507    print OAIOUT $strRecord;
508    close(OAIOUT);
509
510        print STDERR "Saving record to $strFileURL\n";
511        print STDERR "<<Done>>\n";
512    $intDocCounter ++; 
513    last if (defined $intMaxRecords && $intDocCounter >= $intMaxRecords);
514    }
515
516    (defined $intMaxRecords && $intDocCounter >= $intMaxRecords) ?
517    print  STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
518    print  STDERR "Complete download meta record from $strBasURL\n";
519
520       print STDERR "<<Finished>>\n";
521}
522
523sub url_information
524{
525    my ($self) = shift (@_);
526    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
527   
528    my $wgetOptions = $self->getWgetOptions();
529    my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
530 
531    my $strIdentify = "verb=Identify";
532    my $strListSets = "verb=ListSets";
533    my $strListMdFormats = "verb=ListMetadataFormats";
534
535    my $strIdentifyCMD = $strBaseCMD;
536    $strIdentifyCMD =~ s/_OPTS_/$strIdentify/; 
537
538    my $strIdentifyText = $self->useWget($strIdentifyCMD);
539
540     if (!defined $strIdentifyText or $strIdentifyText eq ""  ){
541    print STDERR "Server information is unavailable.\n";
542    print STDERR "<<Finished>>\n";
543        return; 
544    }
545
546    print STDERR "General information:\n";
547    $self->parse_xml($strIdentifyText);
548    print STDERR "\n";
549
550    print STDERR "=" x 10, "\n";
551    print STDERR "Metadata Format Information (metadataPrefix):\n";
552    print STDERR "=" x 10, "\n";
553
554    my $strListMdFormatsCMD = $strBaseCMD;
555    $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;   
556    my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
557
558    $self->parse_xml($strListMdFormatsText);
559    print STDERR "\n";
560
561    print STDERR "=" x 10, "\n";
562    print STDERR "List Information:\n";
563    print STDERR "=" x 10, "\n";
564
565    my $strListSetCMD = $strBaseCMD;
566    $strListSetCMD =~ s/_OPTS_/$strListSets/;   
567    my $strListSetsText = $self->useWget($strListSetCMD);
568
569    $self->parse_xml($strListSetsText);
570}
571
572sub parse_xml
573{   
574    my ($self) = shift (@_);
575    my ($xml_text) = @_;
576   
577    #### change this to work directly from $xml_text
578
579    #Open a temporary file to store OAI information, and store the information to the temp file
580    my $name = &FileUtils::filenameConcatenate($ENV{GSDLHOME},"tmp","oai.tmp");
581
582    open(*OAIOUT,"> $name");
583   
584    print OAIOUT $xml_text;
585    close(OAIOUT);
586
587    $self->{'temp_file_name'} = $name;
588
589##    print STDERR "**** xml text = $xml_text\n";
590
591    eval {
592    $self->{'parser'}->parsefile("$name");
593##  $self->{'parser'}->parse($xml_text);
594    };
595
596    if ($@) {
597    die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
598##  die "OAI: Parsed text is not a well formed XML file ($@)\n";
599    }
600
601    unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
602}
603
604####END
605#{
606#    if($self->{'info'})
607#    {
608#   unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
609#    }
610#}
611
612# This Char function overrides the one in XML::Parser::Stream to overcome a
613# problem where $expat->{Text} is treated as the return value, slowing
614# things down significantly in some cases.
615sub Char {   
616    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
617    $_[0]->{'Text'} .= $_[1];
618
619    my $self = $_[0]->{'PluginObj'};
620    if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
621    $self->{'text'} .= $_[1];
622    $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
623    if($self->{'text'} ne "")
624    {       
625        print STDERR " $self->{'subfield'}:($self->{'text'})\n";
626    }
627    }
628    return undef;
629}
630
631sub OAI_StartTag
632{
633    my ($expat, $element, %attr) = @_;
634
635    my $self = $expat->{'PluginObj'};
636    $self->{'subfield'} = $element;
637   
638}
639
640sub OAI_EndTag
641{
642    my ($expat, $element) = @_;
643
644    my $self = $expat->{'PluginObj'};
645    $self->{'text'} = "";
646    $self->{'subfield'} = "";
647}
648
649sub error
650{
651    my ($self,$strFunctionName,$strError) = @_;
652    {
653    print "Error occoured in OAIDownload.pm\n".
654        "In Function:".$strFunctionName."\n".
655        "Error Message:".$strError."\n";
656    exit(-1);
657    }
658}
659
660
661
6621;
Note: See TracBrowser for help on using the browser.