source: main/trunk/greenstone2/perllib/downloaders/OAIDownload.pm@ 28250

Last change on this file since 28250 was 28250, checked in by ak19, 11 years ago
  1. Jenny and I fixed an oversight in OAIDownload, thanks to Kathy's suggestion, where an html page's contents used to be inspected for file types specified to be downloaded with GetDocument ONLY if the file extension was htm(l). Now the test is whether the header specifies Content-Type text/html. 2. Deprecated utils functions replaced with their FileUtils equivalents.
  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
1###########################################################################
2#
3# OAIDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
42sub BEGIN {
43 @OAIDownload::ISA = ('WgetDownload');
44}
45
46my $arguments =
47 [ { 'name' => "url",
48 'disp' => "{OAIDownload.url_disp}",
49 'desc' => "{OAIDownload.url}",
50 'type' => "string",
51 'reqd' => "yes"},
52 { 'name' => "metadata_prefix",
53 'disp' => "{OAIDownload.metadata_prefix_disp}",
54 'desc' => "{OAIDownload.metadata_prefix}",
55 'type' => "string",
56 'deft' => "oai_dc",
57 'reqd' => "no"},
58 { 'name' => "set",
59 'disp' => "{OAIDownload.set_disp}",
60 'desc' => "{OAIDownload.set}",
61 'type' => "string",
62 'reqd' => "no"},
63 { 'name' => "get_doc",
64 'disp' => "{OAIDownload.get_doc_disp}",
65 'desc' => "{OAIDownload.get_doc}",
66 'type' => "flag",
67 'reqd' => "no"},
68 { 'name' => "get_doc_exts",
69 'disp' => "{OAIDownload.get_doc_exts_disp}",
70 'desc' => "{OAIDownload.get_doc_exts}",
71 'type' => "string",
72 'deft' => "doc,pdf,ppt",
73 'reqd' => "no"},
74 { 'name' => "max_records",
75 'disp' => "{OAIDownload.max_records_disp}",
76 'desc' => "{OAIDownload.max_records}",
77 'type' => "int",
78 'range' => "1,",
79 'reqd' => "no"} ];
80
81my $options = { 'name' => "OAIDownload",
82 'desc' => "{OAIDownload.desc}",
83 'abstract' => "no",
84 'inherits' => "yes",
85 'args' => $arguments };
86
87sub new
88{
89 my ($class) = shift (@_);
90 my ($getlist,$inputargs,$hashArgOptLists) = @_;
91 push(@$getlist, $class);
92
93 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
94 push(@{$hashArgOptLists->{"OptList"}},$options);
95
96 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
97
98 if ($self->{'info_only'}) {
99 # don't worry about any options etc
100 return bless $self, $class;
101 }
102
103 my $parser = new XML::Parser('Style' => 'Stream',
104 'PluginObj' => $self,
105 'Handlers' => {'Char' => \&Char,
106 'Start' => \&OAI_StartTag,
107 'End' => \&OAI_EndTag
108 });
109 $self->{'parser'} = $parser;
110
111 # make sure the tmp directory that we will use later exists
112 my $tmp_dir = "$ENV{GSDLHOME}/tmp";
113 if (! -e $tmp_dir) {
114 &FileUtils::makeDirectory($tmp_dir);
115 }
116
117 # if max_records not specified, parsing will have set it to ""
118 undef $self->{'max_records'} if $self->{'max_records'} eq "";
119
120 # set up hashmap for individual items in get_doc_exts
121 # to make testing for matches easier
122
123 $self->{'lookup_exts'} = {};
124 my $get_doc_exts = $self->{'get_doc_exts'};
125
126 if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
127 my @exts = split(/,\s*/,$get_doc_exts);
128 foreach my $e (@exts) {
129 $self->{'lookup_exts'}->{lc($e)} = 1;
130 }
131 }
132
133
134 return bless $self, $class;
135}
136
137sub download
138{
139 my ($self) = shift (@_);
140 my ($hashGeneralOptions) = @_;
141
142## my $cmdWget = $strWgetOptions;
143
144 my $strOutputDir ="";
145 $strOutputDir = $hashGeneralOptions->{"cache_dir"};
146 my $strBasURL = $self->{'url'};
147 my $blnDownloadDoc = $self->{'get_doc'};
148
149 print STDERR "<<Defined Maximum>>\n";
150
151 my $strIDs = $self->getOAIIDs($strBasURL);
152
153 if($strIDs eq "")
154 {
155 print STDERR "Error: No IDs found\n";
156 return 0;
157 }
158
159 my $aryIDs = $self->parseOAIIDs($strIDs);
160 my $intIDs = 0;
161 if(defined $self->{'max_records'} && $self->{'max_records'} < scalar(@$aryIDs))
162 {
163 $intIDs = $self->{'max_records'};
164 }
165 else
166 {
167 $intIDs = scalar(@$aryIDs);
168 }
169 print STDERR "<<Total number of record(s):$intIDs>>\n";
170
171 $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $self->{'max_records'}, $blnDownloadDoc);
172
173# my $tmp_file = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"tmp","oai.tmp");
174# &FileUtils::removeFiles($tmp_file);
175
176 return 1;
177}
178
179sub getOAIIDs
180{
181 my ($self,$strBasURL) = @_;
182## my ($cmdWget);
183
184 my $wgetOptions = $self->getWgetOptions();
185
186 my $cmdWget = $wgetOptions;
187
188 print STDERR "Gathering OAI identifiers.....\n";
189
190 my $metadata_prefix = $self->{'metadata_prefix'};
191 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
192
193 # if $set specified, add it in to URL
194 my $set = $self->{'set'};
195 $cmdWget .= "&set=$set" if ($set ne "");
196
197 $cmdWget .= "\" ";
198
199 my $accumulated_strIDs = "";
200 my $strIDs = $self->useWget($cmdWget);
201
202 if (!defined $strIDs or $strIDs eq "" ){
203 print STDERR "Server information is unavailable.\n";
204 print STDERR "<<Finished>>\n";
205 return;
206 }
207 if ($self->{'forced_quit'}) {
208 return $strIDs;
209 }
210
211 print STDERR "<<Download Information>>\n";
212
213 $self->parse_xml($strIDs);
214
215 $accumulated_strIDs = $strIDs;
216 my $max_recs = $self->{'max_records'};
217 while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/s) {
218 # top up list with further requests for IDs
219
220 my $resumption_token = $1;
221
222 $cmdWget = $wgetOptions;
223
224 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
225
226 $strIDs = $self->useWget($cmdWget);
227 if ($self->{'forced_quit'}) {
228 return $accumulated_strIDs;
229 }
230
231 $self->parse_xml($strIDs);
232
233 $accumulated_strIDs .= $strIDs;
234
235 my @accumulated_identifiers
236 = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
237
238 my $num_acc_identifiers = scalar(@accumulated_identifiers);
239 if (defined $max_recs && $num_acc_identifiers > $max_recs ) {
240 last;
241 }
242 }
243
244 return $accumulated_strIDs;
245}
246
247sub parseOAIIDs
248{
249 my ($self,$strIDs) = @_;
250
251 print STDERR "Parsing OAI identifiers.....\n";
252 $strIDs =~ s/^.*?<identifier>/<identifier>/s;
253 $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
254
255 my @aryIDs = ();
256
257 while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
258 {
259 $strIDs = $2;
260 push(@aryIDs,$1);
261 }
262
263 return \@aryIDs;
264}
265
266sub dirFileSplit
267{
268 my ($self,$strFile) = @_;
269
270 my @aryDirs = split("[/\]",$strFile);
271
272 my $strLocalFile = pop(@aryDirs);
273 my $strSubDirs = join("/",@aryDirs);
274
275 return ($strSubDirs,$strLocalFile);
276}
277
278sub getOAIDoc
279{
280 my ($self,$strRecord, $oai_rec_filename) = @_;
281
282 print STDERR "Gathering source documents.....\n";
283 # look out for identifier tag in metadata section
284
285 if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
286 {
287 my $strMetaTag = $1;
288 my $had_valid_url = 0;
289 my $count = 1;
290 while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
291 {
292 my $doc_id_url = $2;
293 print STDERR "Found doc url: $doc_id_url\n";
294 next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
295
296 my $orig_doc_id_url = $doc_id_url;
297 $had_valid_url = 1;
298
299 my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
300 my $faked_ext = 0;
301 my $primary_doc_match = 0;
302
303 my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
304
305 if (defined $id_file_ext) {
306 # cross-check this filename extension with get_doc_exts option
307 # if provided
308 my $lookup_exts = $self->{'lookup_exts'};
309
310 if (defined $lookup_exts->{lc($id_file_ext)}) {
311 # this initial URL matches requirement
312 $primary_doc_match = 1;
313 }
314 }
315 else {
316 $faked_ext = 1;
317 $id_file_ext = "html";
318 }
319
320
321 my $is_page_html = 0;
322 if($id_file_ext =~ m/^html?$/i) {
323 $is_page_html = 1;
324 } elsif ($doc_id_url) { # important: if no doc id has currently been typed into the url field, skip this block
325
326 # get the page and check the header's content-type and see if this is text/html
327 # if so, $is_page_html is true
328 # See http://superuser.com/questions/197009/wget-head-request
329
330 my $wget_opts3 = $self->getWgetOptions();
331 my $wget_header_cmd = "$wget_opts3 -S --spider \"$doc_id_url\"";
332 my $page_content = $self->useWget($wget_header_cmd);
333
334 if($page_content && $page_content =~ m@Content-Type:\s*text/html@i) {
335 $is_page_html = 1;
336 }
337 }
338
339 if (!$primary_doc_match && $is_page_html) {
340
341 # Download this doc if HTML, scan through it looking for a link
342 # that does match get_doc_exts
343
344
345 # 1. Generate a tmp name
346 my $tmp_filename = &util::get_tmp_filename();
347
348 # 2. Download it
349 my $wget_opts2 = $self->getWgetOptions();
350 my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
351 my ($stdout_and_err2,$error2,$follow2) = $self->useWgetMonitored($wget_cmd2);
352 return $strRecord if $self->{'forced_quit'};
353
354 if($error2 ne "")
355 {
356 print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
357 #exit(-1);
358 next;
359 }
360
361 if (defined $follow2) {
362 # src url was "redirected" to another place
363 # => pick up on this and make it the new doc_id_url
364 $doc_id_url = $follow2;
365 }
366
367 my $primary_doc_html = "";
368 if (open(HIN,"<$tmp_filename")) {
369 my $line;
370 while (defined ($line = <HIN>)) {
371 $primary_doc_html .= $line;
372 }
373 close(HIN);
374
375 # 3. Scan through it looking for match
376 #
377 # if got match, change $doc_id_url to this new URL and
378 # $id_file_ext to 'match'
379
380 my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
381
382 my $lookup_exts = $self->{'lookup_exts'};
383
384 foreach my $href (@href_links) {
385 my ($ext) = ($href =~ m/\.([^\.]+)$/);
386
387 if ((defined $ext) && (defined $lookup_exts->{$ext})) {
388
389 if ($href !~ m/^(https?|ftp):\/\//) {
390 # link is within current site
391 my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
392
393 $href = "$site_domain$href";
394 }
395
396 $doc_id_url = $href;
397 $id_file_ext = $ext;
398 last;
399 }
400 }
401 }
402 else {
403 print STDERR "Error occurred while retrieving OAI source documents (2):\n";
404 print STDERR "$!\n";
405 }
406
407 if (-e $tmp_filename) {
408 &FileUtils::removeFiles($tmp_filename);
409 }
410 }
411
412 my $download_doc_filename = $oai_rec_filename;
413 # As requested by John Rose, don't suffix -$count for the very first document (so no "-1" suffix):
414 my $new_extension = ($count > 1) ? "\-$count\.$id_file_ext" : "\.$id_file_ext";
415 $count++;
416 #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
417 $download_doc_filename =~ s/\.oai$/$new_extension/;
418 my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
419
420 # may have &apos; in url - others??
421 my $safe_doc_id_url = $doc_id_url;
422 $safe_doc_id_url =~ s/&apos;/\'/g;
423
424 my $wget_opts = $self->getWgetOptions();
425 my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
426
427 my ($stdout_and_err,$errors,$follow) = $self->useWgetMonitored($wget_cmd);
428 return $strRecord if $self->{'forced_quit'};
429
430 if($errors ne "")
431 {
432 print STDERR "Error occured while retriving OAI souce documents (3):\n";
433 print STDERR "$errors\n";
434 #exit(-1);
435 next;
436 }
437
438
439 $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
440 }
441
442 if (!$had_valid_url)
443 {
444 print STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
445 }
446 }
447 else
448 {
449 print STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
450 }
451
452 return $strRecord;
453}
454
455sub getOAIRecords
456{
457 my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
458 my $intDocCounter = 0;
459
460 my $metadata_prefix = $self->{'metadata_prefix'};
461
462 foreach my $strID ( @$aryIDs)
463 {
464 print STDERR "Gathering OAI record with ID $strID.....\n";
465
466 my $wget_opts = $self->getWgetOptions();
467 my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
468
469 my $strRecord = $self->useWget($cmdWget);
470
471 my @fileDirs = split(":",$strID);
472 my $local_id = pop @fileDirs;
473
474 # setup directories
475
476 $strOutputDir =~ s/"//g; #"
477
478 my $host =$self->{'url'};
479
480 $host =~ s@https?:\/\/@@g;
481
482 $host =~ s/:.*//g;
483
484 my $strFileURL = "";
485 if ($strOutputDir ne "") {
486 $strFileURL = "$strOutputDir/";
487 }
488 $strFileURL .= "$host/$local_id.oai";
489
490 # prepare subdirectory for record (if needed)
491 my ($strSubDirPath,$unused) = ("", "");
492
493 ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
494
495 &FileUtils::makeAllDirectories($strSubDirPath);
496
497 my $ds = &util::get_dirsep();
498
499 if($blnDownloadDoc)
500 {
501 $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
502 }
503
504 # save record
505 open (OAIOUT,">$strFileURL")
506 || die "Unable to save oai metadata record: $!\n";
507 print OAIOUT $strRecord;
508 close(OAIOUT);
509
510 print STDERR "Saving record to $strFileURL\n";
511 print STDERR "<<Done>>\n";
512 $intDocCounter ++;
513 last if (defined $intMaxRecords && $intDocCounter >= $intMaxRecords);
514 }
515
516 (defined $intMaxRecords && $intDocCounter >= $intMaxRecords) ?
517 print STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
518 print STDERR "Complete download meta record from $strBasURL\n";
519
520 print STDERR "<<Finished>>\n";
521}
522
523sub url_information
524{
525 my ($self) = shift (@_);
526 if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
527
528 my $wgetOptions = $self->getWgetOptions();
529 my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
530
531 my $strIdentify = "verb=Identify";
532 my $strListSets = "verb=ListSets";
533 my $strListMdFormats = "verb=ListMetadataFormats";
534
535 my $strIdentifyCMD = $strBaseCMD;
536 $strIdentifyCMD =~ s/_OPTS_/$strIdentify/;
537
538 my $strIdentifyText = $self->useWget($strIdentifyCMD);
539
540 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
541 print STDERR "Server information is unavailable.\n";
542 print STDERR "<<Finished>>\n";
543 return;
544 }
545
546 print STDERR "General information:\n";
547 $self->parse_xml($strIdentifyText);
548 print STDERR "\n";
549
550 print STDERR "=" x 10, "\n";
551 print STDERR "Metadata Format Information (metadataPrefix):\n";
552 print STDERR "=" x 10, "\n";
553
554 my $strListMdFormatsCMD = $strBaseCMD;
555 $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;
556 my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
557
558 $self->parse_xml($strListMdFormatsText);
559 print STDERR "\n";
560
561 print STDERR "=" x 10, "\n";
562 print STDERR "List Information:\n";
563 print STDERR "=" x 10, "\n";
564
565 my $strListSetCMD = $strBaseCMD;
566 $strListSetCMD =~ s/_OPTS_/$strListSets/;
567 my $strListSetsText = $self->useWget($strListSetCMD);
568
569 $self->parse_xml($strListSetsText);
570}
571
572sub parse_xml
573{
574 my ($self) = shift (@_);
575 my ($xml_text) = @_;
576
577 #### change this to work directly from $xml_text
578
579 #Open a temporary file to store OAI information, and store the information to the temp file
580 my $name = &FileUtils::filenameConcatenate($ENV{GSDLHOME},"tmp","oai.tmp");
581
582 open(*OAIOUT,"> $name");
583
584 print OAIOUT $xml_text;
585 close(OAIOUT);
586
587 $self->{'temp_file_name'} = $name;
588
589## print STDERR "**** xml text = $xml_text\n";
590
591 eval {
592 $self->{'parser'}->parsefile("$name");
593## $self->{'parser'}->parse($xml_text);
594 };
595
596 if ($@) {
597 die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
598## die "OAI: Parsed text is not a well formed XML file ($@)\n";
599 }
600
601 unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
602}
603
604####END
605#{
606# if($self->{'info'})
607# {
608# unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
609# }
610#}
611
612# This Char function overrides the one in XML::Parser::Stream to overcome a
613# problem where $expat->{Text} is treated as the return value, slowing
614# things down significantly in some cases.
615sub Char {
616 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
617 $_[0]->{'Text'} .= $_[1];
618
619 my $self = $_[0]->{'PluginObj'};
620 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
621 $self->{'text'} .= $_[1];
622 $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
623 if($self->{'text'} ne "")
624 {
625 print STDERR " $self->{'subfield'}:($self->{'text'})\n";
626 }
627 }
628 return undef;
629}
630
631sub OAI_StartTag
632{
633 my ($expat, $element, %attr) = @_;
634
635 my $self = $expat->{'PluginObj'};
636 $self->{'subfield'} = $element;
637
638}
639
640sub OAI_EndTag
641{
642 my ($expat, $element) = @_;
643
644 my $self = $expat->{'PluginObj'};
645 $self->{'text'} = "";
646 $self->{'subfield'} = "";
647}
648
649sub error
650{
651 my ($self,$strFunctionName,$strError) = @_;
652 {
653 print "Error occoured in OAIDownload.pm\n".
654 "In Function:".$strFunctionName."\n".
655 "Error Message:".$strError."\n";
656 exit(-1);
657 }
658}
659
660
661
6621;
Note: See TracBrowser for help on using the repository browser.