source: main/trunk/greenstone2/perllib/downloaders/OAIDownload.pm@ 27012

Last change on this file since 27012 was 27012, checked in by ak19, 11 years ago

As John Rose requested: when you build an oai collection with the Create panel's Import Options' OIDType set to assigned and its associated OAImetadata field set to ex.File.FileName and build the collection, a client can then use OAIDownload to Get Documents from this collection over OAI that have similarly readable filenames, with an additional number suffixed to the filenames. John Rose requested that the first file of such a name not have the hyphen 1 (-1) suffixed.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.8 KB
Line 
1###########################################################################
2#
3# OAIDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
42sub BEGIN {
43 @OAIDownload::ISA = ('WgetDownload');
44}
45
46my $arguments =
47 [ { 'name' => "url",
48 'disp' => "{OAIDownload.url_disp}",
49 'desc' => "{OAIDownload.url}",
50 'type' => "string",
51 'reqd' => "yes"},
52 { 'name' => "metadata_prefix",
53 'disp' => "{OAIDownload.metadata_prefix_disp}",
54 'desc' => "{OAIDownload.metadata_prefix}",
55 'type' => "string",
56 'deft' => "oai_dc",
57 'reqd' => "no"},
58 { 'name' => "set",
59 'disp' => "{OAIDownload.set_disp}",
60 'desc' => "{OAIDownload.set}",
61 'type' => "string",
62 'reqd' => "no"},
63 { 'name' => "get_doc",
64 'disp' => "{OAIDownload.get_doc_disp}",
65 'desc' => "{OAIDownload.get_doc}",
66 'type' => "flag",
67 'reqd' => "no"},
68 { 'name' => "get_doc_exts",
69 'disp' => "{OAIDownload.get_doc_exts_disp}",
70 'desc' => "{OAIDownload.get_doc_exts}",
71 'type' => "string",
72 'deft' => "doc,pdf,ppt",
73 'reqd' => "no"},
74 { 'name' => "max_records",
75 'disp' => "{OAIDownload.max_records_disp}",
76 'desc' => "{OAIDownload.max_records}",
77 'type' => "int",
78 'range' => "1,",
79 'reqd' => "no"} ];
80
81my $options = { 'name' => "OAIDownload",
82 'desc' => "{OAIDownload.desc}",
83 'abstract' => "no",
84 'inherits' => "yes",
85 'args' => $arguments };
86
87sub new
88{
89 my ($class) = shift (@_);
90 my ($getlist,$inputargs,$hashArgOptLists) = @_;
91 push(@$getlist, $class);
92
93 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
94 push(@{$hashArgOptLists->{"OptList"}},$options);
95
96 my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);
97
98 if ($self->{'info_only'}) {
99 # don't worry about any options etc
100 return bless $self, $class;
101 }
102
103 my $parser = new XML::Parser('Style' => 'Stream',
104 'PluginObj' => $self,
105 'Handlers' => {'Char' => \&Char,
106 'Start' => \&OAI_StartTag,
107 'End' => \&OAI_EndTag
108 });
109 $self->{'parser'} = $parser;
110
111 # make sure the tmp directory that we will use later exists
112 my $tmp_dir = "$ENV{GSDLHOME}/tmp";
113 if (! -e $tmp_dir) {
114 &util::mk_dir($tmp_dir);
115 }
116
117 # if max_records not specified, parsing will have set it to ""
118 undef $self->{'max_records'} if $self->{'max_records'} eq "";
119
120 # set up hashmap for individual items in get_doc_exts
121 # to make testing for matches easier
122
123 $self->{'lookup_exts'} = {};
124 my $get_doc_exts = $self->{'get_doc_exts'};
125
126 if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
127 my @exts = split(/,\s*/,$get_doc_exts);
128 foreach my $e (@exts) {
129 $self->{'lookup_exts'}->{lc($e)} = 1;
130 }
131 }
132
133
134 return bless $self, $class;
135}
136
137sub download
138{
139 my ($self) = shift (@_);
140 my ($hashGeneralOptions) = @_;
141
142## my $cmdWget = $strWgetOptions;
143
144 my $strOutputDir ="";
145 $strOutputDir = $hashGeneralOptions->{"cache_dir"};
146 my $strBasURL = $self->{'url'};
147 my $blnDownloadDoc = $self->{'get_doc'};
148
149 print STDERR "<<Defined Maximum>>\n";
150
151 my $strIDs = $self->getOAIIDs($strBasURL);
152
153 if($strIDs eq "")
154 {
155 print STDERR "Error: No IDs found\n";
156 return 0;
157 }
158
159 my $aryIDs = $self->parseOAIIDs($strIDs);
160 my $intIDs = 0;
161 if(defined $self->{'max_records'} && $self->{'max_records'} < scalar(@$aryIDs))
162 {
163 $intIDs = $self->{'max_records'};
164 }
165 else
166 {
167 $intIDs = scalar(@$aryIDs);
168 }
169 print STDERR "<<Total number of record(s):$intIDs>>\n";
170
171 $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $self->{'max_records'}, $blnDownloadDoc);
172
173# my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
174# &util::rm($tmp_file);
175
176 return 1;
177}
178
179sub getOAIIDs
180{
181 my ($self,$strBasURL) = @_;
182## my ($cmdWget);
183
184 my $wgetOptions = $self->getWgetOptions();
185
186 my $cmdWget = $wgetOptions;
187
188 print STDERR "Gathering OAI identifiers.....\n";
189
190 my $metadata_prefix = $self->{'metadata_prefix'};
191 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";
192
193 # if $set specified, add it in to URL
194 my $set = $self->{'set'};
195 $cmdWget .= "&set=$set" if ($set ne "");
196
197 $cmdWget .= "\" ";
198
199 my $accumulated_strIDs = "";
200 my $strIDs = $self->useWget($cmdWget);
201
202 if (!defined $strIDs or $strIDs eq "" ){
203 print STDERR "Server information is unavailable.\n";
204 print STDERR "<<Finished>>\n";
205 return;
206 }
207 if ($self->{'forced_quit'}) {
208 return $strIDs;
209 }
210
211 print STDERR "<<Download Information>>\n";
212
213 $self->parse_xml($strIDs);
214
215 $accumulated_strIDs = $strIDs;
216 my $max_recs = $self->{'max_records'};
217 while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/s) {
218 # top up list with further requests for IDs
219
220 my $resumption_token = $1;
221
222 $cmdWget = $wgetOptions;
223
224 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$resumption_token\"";
225
226 $strIDs = $self->useWget($cmdWget);
227 if ($self->{'forced_quit'}) {
228 return $accumulated_strIDs;
229 }
230
231 $self->parse_xml($strIDs);
232
233 $accumulated_strIDs .= $strIDs;
234
235 my @accumulated_identifiers
236 = ($accumulated_strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
237
238 my $num_acc_identifiers = scalar(@accumulated_identifiers);
239 if (defined $max_recs && $num_acc_identifiers > $max_recs ) {
240 last;
241 }
242 }
243
244 return $accumulated_strIDs;
245}
246
247sub parseOAIIDs
248{
249 my ($self,$strIDs) = @_;
250
251 print STDERR "Parsing OAI identifiers.....\n";
252 $strIDs =~ s/^.*?<identifier>/<identifier>/s;
253 $strIDs =~ s/^(.*<\/identifier>).*$/$1/s;
254
255 my @aryIDs = ();
256
257 while ($strIDs =~ m/<identifier>(.*?)<\/identifier>(.*)$/s)
258 {
259 $strIDs = $2;
260 push(@aryIDs,$1);
261 }
262
263 return \@aryIDs;
264}
265
266sub dirFileSplit
267{
268 my ($self,$strFile) = @_;
269
270 my @aryDirs = split("[/\]",$strFile);
271
272 my $strLocalFile = pop(@aryDirs);
273 my $strSubDirs = join("/",@aryDirs);
274
275 return ($strSubDirs,$strLocalFile);
276}
277
278sub getOAIDoc
279{
280 my ($self,$strRecord, $oai_rec_filename) = @_;
281
282 print STDERR "Gathering source documents.....\n";
283 # look out for identifier tag in metadata section
284
285 if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
286 {
287 my $strMetaTag = $1;
288 my $had_valid_url = 0;
289 my $count = 1;
290 while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
291 {
292 my $doc_id_url = $2;
293 print STDERR "Found doc url: $doc_id_url\n";
294 next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
295
296 my $orig_doc_id_url = $doc_id_url;
297 $had_valid_url = 1;
298
299 my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
300 my $faked_ext = 0;
301 my $primary_doc_match = 0;
302
303 my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
304
305 if (defined $id_file_ext) {
306 # cross-check this filename extension with get_doc_exts option
307 # if provided
308 my $lookup_exts = $self->{'lookup_exts'};
309
310 if (defined $lookup_exts->{lc($id_file_ext)}) {
311 # this initial URL matches requirement
312 $primary_doc_match = 1;
313 }
314 }
315 else {
316 $faked_ext = 1;
317 $id_file_ext = "html";
318 }
319
320
321 if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
322 # Download this doc if HTML, scan through it looking for a link
323 # that does match get_doc_exts
324
325
326 # 1. Generate a tmp name
327 my $tmp_filename = &util::get_tmp_filename();
328
329 # 2. Download it
330 my $wget_opts2 = $self->getWgetOptions();
331 my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
332 my ($stdout_and_err2,$error2,$follow2) = $self->useWgetMonitored($wget_cmd2);
333 return $strRecord if $self->{'forced_quit'};
334
335 if($error2 ne "")
336 {
337 print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
338 #exit(-1);
339 next;
340 }
341
342 if (defined $follow2) {
343 # src url was "redirected" to another place
344 # => pick up on this and make it the new doc_id_url
345 $doc_id_url = $follow2;
346 }
347
348 my $primary_doc_html = "";
349 if (open(HIN,"<$tmp_filename")) {
350 my $line;
351 while (defined ($line = <HIN>)) {
352 $primary_doc_html .= $line;
353 }
354 close(HIN);
355
356 # 3. Scan through it looking for match
357 #
358 # if got match, change $doc_id_url to this new URL and
359 # $id_file_ext to 'match'
360
361 my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
362
363 my $lookup_exts = $self->{'lookup_exts'};
364
365 foreach my $href (@href_links) {
366 my ($ext) = ($href =~ m/\.([^\.]+)$/);
367
368 if ((defined $ext) && (defined $lookup_exts->{$ext})) {
369
370 if ($href !~ m/^(https?|ftp):\/\//) {
371 # link is within current site
372 my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
373
374 $href = "$site_domain$href";
375 }
376
377 $doc_id_url = $href;
378 $id_file_ext = $ext;
379 last;
380 }
381 }
382 }
383 else {
384 print STDERR "Error occurred while retrieving OAI source documents (2):\n";
385 print STDERR "$!\n";
386 }
387
388 if (-e $tmp_filename) {
389 &util::rm($tmp_filename);
390 }
391 }
392
393 my $download_doc_filename = $oai_rec_filename;
394 # As requested by John Rose, don't suffix -$count for the very first document (so no "-1" suffix):
395 my $new_extension = ($count > 1) ? "\-$count\.$id_file_ext" : "\.$id_file_ext";
396 $count++;
397 #$download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
398 $download_doc_filename =~ s/\.oai$/$new_extension/;
399 my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
400
401 # may have &apos; in url - others??
402 my $safe_doc_id_url = $doc_id_url;
403 $safe_doc_id_url =~ s/&apos;/\'/g;
404
405 my $wget_opts = $self->getWgetOptions();
406 my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
407
408 my ($stdout_and_err,$errors,$follow) = $self->useWgetMonitored($wget_cmd);
409 return $strRecord if $self->{'forced_quit'};
410
411 if($errors ne "")
412 {
413 print STDERR "Error occured while retriving OAI souce documents (3):\n";
414 print STDERR "$errors\n";
415 #exit(-1);
416 next;
417 }
418
419
420 $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
421 }
422
423 if (!$had_valid_url)
424 {
425 print STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
426 }
427 }
428 else
429 {
430 print STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
431 }
432
433 return $strRecord;
434}
435
436sub getOAIRecords
437{
438 my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;
439 my $intDocCounter = 0;
440
441 my $metadata_prefix = $self->{'metadata_prefix'};
442
443 foreach my $strID ( @$aryIDs)
444 {
445 print STDERR "Gathering OAI record with ID $strID.....\n";
446
447 my $wget_opts = $self->getWgetOptions();
448 my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
449
450 my $strRecord = $self->useWget($cmdWget);
451
452 my @fileDirs = split(":",$strID);
453 my $local_id = pop @fileDirs;
454
455 # setup directories
456
457 $strOutputDir =~ s/"//g; #"
458
459 my $host =$self->{'url'};
460
461 $host =~ s@https?:\/\/@@g;
462
463 $host =~ s/:.*//g;
464
465 my $strFileURL = "";
466 if ($strOutputDir ne "") {
467 $strFileURL = "$strOutputDir/";
468 }
469 $strFileURL .= "$host/$local_id.oai";
470
471 # prepare subdirectory for record (if needed)
472 my ($strSubDirPath,$unused) = ("", "");
473
474 ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
475
476 &util::mk_all_dir($strSubDirPath);
477
478 my $ds = &util::get_dirsep();
479
480 if($blnDownloadDoc)
481 {
482 $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
483 }
484
485 # save record
486 open (OAIOUT,">$strFileURL")
487 || die "Unable to save oai metadata record: $!\n";
488 print OAIOUT $strRecord;
489 close(OAIOUT);
490
491 print STDERR "Saving record to $strFileURL\n";
492 print STDERR "<<Done>>\n";
493 $intDocCounter ++;
494 last if (defined $intMaxRecords && $intDocCounter >= $intMaxRecords);
495 }
496
497 (defined $intMaxRecords && $intDocCounter >= $intMaxRecords) ?
498 print STDERR "Reached maximum download records, use -max_records to set the maximum.\n":
499 print STDERR "Complete download meta record from $strBasURL\n";
500
501 print STDERR "<<Finished>>\n";
502}
503
504sub url_information
505{
506 my ($self) = shift (@_);
507 if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}
508
509 my $wgetOptions = $self->getWgetOptions();
510 my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\"";
511
512 my $strIdentify = "verb=Identify";
513 my $strListSets = "verb=ListSets";
514 my $strListMdFormats = "verb=ListMetadataFormats";
515
516 my $strIdentifyCMD = $strBaseCMD;
517 $strIdentifyCMD =~ s/_OPTS_/$strIdentify/;
518
519 my $strIdentifyText = $self->useWget($strIdentifyCMD);
520
521 if (!defined $strIdentifyText or $strIdentifyText eq "" ){
522 print STDERR "Server information is unavailable.\n";
523 print STDERR "<<Finished>>\n";
524 return;
525 }
526
527 print STDERR "General information:\n";
528 $self->parse_xml($strIdentifyText);
529 print STDERR "\n";
530
531 print STDERR "=" x 10, "\n";
532 print STDERR "Metadata Format Information (metadataPrefix):\n";
533 print STDERR "=" x 10, "\n";
534
535 my $strListMdFormatsCMD = $strBaseCMD;
536 $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;
537 my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
538
539 $self->parse_xml($strListMdFormatsText);
540 print STDERR "\n";
541
542 print STDERR "=" x 10, "\n";
543 print STDERR "List Information:\n";
544 print STDERR "=" x 10, "\n";
545
546 my $strListSetCMD = $strBaseCMD;
547 $strListSetCMD =~ s/_OPTS_/$strListSets/;
548 my $strListSetsText = $self->useWget($strListSetCMD);
549
550 $self->parse_xml($strListSetsText);
551}
552
553sub parse_xml
554{
555 my ($self) = shift (@_);
556 my ($xml_text) = @_;
557
558 #### change this to work directly from $xml_text
559
560 #Open a temporary file to store OAI information, and store the information to the temp file
561 my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
562
563 open(*OAIOUT,"> $name");
564
565 print OAIOUT $xml_text;
566 close(OAIOUT);
567
568 $self->{'temp_file_name'} = $name;
569
570## print STDERR "**** xml text = $xml_text\n";
571
572 eval {
573 $self->{'parser'}->parsefile("$name");
574## $self->{'parser'}->parse($xml_text);
575 };
576
577 if ($@) {
578 die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
579## die "OAI: Parsed text is not a well formed XML file ($@)\n";
580 }
581
582 unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
583}
584
585####END
586#{
587# if($self->{'info'})
588# {
589# unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
590# }
591#}
592
593# This Char function overrides the one in XML::Parser::Stream to overcome a
594# problem where $expat->{Text} is treated as the return value, slowing
595# things down significantly in some cases.
596sub Char {
597 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
598 $_[0]->{'Text'} .= $_[1];
599
600 my $self = $_[0]->{'PluginObj'};
601 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
602 $self->{'text'} .= $_[1];
603 $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
604 if($self->{'text'} ne "")
605 {
606 print STDERR " $self->{'subfield'}:($self->{'text'})\n";
607 }
608 }
609 return undef;
610}
611
612sub OAI_StartTag
613{
614 my ($expat, $element, %attr) = @_;
615
616 my $self = $expat->{'PluginObj'};
617 $self->{'subfield'} = $element;
618
619}
620
621sub OAI_EndTag
622{
623 my ($expat, $element) = @_;
624
625 my $self = $expat->{'PluginObj'};
626 $self->{'text'} = "";
627 $self->{'subfield'} = "";
628}
629
630sub error
631{
632 my ($self,$strFunctionName,$strError) = @_;
633 {
634 print "Error occoured in OAIDownload.pm\n".
635 "In Function:".$strFunctionName."\n".
636 "Error Message:".$strError."\n";
637 exit(-1);
638 }
639}
640
641
642
6431;
Note: See TracBrowser for help on using the repository browser.