source: gsdl/trunk/perllib/downloaders/OAIDownload.pm@ 17668

Last change on this file since 17668 was 17668, checked in by kjdon, 15 years ago

Added a counter to the filenames of downloaded documents: sometimes there may be two with the same file extension, so we need to distinguish between them.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
RevLine 
[11783]1###########################################################################
2#
3# OAIDownload.pm -- downloader that harvests records from an OAI repository
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
[12465]40use util;
[11783]41
# Set up inheritance at compile time: OAIDownload derives from WgetDownload.
BEGIN {
    @OAIDownload::ISA = qw(WgetDownload);
}
45
# Descriptors for the command-line arguments this downloader accepts.
# Each entry is one argument; new() appends the whole list to the caller's
# "ArgList".  The "{OAIDownload.*}" values are passed through verbatim
# (presumably keys that are resolved to display text elsewhere -- confirm).
my $arguments = [
    {   'name' => 'url',
        'disp' => '{OAIDownload.url_disp}',
        'desc' => '{OAIDownload.url}',
        'type' => 'string',
        'reqd' => 'yes',
    },
    {   'name' => 'metadata_prefix',
        'disp' => '{OAIDownload.metadata_prefix_disp}',
        'desc' => '{OAIDownload.metadata_prefix}',
        'type' => 'string',
        'deft' => 'oai_dc',
        'reqd' => 'no',
    },
    {   'name' => 'set',
        'disp' => '{OAIDownload.set_disp}',
        'desc' => '{OAIDownload.set}',
        'type' => 'string',
        'reqd' => 'no',
    },
    {   'name' => 'get_doc',
        'disp' => '{OAIDownload.get_doc_disp}',
        'desc' => '{OAIDownload.get_doc}',
        'type' => 'flag',
        'reqd' => 'no',
    },
    {   'name' => 'get_doc_exts',
        'disp' => '{OAIDownload.get_doc_exts_disp}',
        'desc' => '{OAIDownload.get_doc_exts}',
        'type' => 'string',
        'deft' => 'doc,pdf,ppt',
        'reqd' => 'no',
    },
    {   'name'  => 'max_records',
        'disp'  => '{OAIDownload.max_records_disp}',
        'desc'  => '{OAIDownload.max_records}',
        'type'  => 'int',
        'deft'  => '500',
        'range' => '1,',
        'reqd'  => 'no',
    },
];

# Option block describing this downloader as a whole; new() appends it to the
# caller's "OptList".  'args' points at the argument list defined above.
my $options = {
    'name'     => 'OAIDownload',
    'desc'     => '{OAIDownload.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
87
[16791]88##my $self;
[11783]89
[16791]90#### my $strWgetOptions="";
[12465]91
# Constructor.
#
# Arguments (after the class name):
#   $getlist         - array ref; this class's name is appended to it
#   $inputargs       - handed unchanged to the WgetDownload constructor
#   $hashArgOptLists - hash ref; its "ArgList" and "OptList" arrays receive
#                      this class's argument and option descriptors
#
# Returns the object produced by the WgetDownload constructor, blessed into
# $class.  When the object is flagged 'info_only', no parser, tmp directory
# or extension table is set up.
sub new
{
    my ($class) = shift(@_);
    my ($getlist, $inputargs, $hashArgOptLists) = @_;
    push(@$getlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = WgetDownload->new($getlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    # The parser calls back into this module; the handlers find this object
    # again through the 'PluginObj' entry of the parser object.
    my $parser = XML::Parser->new('Style'     => 'Stream',
                                  'PluginObj' => $self,
                                  'Handlers'  => {'Char'  => \&Char,
                                                  'Start' => \&OAI_StartTag,
                                                  'End'   => \&OAI_EndTag
                                                 });
    $self->{'parser'} = $parser;

    # make sure the tmp directory that we will use later exists
    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
    if (! -e $tmp_dir) {
        util::mk_dir($tmp_dir);
    }

    # Set up a lookup table for the individual items in get_doc_exts to make
    # testing for matches easier.  Each entry is normalised (surrounding
    # whitespace and a leading dot removed, lower-cased) so that values such
    # as "doc, PDF ,.ppt" match the bare extensions taken from URLs.
    $self->{'lookup_exts'} = {};
    my $get_doc_exts = $self->{'get_doc_exts'};

    if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
        foreach my $ext (split(/,/, $get_doc_exts)) {
            $ext =~ s/^\s+//;
            $ext =~ s/\s+$//;
            $ext =~ s/^\.//;
            next if ($ext eq "");
            $self->{'lookup_exts'}->{lc($ext)} = 1;
        }
    }

    return bless $self, $class;
}
139
# Entry point for a download run.
#
# Arguments:
#   $hashGeneralOptions - hash ref; its "cache_dir" entry is the directory
#                         the records are saved under
#
# Gathers the identifiers from the repository at $self->{'url'}, then fetches
# the records themselves (and, when 'get_doc' is set, their source documents).
# Returns 0 when no identifier list could be obtained, 1 otherwise.
sub download
{
    my ($self) = shift(@_);
    my ($hashGeneralOptions) = @_;

    my $strOutputDir   = $hashGeneralOptions->{"cache_dir"};
    my $strBasURL      = $self->{'url'};
    my $intMaxRecords  = $self->{'max_records'};
    my $blnDownloadDoc = $self->{'get_doc'};

    print STDERR "<<Defined Maximum>>\n";

    my $strIDs = $self->getOAIIDs($strBasURL);

    # getOAIIDs reports failure with a bare return, i.e. undef here, so test
    # for that explicitly instead of relying on undef comparing equal to "".
    if (!defined $strIDs || $strIDs eq "")
    {
        print STDERR "Error: No IDs found\n";
        return 0;
    }

    my $aryIDs = $self->parseOAIIDs($strIDs);

    # Number of records that will be fetched: everything found, capped at
    # max_records.
    my $intIDs = scalar(@$aryIDs);
    if ($intMaxRecords < $intIDs)
    {
        $intIDs = $intMaxRecords;
    }
    print STDERR "<<Total number of record(s):$intIDs>>\n";

    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);

    return 1;
}
182
# Prepare a resumption token for use in a request URL.
# The token is taken from XML text, so the predefined XML entities are
# decoded first; every character outside the URL "unreserved" set is then
# percent-encoded.
sub _oai_url_encode
{
    my ($text) = @_;

    # &amp; is decoded last so that "&amp;lt;" becomes "&lt;" and not "<"
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/"/g;
    $text =~ s/&apos;/'/g;
    $text =~ s/&amp;/&/g;

    # encode byte-wise, whatever the internal representation of the string
    utf8::encode($text) if (utf8::is_utf8($text));
    $text =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

    return $text;
}

# Collect the ListIdentifiers responses of the repository at $strBasURL.
#
# Arguments:
#   $strBasURL - base URL of the OAI repository
#
# Returns the concatenated text of all responses fetched, or nothing (undef
# in scalar context) when the first request yields no text.  Further pages
# are requested while a response carries a non-empty resumption token and
# fewer than max_records identifiers have been seen.  If 'forced_quit' is set
# after a request, whatever has been accumulated so far is returned.
sub getOAIIDs
{
    my ($self, $strBasURL) = @_;

    my $wgetOptions = $self->getWgetOptions();

    print STDERR "Gathering OAI identifiers.....\n";

    my $metadata_prefix = $self->{'metadata_prefix'};
    my $cmdWget = $wgetOptions
        . " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=$metadata_prefix";

    # if $set specified, add it in to URL
    my $set = $self->{'set'};
    $cmdWget .= "&set=$set" if (defined $set && $set ne "");

    $cmdWget .= "\" ";

    my $strIDs = $self->useWget($cmdWget);

    if (!defined $strIDs or $strIDs eq "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }
    if ($self->{'forced_quit'}) {
        return $strIDs;
    }

    print STDERR "<<Download Information>>\n";

    $self->parse_xml($strIDs);

    my $accumulated_strIDs = $strIDs;
    my $max_records        = $self->{'max_records'};

    # Running identifier count, updated per response instead of re-scanning
    # the whole accumulated text after every page.
    my @found = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
    my $num_acc_identifiers = scalar(@found);

    my %seen_token = ();

    while ($strIDs =~ m/<resumptionToken.*?>\s*(.*?)\s*<\/resumptionToken>/) {
        # top up list with further requests for IDs
        my $resumption_token = $1;

        # An empty resumptionToken element marks the end of the list.
        last if ($resumption_token eq "");

        # A token the server has already handed out would loop forever.
        last if ($seen_token{$resumption_token}++);

        # Enough identifiers gathered: no need to fetch another page.
        last if (defined $max_records && $num_acc_identifiers >= $max_records);

        my $encoded_token = _oai_url_encode($resumption_token);
        $cmdWget = $wgetOptions
            . " -q -O - \"$strBasURL?verb=ListIdentifiers&resumptionToken=$encoded_token\"";

        $strIDs = $self->useWget($cmdWget);
        if ($self->{'forced_quit'}) {
            return $accumulated_strIDs;
        }

        # Keep what has been gathered rather than letting parse_xml die on
        # an empty response.
        if (!defined $strIDs || $strIDs eq "") {
            print STDERR "Warning: no response to resumption request; using the identifiers gathered so far\n";
            last;
        }

        $self->parse_xml($strIDs);

        $accumulated_strIDs .= $strIDs;

        @found = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
        $num_acc_identifiers += scalar(@found);
    }

    return $accumulated_strIDs;
}
250
# Extract the identifiers from ListIdentifiers response text.
#
# Arguments:
#   $strIDs - response text (possibly several responses concatenated)
#
# Returns an array ref holding the content of every <identifier> element, in
# document order.  The list is empty when there are none.
sub parseOAIIDs
{
    my ($self, $strIDs) = @_;

    print STDERR "Parsing OAI identifiers.....\n";

    my @aryIDs = ();

    # One global match collects every identifier in a single pass over the
    # text; no need to re-copy the remaining text for each identifier.
    if (defined $strIDs) {
        @aryIDs = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);
    }

    return \@aryIDs;
}
269
# Split a file path into its directory part and its final component.
#
# Arguments:
#   $strFile - file path
#
# Returns ($strSubDirs, $strLocalFile): the leading components re-joined with
# "/", and the last component.
sub dirFileSplit
{
    my ($self, $strFile) = @_;

    # "/" always separates components.  "\" does so only on Windows; on
    # other systems it is an ordinary filename character.  (The earlier
    # pattern "[/\]" was a double-quoted string, so the backslash was lost
    # and only "/" was ever split on.)
    my $separator = ($^O eq "MSWin32") ? qr{[/\\]} : qr{/};

    my @aryDirs = split($separator, $strFile);

    my $strLocalFile = pop(@aryDirs);
    my $strSubDirs   = join("/", @aryDirs);

    return ($strSubDirs, $strLocalFile);
}
281
# Turn an href that does not start with a scheme into an absolute URL, using
# $base_url (the page the href was found on) as the reference point.
# Returns nothing when $base_url itself is not an absolute http/https/ftp URL.
# Dot segments ("../") are left in place.
sub _resolve_href
{
    my ($href, $base_url) = @_;

    my ($scheme, $site) = ($base_url =~ m/^((?:https?|ftp):)(\/\/[^\/?#]*)/);
    return unless (defined $site);

    # "//host/path": same scheme, different host
    return "$scheme$href" if ($href =~ m/^\/\//);

    # "/path": same site, path given from the root
    return "$scheme$site$href" if ($href =~ m/^\//);

    # anything else is relative to the directory of the base document
    my $base_dir = $base_url;
    $base_dir =~ s/[?#].*$//s;
    if ($base_dir =~ m/^(?:https?|ftp):\/\/[^\/]*$/) {
        $base_dir .= "/";          # base URL had no path at all
    }
    else {
        $base_dir =~ s/[^\/]*$//;  # drop the document name
    }
    return "$base_dir$href";
}

# Download the source document(s) referenced by an OAI record.
#
# Arguments:
#   $strRecord        - text of the record
#   $oai_rec_filename - file the record will be saved to, ending in ".oai";
#                       downloaded documents are saved next to it with
#                       ".oai" replaced by "-<n>.<ext>"
#
# Every (dc:)identifier inside <metadata> that is an http, https or ftp URL is
# treated as a document location.  If its extension is not one of the wanted
# ones (lookup_exts) and it is an HTML page (or has no extension), the page is
# fetched and scanned for the first link with a wanted extension, which is
# then downloaded instead.  For each downloaded document a <gi.Sourcedoc>
# element naming the local file is inserted after the identifier.
#
# Returns the (possibly amended) record text.  Exits the process if wget
# reports an error.
#
# NOTE(review): URLs taken from the record and from fetched HTML are placed
# inside double quotes in a command string; if useWgetMonitored runs that
# string through a shell, characters such as `"`, `$` and backquote in a URL
# would be interpreted by it -- confirm and escape there if so.
sub getOAIDoc
{
    my ($self, $strRecord, $oai_rec_filename) = @_;

    print STDERR "Gathering source documents.....\n";

    # look out for identifier tag in metadata section
    my ($strMetaTag) = ($strRecord =~ m/<metadata>(.*)<\/metadata>/s);
    if (!defined $strMetaTag)
    {
        print STDERR "\tNo source document URL is specified in the OAI record (No metadata field is provided)\n";
        return $strRecord;
    }

    my $lookup_exts   = $self->{'lookup_exts'};
    my $had_valid_url = 0;
    my $count         = 1;

    # Each pass removes one identifier element from the working copy, so the
    # loop ends once all of them have been looked at.
    while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
    {
        my $doc_id_url = $2;

        next if ($doc_id_url !~ m/^(https?|ftp):\/\//);

        my $orig_doc_id_url = $doc_id_url;
        $had_valid_url = 1;

        my ($doc_id_tail) = ($doc_id_url =~ m/^.*\/(.*?)$/);
        my $primary_doc_match = 0;

        my $id_file_ext;
        if (defined $doc_id_tail && $doc_id_tail =~ m/\.([^\.]+)$/) {
            $id_file_ext = $1;
        }

        if (defined $id_file_ext) {
            # cross-check this filename extension with get_doc_exts option
            # if provided
            if (defined $lookup_exts->{lc($id_file_ext)}) {
                # this initial URL matches requirement
                $primary_doc_match = 1;
            }
        }
        else {
            # no extension at all: treat the URL as an HTML page
            $id_file_ext = "html";
        }

        if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
            # Download this doc if HTML, scan through it looking for a link
            # that does match get_doc_exts

            # 1. Generate a tmp name
            my $tmp_filename = util::get_tmp_filename();

            # 2. Download it
            my $wget_opts2 = $self->getWgetOptions();
            my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";

            my ($stdout_and_err2, $error2, $follow2) = $self->useWgetMonitored($wget_cmd2);
            return $strRecord if $self->{'forced_quit'};

            if ($error2 ne "")
            {
                print STDERR "Error occured while retrieving OAI source documents: $error2\n";
                exit(-1);
            }

            if (defined $follow2) {
                # src url was "redirected" to another place
                # => pick up on this and make it the new doc_id_url
                $doc_id_url = $follow2;
            }

            if (open(my $html_in, "<", $tmp_filename)) {
                my $primary_doc_html = do { local $/; <$html_in> };
                close($html_in);
                $primary_doc_html = "" unless (defined $primary_doc_html);

                # 3. Scan through it looking for match
                #
                # if got match, change $doc_id_url to this new URL and
                # $id_file_ext to 'match'
                my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);

                foreach my $href (@href_links) {
                    my ($ext) = ($href =~ m/\.([^\.]+)$/);

                    # lookup_exts keys are lower case, so compare in lower
                    # case (as done for the primary URL above)
                    next unless ((defined $ext) && (defined $lookup_exts->{lc($ext)}));

                    my $link_url = $href;
                    if ($link_url !~ m/^(https?|ftp):\/\//) {
                        # link is within current site
                        $link_url = _resolve_href($link_url, $doc_id_url);
                        next unless (defined $link_url);
                    }

                    $doc_id_url  = $link_url;
                    $id_file_ext = $ext;
                    last;
                }
            }
            else {
                print STDERR "Error occurred while retrieving OAI source documents:\n";
                print STDERR "$!\n";
            }

            if (-e $tmp_filename) {
                util::rm($tmp_filename);
            }
        }

        # Local name for the document: the record's filename with ".oai"
        # replaced by "-<counter>.<ext>"; the counter keeps documents with
        # the same extension apart.
        my $download_doc_filename = $oai_rec_filename;
        my $new_extension = "-$count.$id_file_ext";
        $count++;
        $download_doc_filename =~ s/\.oai$/$new_extension/;
        my ($unused, $download_doc_file) = $self->dirFileSplit($download_doc_filename);

        my $wget_opts = $self->getWgetOptions();
        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$doc_id_url\"";

        my ($stdout_and_err, $errors, $follow) = $self->useWgetMonitored($wget_cmd);
        return $strRecord if $self->{'forced_quit'};

        if ($errors ne "")
        {
            print STDERR "Error occured while retriving OAI souce documents:\n";
            print STDERR "$errors\n";
            exit(-1);
        }

        # Record the local filename right after the identifier it came from.
        # The URL is quoted (\Q..\E) because it routinely contains regex
        # metacharacters such as "?", "+" and ".".
        $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$orig_doc_id_url\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<${2}identifier>$orig_doc_id_url<\/${2}identifier>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
    }

    if (!$had_valid_url)
    {
        print STDERR "\tNo source document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
    }

    return $strRecord;
}
433
# Fetch the records for a list of identifiers and save each one to disk.
#
# Arguments:
#   $aryIDs         - array ref of OAI identifiers
#   $strOutputDir   - directory to save under (double quotes are removed)
#   $strBasURL      - base URL of the OAI repository
#   $intMaxRecords  - stop after this many records have been saved
#   $blnDownloadDoc - when true, also download each record's source documents
#
# Each record is written to "<output dir>/<host>/<local id>.oai", where
# <host> is derived from $self->{'url'} and <local id> is the part of the
# identifier after its last ":".  Dies if a record file cannot be written.
sub getOAIRecords
{
    my ($self, $aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;

    my $intDocCounter = 0;

    my $metadata_prefix = $self->{'metadata_prefix'};

    # The output directory and host part do not change from record to
    # record, so work them out once.
    $strOutputDir =~ s/"//g;

    my $host = $self->{'url'};
    $host =~ s/https?:\/\///g;
    $host =~ s/:.*//g;

    foreach my $strID (@$aryIDs)
    {
        print STDERR "Gathering OAI record with ID $strID.....\n";

        my $wget_opts = $self->getWgetOptions();
        my $cmdWget = "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";

        my $strRecord = $self->useWget($cmdWget);

        # stop, as the other download steps do, once a quit has been forced
        last if ($self->{'forced_quit'});

        # do not leave an empty .oai file behind for a failed request
        if (!defined $strRecord || $strRecord eq "")
        {
            print STDERR "Error: No record was returned for ID $strID, skipping it\n";
            next;
        }

        my @id_parts = split(":", $strID);
        my $local_id = pop(@id_parts);

        my $strFileURL = "$strOutputDir/$host/$local_id.oai";

        # prepare subdirectory for record (if needed)
        my ($strSubDirPath, $unused) = $self->dirFileSplit($strFileURL);
        util::mk_all_dir($strSubDirPath);

        if ($blnDownloadDoc)
        {
            $strRecord = $self->getOAIDoc($strRecord, $strFileURL);
        }

        # save record
        open(my $oai_out, ">", $strFileURL)
            || die "Unable to save oai metadata record: $!\n";
        print {$oai_out} $strRecord;
        # buffered write errors only show up when the handle is closed
        close($oai_out)
            || die "Unable to save oai metadata record: $!\n";

        print STDERR "Saving records to $strFileURL\n";
        print STDERR "<<Done>>\n";
        $intDocCounter++;
        last if ($intDocCounter >= $intMaxRecords);
    }

    if ($intDocCounter >= $intMaxRecords)
    {
        print STDERR "Reached maximum download records, use -max_records to set the maximum.\n";
    }
    else
    {
        print STDERR "Complete download meta record from $strBasURL\n";
    }

    print STDERR "<<Finished>>\n";
}
499
# Print information about the repository at $self->{'url'} to STDERR.
#
# Issues the Identify, ListMetadataFormats and ListSets requests in that
# order and passes each response to parse_xml.  Returns early, with nothing,
# when the Identify request yields no text.
sub url_information
{
    my ($self) = shift(@_);
    die "System Error: No \$self defined for url_information in OAIDownload\n"
        unless (defined $self);

    # Command template; _OPTS_ is replaced by the query string of each request.
    my $cmd_template = $self->getWgetOptions()." -q -O - \"$self->{'url'}?_OPTS_\"";

    # Fill in the template for one request and run it.
    my $fetch = sub {
        my ($query) = @_;
        my $cmd = $cmd_template;
        $cmd =~ s/_OPTS_/$query/;
        return $self->useWget($cmd);
    };

    my $rule = "=" x 10;

    my $identify_text = $fetch->("verb=Identify");

    if (!defined $identify_text or $identify_text eq "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "General information:\n";
    $self->parse_xml($identify_text);
    print STDERR "\n";

    print STDERR $rule, "\n";
    print STDERR "Metadata Format Information (metadataPrefix):\n";
    print STDERR $rule, "\n";

    my $formats_text = $fetch->("verb=ListMetadataFormats");
    $self->parse_xml($formats_text);
    print STDERR "\n";

    print STDERR $rule, "\n";
    print STDERR "List Information:\n";
    print STDERR $rule, "\n";

    my $sets_text = $fetch->("verb=ListSets");
    $self->parse_xml($sets_text);
}
548
# Run $self->{'parser'} over a piece of XML text.
#
# Arguments:
#   $xml_text - the XML to parse
#
# The text is written to the file "oai.tmp" in the tmp directory under
# $ENV{GSDLHOME} and parsed from there; the file is removed afterwards,
# whether or not parsing succeeded.  The file name is also stored in
# $self->{'temp_file_name'}.
#
# Dies if the temporary file cannot be written, if the parser rejects the
# text, or if the file cannot be removed after a successful parse.
sub parse_xml
{
    my ($self) = shift(@_);
    my ($xml_text) = @_;

    #### change this to work directly from $xml_text

    # Store the OAI information in a temporary file for the parser
    my $name = util::filename_cat($ENV{GSDLHOME}, "tmp", "oai.tmp");

    open(my $tmp_out, ">", $name)
        or die "OAI: Unable to write temporary file $name: $!\n";
    print {$tmp_out} (defined $xml_text ? $xml_text : "");
    close($tmp_out)
        or die "OAI: Unable to write temporary file $name: $!\n";

    $self->{'temp_file_name'} = $name;

    eval {
        $self->{'parser'}->parsefile("$name");
    };
    # remember the outcome before anything else can overwrite $@
    my $parse_error = $@;

    # remove the file before reporting, so a failed parse leaves nothing behind
    my $removed      = unlink($name);
    my $unlink_error = $!;

    if ($parse_error) {
        die "OAI: Parsed file $name is not a well formed XML file ($parse_error)\n";
    }

    $removed or die "Could not unlink $name: $unlink_error";

    return $removed;
}
580
[16791]581####END
582#{
583# if($self->{'info'})
584# {
585# unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
586# }
587#}
588
[11783]589# This Char function overrides the one in XML::Parser::Stream to overcome a
590# problem where $expat->{Text} is treated as the return value, slowing
591# things down significantly in some cases.
sub Char {
    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+

    my ($expat, $chars) = @_;

    # accumulate the raw character data on the parser object
    $expat->{'Text'} .= $chars;

    # outside of any element there is nothing to report
    my $plugin = $expat->{'PluginObj'};
    my $field  = $plugin->{'subfield'};
    unless (defined $field && ($field ne "")) {
        return undef;
    }

    # add to the text of the current element, dropping newlines and runs of
    # two or more spaces, and echo what has been collected so far
    $plugin->{'text'} .= $chars;
    $plugin->{'text'} =~ s/[\n]|([ ]{2,})//g;
    print STDERR " $plugin->{'subfield'}:($plugin->{'text'})\n"
        if ($plugin->{'text'} ne "");

    return undef;
}
607
# Start-tag handler: remember the name of the element just opened as the
# current 'subfield' of the object stored under 'PluginObj'.  Attributes
# passed after the element name are not used.
sub OAI_StartTag
{
    my $expat   = shift(@_);
    my $element = shift(@_);

    my $plugin = $expat->{'PluginObj'};
    $plugin->{'subfield'} = $element;
}
616
# End-tag handler: the element is finished, so clear both the collected text
# and the current 'subfield' on the object stored under 'PluginObj'.
sub OAI_EndTag
{
    my $expat = shift(@_);

    my $plugin = $expat->{'PluginObj'};
    $plugin->{'subfield'} = $plugin->{'text'} = "";
}
625
# Report a fatal error on standard output and terminate the process.
#
# Arguments:
#   $strFunctionName - name of the function the error occurred in
#   $strError        - description of the error
#
# Does not return: exits with status -1.
sub error
{
    my ($self, $strFunctionName, $strError) = @_;

    my $message = join("",
                       "Error occoured in OAIDownload.pm\n",
                       "In Function:", $strFunctionName, "\n",
                       "Error Message:", $strError, "\n");
    print $message;

    exit(-1);
}
636
637
638
6391;
Note: See TracBrowser for help on using the repository browser.