source: gsdl/trunk/perllib/downloaders/OAIDownload.pm@ 14926

Last change on this file since 14926 was 14926, checked in by dmn, 16 years ago

Message: davidb gs3 building updates

  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
RevLine 
[11783]1###########################################################################
2#
3# OAIDownload.pm -- downloader that harvests metadata records (and, optionally, their source documents) from an OAI repository using wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
[12465]40use util;
[11783]41
# Set up inheritance at compile time: OAIDownload is a WgetDownload.
BEGIN {
    @OAIDownload::ISA = qw(WgetDownload);
}
45
# Arguments accepted by this downloader.  The "{OAIDownload.xxx}" values
# look like placeholders that are resolved to display text elsewhere
# (NOTE(review): confirm against the code that consumes ArgList).
my $arguments = [
    {
        'name' => "url",
        'disp' => "{OAIDownload.url_disp}",
        'desc' => "{OAIDownload.url}",
        'type' => "string",
        'reqd' => "yes",
    },
    {
        'name' => "set",
        'disp' => "{OAIDownload.set_disp}",
        'desc' => "{OAIDownload.set}",
        'type' => "string",
        'reqd' => "no",
    },
    {
        'name' => "get_doc",
        'disp' => "{OAIDownload.get_doc_disp}",
        'desc' => "{OAIDownload.get_doc}",
        'type' => "flag",
        'reqd' => "no",
    },
    {
        'name'  => "max_records",
        'disp'  => "{OAIDownload.max_records_disp}",
        'desc'  => "{OAIDownload.max_records}",
        'type'  => "int",
        'deft'  => "500",
        'range' => "1,",
        'reqd'  => "no",
    },
];

# Description of this module; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "OAIDownload",
    'desc'     => "{OAIDownload.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# The object most recently built by new().  The XML parser callbacks
# (Char, OAI_StartTag, OAI_EndTag) and the END block are not passed the
# downloader object, so they reach it through this file-level variable.
my $self;

# wget option string; assigned in download() and read by getOAIRecords()
# and getOAIDoc() when they build their wget commands.
my $strWgetOptions = "";
79
# Constructor.
#
#   $getlist         - array ref; this class's name is appended to it
#   $inputargs       - handed on unchanged to the WgetDownload constructor
#   $hashArgOptLists - hash ref; this module's argument list and option
#                      description are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by WgetDownload, blessed into $class.  The object
# is also remembered in the file-level $self for the XML parser callbacks.
sub new
{
    my $class = shift(@_);
    my ($getlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$getlist}, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options)      if (defined $options);

    if (defined $hashArgOptLists) {
        $self = WgetDownload->new($getlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = WgetDownload->new($getlist, $inputargs);
    }

    # When only information about the module is wanted, no parser or
    # temporary directory is needed.
    unless ($self->{'info_only'}) {
        $self->{'parser'} = XML::Parser->new(
            'Style'    => 'Stream',
            'Handlers' => {
                'Char'  => \&Char,
                'Start' => \&OAI_StartTag,
                'End'   => \&OAI_EndTag
            });

        # make sure the tmp directory that parse_xml() writes to exists
        my $strTmpDir = "$ENV{GSDLHOME}/tmp";
        &util::mk_dir($strTmpDir) if (! -e $strTmpDir);
    }

    return bless $self, $class;
}
111
# Harvest the repository named by the object's 'url' into the cache directory.
#
#   $hashGeneralOptions - hash ref; its "cache_dir" entry is the directory the
#                         records are written beneath
#
# Reads 'url', 'max_records' and 'get_doc' from the object.  Progress markers
# of the form <<...>> are written to STDERR, each on a line of its own.
# Returns 0 when no identifier list could be obtained, 1 otherwise.
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Kept in the file-level variable because getOAIRecords() and getOAIDoc()
    # build their wget commands from it.
    $strWgetOptions = $self->getWgetOptions();

    my $strOutputDir   = $hashGeneralOptions->{"cache_dir"};
    my $strBasURL      = $self->{'url'};
    my $intMaxRecords  = $self->{'max_records'};
    my $blnDownloadDoc = $self->{'get_doc'};

    print STDERR "<<Defined Maximum>>\n";

    my $strIDs = $self->getOAIIDs($strBasURL);

    # getOAIIDs() returns nothing at all when the server gave no answer,
    # so test for undef before comparing as a string.
    if (!defined $strIDs || $strIDs eq "")
    {
        print STDERR "Error: No ID being found\n";
        return 0;
    }

    my $aryIDs = $self->parseOAIIDs($strIDs);

    # never announce more records than max_records allows
    my $intIDs = scalar(@$aryIDs);
    $intIDs = $intMaxRecords if ($intMaxRecords < $intIDs);
    print STDERR "<<Total number of record(s):$intIDs>>\n";

    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);

    # remove the scratch file that parse_xml() leaves behind
    my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";
    &util::rm($tmp_file);

    return 1;
}
156
# Ask the repository for its list of record identifiers.
#
#   $strBasURL - base URL of the OAI repository
#
# Issues a ListIdentifiers request (metadataPrefix oai_dc, restricted to the
# object's 'set' when that is non-empty) through useWget(), feeds the reply
# to parse_xml(), and returns the raw reply text.  Returns nothing when the
# reply is undefined or empty.
sub getOAIIDs
{
    my ($self,$strBasURL) = @_;

    my $cmdWget = $self->getWgetOptions();

    print STDERR "Gathering OAI identifiers.....\n";

    # assemble the query string once; the set restriction is optional
    my $strQuery = "verb=ListIdentifiers&metadataPrefix=oai_dc";
    $strQuery .= "&set=$self->{'set'}" if ($self->{'set'} ne "");

    $cmdWget .= " -q -O - \"$strBasURL?$strQuery\" ";

    my $strIDs = $self->useWget($cmdWget);

    unless (defined $strIDs and $strIDs ne "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "<<Download Information>>\n";

    $self->parse_xml($strIDs);

    return $strIDs;
}
192
# Extract the record identifiers from a ListIdentifiers response.
#
#   $strIDs - XML text of the response
#
# Returns a reference to an array holding the content of every
# <identifier>...</identifier> element, in document order.  The array is
# empty when the text is undefined or contains no such element.
sub parseOAIIDs
{
    my ($self,$strIDs) = @_;

    print STDERR "Parsing OAI identifiers.....\n";

    return [] if (!defined $strIDs);

    # A single global match collects every identifier in one pass; /s lets
    # an element span line breaks.
    my @aryIDs = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/gs);

    return \@aryIDs;
}
211
# Split a path or URL into its directory part and its final component.
#
#   $strFile - path using "/" and/or "\" as separators
#
# Returns ($strSubDirs, $strLocalFile): the leading components re-joined
# with "/", and the last component.
sub dirFileSplit
{
    my ($self,$strFile) = @_;

    # Written as a regex literal: inside a double-quoted string "\]" collapses
    # to "]", which silently drops the backslash from the character class.
    my @aryDirs = split(/[\/\\]/, $strFile);

    my $strLocalFile = pop(@aryDirs);
    my $strSubDirs = join("/",@aryDirs);

    return ($strSubDirs,$strLocalFile);
}
223
# Fetch the source document that an OAI record points at and rewrite the
# record so that it refers to the local copy.
#
#   $strRecord     - XML text of one GetRecord response
#   $strSubDirPath - directory the record is stored in; the document is
#                    saved in a "srcdocs" directory beneath it
#
# The first <identifier> or <dc:identifier> element inside <metadata> is
# taken to be the document URL.  Returns the record text: rewritten (the URL
# moved into <OrigURL>, the identifier pointing at srcdocs/<file>) when a
# document was fetched, otherwise unchanged.  Exits the process when
# useWget() reports an error.
sub getOAIDoc
{
    my ($self,$strRecord, $strSubDirPath) = @_;

    print STDERR "Gathering source documents.....\n";

    # look out for identifier tag in metadata section
    my ($strMetaTag) = (defined $strRecord)
        ? ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
        : ();

    if (!defined $strMetaTag)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n";
        return $strRecord;
    }

    my ($strDocURL) = ($strMetaTag =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/s);

    if (!defined $strDocURL)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
        return $strRecord;
    }

    my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL);

    my $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs");
    &util::mk_dir($strSoureDirPath) if (!-e "$strSoureDirPath");

    my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile);

    my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strDocURL\"";

    my $strResponse = $self->useWget($wget_cmd,1);

    if (defined $strResponse && $strResponse ne "")
    {
        print STDERR "Error occured while retriving OAI souce documents: $strResponse\n";
        exit(-1);
    }

    # \Q...\E: the URL is data, not a pattern -- "?", "." and "+" are common
    # in URLs and would otherwise be read as regex operators.
    $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$strDocURL\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$strDocURL<\/OrigURL>\n <identifier>srcdocs\/$strDocFile<\/identifier>$4<\/metadata>/s;

    # Hand the rewritten text back; the caller only holds its own copy.
    return $strRecord;
}

# Download the records named by a list of identifiers and save each to disk.
#
#   $aryIDs         - array ref of OAI identifiers
#   $strOutputDir   - cache directory (double quotes are stripped from it)
#   $strBasURL      - base URL of the OAI repository
#   $intMaxRecords  - stop after this many records
#   $blnDownloadDoc - when true, also fetch each record's source document
#
# Each record is written to <output dir>/<host>/<id>.oai, where <host> is
# the object's 'url' without its scheme and cut at the first ":" that
# follows, and <id> is the identifier with ":" replaced by "/".
# Dies when a record file cannot be written.
sub getOAIRecords
{
    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;

    my $intDocCounter = 0;

    # These do not depend on the record, so compute them once.
    $strOutputDir =~ s/"//g; #"

    my $host = $self->{'url'};
    # strip any scheme (http, https, ...), not just "http://"
    $host =~ s/^[a-z][a-z0-9+.\-]*:\/\///i;
    $host =~ s/:.*//g;

    foreach my $strID ( @$aryIDs)
    {
        print STDERR "Gathering OAI record with ID:$strID.....\n";

        my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=oai_dc&identifier=$strID\"";

        my $strRecord = $self->useWget($cmdWget);
        $strRecord = "" if (!defined $strRecord);

        # setup directories
        my $midDir = join ("/", split(":",$strID));
        my $strFileURL = "$strOutputDir/$host/".$midDir.".oai";

        # prepare subdirectory for record (if needed)
        my ($strSubDirPath) = $self->dirFileSplit($strFileURL);
        &util::mk_all_dir($strSubDirPath);

        if($blnDownloadDoc)
        {
            # keep the rewritten record so that it is what gets saved
            $strRecord = $self->getOAIDoc($strRecord,$strSubDirPath);
        }

        # save record
        open (my $fhRecord, ">", $strFileURL)
            || die "Unable to save oai metadata record: $!\n";
        print $fhRecord $strRecord;
        # buffered write errors only show up when the handle is closed
        close($fhRecord)
            || die "Unable to save oai metadata record: $!\n";

        print STDERR "Saving records to $strFileURL\n";
        print STDERR "<<Done>>\n";
        $intDocCounter ++;
        last if ($intDocCounter >= $intMaxRecords);
    }

    if ($intDocCounter >= $intMaxRecords)
    {
        print STDERR "Reached maximum download records, use -max_records to set the maximum.\n";
    }
    else
    {
        print STDERR "Complete download meta record from $strBasURL\n";
    }

    print STDERR "<<Finished>>\n";
}
335
# Print general information about the repository named by the object's 'url'.
#
# Sends an Identify request and then a ListSets request through useWget()
# and passes each reply to parse_xml().  Returns early, after a message on
# STDERR, when a reply is undefined or empty, so that parse_xml() is never
# handed an empty document.
sub url_information
{
    my ($self) = shift (@_);
    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}

    my $wgetOptions = $self->getWgetOptions();

    # everything up to and including the "?" is shared by both requests
    my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?";

    my $strIdentifyCMD = $strBaseCMD."verb=Identify\"";
    my $strIdentifyText = $self->useWget($strIdentifyCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq "" ){
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "General information:\n";
    $self->parse_xml($strIdentifyText);

    my $strListSetCMD = $strBaseCMD."verb=ListSets\"";
    my $strListSetsText = $self->useWget($strListSetCMD);

    print STDERR "List Information:\n";

    # The set listing is optional information: report its absence rather
    # than letting the XML parser die on an empty document.
    if (!defined $strListSetsText or $strListSetsText eq "" ){
        print STDERR "Set information is unavailable.\n";
        return;
    }

    $self->parse_xml($strListSetsText);
}
369
# Run a piece of XML text through the object's XML parser.
#
#   $strOutputText - XML text to parse
#
# The text is first written to $GSDLHOME/tmp/oai.tmp, whose name is stored
# in the object's 'temp_file_name' entry, and the parser then reads that
# file.  Dies when the file cannot be written or the parser rejects it.
sub parse_xml
{
    my ($self) = shift (@_);
    my ($strOutputText) = @_;

    #Open a temporary file to store OAI information, and store the information to the temp file
    my $name = "$ENV{GSDLHOME}/tmp/oai.tmp";

    # Check the open: if it failed silently, a stale file left by an earlier
    # call would be parsed in place of this text.
    open(my $fhTmp, ">", $name)
        || die "OAI: unable to write temporary file $name: $!\n";
    print $fhTmp (defined $strOutputText ? $strOutputText : "");
    close($fhTmp)
        || die "OAI: unable to write temporary file $name: $!\n";

    $self->{'temp_file_name'} = $name;

    eval {
        $self->{'parser'}->parsefile("$name");
    };

    if ($@) {
        die "OAI: $name is not a well formed XML file ($@)\n";
    }
}
393
# At program exit, remove the scratch file recorded by parse_xml() when the
# object's 'info' entry is set.  Nothing is attempted when no object was
# built, no scratch file was recorded, or the file is already gone, so a run
# that never reached parse_xml() does not end with a spurious error.
END{
    if (defined $self && $self->{'info'})
    {
        my $strTempFile = $self->{'temp_file_name'};
        if (defined $strTempFile && -e $strTempFile)
        {
            unlink($strTempFile) or die "Could not unlink $strTempFile: $!";
        }
    }
}
400
# Character-data handler for the XML parser.  It replaces the Char handler
# of XML::Parser::Stream to avoid a slowdown caused by $expat->{Text} being
# treated as the return value; for that reason it always returns undef.
#
# While an element is open (the file-level object's 'subfield' entry is
# non-empty) the text is accumulated, stripped of newlines and of runs of
# two or more spaces, and echoed to STDERR as "subfield:(text)".
sub Char {
    use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+

    my ($expat, $strChars) = @_;
    $expat->{'Text'} .= $strChars;

    # outside any element there is nothing to report
    my $strField = $self->{'subfield'};
    return undef unless (defined $strField && $strField ne "");

    $self->{'text'} .= $strChars;
    $self->{'text'} =~ s/[\n]|([ ]{2,})//g;

    print STDERR " $self->{'subfield'}:($self->{'text'})\n"
        if ($self->{'text'} ne "");

    return undef;
}
417
# Start-tag handler for the XML parser: remember the name of the element
# just opened in the file-level object's 'subfield' entry, which Char()
# uses to label the text that follows.  The parser object and the
# element's attributes are not used.
sub OAI_StartTag
{
    my (undef, $strElement) = @_;

    $self->{'subfield'} = $strElement;
}
425
# End-tag handler for the XML parser: clear the element name and the text
# accumulated for it in the file-level object, so that Char() stays quiet
# until the next element opens.  The handler's arguments are not used.
sub OAI_EndTag
{
    $self->{'text'} = "";
    $self->{'subfield'} = "";
}
432
# Report a fatal error and terminate the program.
#
#   $strFunctionName - name of the function in which the error happened
#   $strError        - text describing the error
#
# Prints a three-line report to standard output and exits with status -1;
# it never returns.
sub error
{
    my ($self,$strFunctionName,$strError) = @_;

    my $strReport = join("\n",
                         "Error occoured in OAIDownload.pm",
                         "In Function:".$strFunctionName,
                         "Error Message:".$strError)."\n";

    print $strReport;
    exit(-1);
}
443
444
445
4461;
Note: See TracBrowser for help on using the repository browser.