source: gsdl/trunk/perllib/downloaders/OAIDownload.pm@ 14179

Last change on this file since 14179 was 14179, checked in by xiao, 17 years ago

The directory arguments of wget command need to be quoted to work properly on Windows OS

  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# OAIDownload.pm -- downloader that uses wget to fetch OAI records (and optionally their source documents)
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
# Load the bytes pragma module if this perl provides it; a failure is ignored.
eval { require bytes };

# Install a global warning filter: "Subroutine ... redefined" warnings (which
# various plugins cause under perl 5.6) are dropped, every other warning is
# re-issued unchanged.
$SIG{__WARN__} = sub {
    my ($strWarning) = @_;
    return if ($strWarning =~ /Subroutine\s+\S+\sredefined/);
    warn($strWarning);
};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
# Set up inheritance at compile time: OAIDownload is a WgetDownload.
BEGIN {
    @OAIDownload::ISA = qw(WgetDownload);
}
45
# Command-line arguments understood by this downloader.  The 'disp' and
# 'desc' values are placeholder keys written in curly braces.
my $arguments = [
    {
        'name' => 'url',
        'type' => 'string',
        'reqd' => 'yes',
        'disp' => '{OAIDownload.url_disp}',
        'desc' => '{OAIDownload.url}',
    },
    {
        'name' => 'set',
        'type' => 'string',
        'reqd' => 'no',
        'disp' => '{OAIDownload.set_disp}',
        'desc' => '{OAIDownload.set}',
    },
    {
        'name' => 'get_doc',
        'type' => 'flag',
        'reqd' => 'no',
        'disp' => '{OAIDownload.get_doc_disp}',
        'desc' => '{OAIDownload.get_doc}',
    },
    {
        'name'  => 'max_records',
        'type'  => 'int',
        'deft'  => '500',
        'range' => '1,',
        'reqd'  => 'no',
        'disp'  => '{OAIDownload.max_records_disp}',
        'desc'  => '{OAIDownload.max_records}',
    },
];

# Description of this downloader; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'OAIDownload',
    'desc'     => '{OAIDownload.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};

# The object most recently built by new().  It is file-level because the XML
# parser callbacks (Char, OAI_StartTag, OAI_EndTag) and the END block are
# plain functions that have no other way to reach it.
my $self;

# wget option string; download() fills it in, getOAIDoc() and
# getOAIRecords() read it.
my $strWgetOptions = "";
79
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $getlist         - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to WgetDownload->new
#   $hashArgOptLists - hash ref; this module's argument list is appended to
#                      its "ArgList" entry and its option description to its
#                      "OptList" entry
#
# Returns the object created by WgetDownload->new, re-blessed into $class.
# The object is also stored in the file-level $self, which the XML parser
# callbacks and the END block read.
sub new
{
    my ($class) = shift (@_);
    my ($getlist,$inputargs,$hashArgOptLists) = @_;
    push(@$getlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Explicit Class->new(...) calls: the indirect-object form
    # ("new WgetDownload(...)") is ambiguous inside a package that defines
    # its own sub new.
    $self = (defined $hashArgOptLists)
        ? WgetDownload->new($getlist,$inputargs,$hashArgOptLists)
        : WgetDownload->new($getlist,$inputargs);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    # Stream-style parser whose callbacks are the functions defined further
    # down in this file.
    my $parser = XML::Parser->new('Style' => 'Stream',
                                  'Handlers' => {'Char' => \&Char,
                                                 'Start' => \&OAI_StartTag,
                                                 'End' => \&OAI_EndTag
                                  });
    $self->{'parser'} = $parser;

    # make sure the tmp directory that we will use later exists
    my $tmp_dir = "$ENV{GSDLHOME}/tmp";
    if (! -e $tmp_dir) {
        &util::mk_dir($tmp_dir);
    }

    return bless $self, $class;
}
111
# Harvest the repository at $self->{'url'}.
#
# Parameters:
#   $hashGeneralOptions - hash ref; its "cache_dir" entry is the directory
#                         the records are saved under
#
# Returns 0 when no identifier list could be obtained, 1 otherwise.
# Progress markers of the form <<...>> are written to STDERR.
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Cache the wget options in the file-level variable that getOAIDoc()
    # and getOAIRecords() read.
    $strWgetOptions = $self->getWgetOptions();

    my $strOutputDir = $hashGeneralOptions->{"cache_dir"};
    my $strBasURL = $self->{'url'};
    my $intMaxRecords = $self->{'max_records'};
    my $blnDownloadDoc = $self->{'get_doc'};

    # This marker must begin its own STDERR line, so nothing may be printed
    # without a trailing newline before it.
    print STDERR "<<Defined Maximum>>\n";

    my $strIDs = $self->getOAIIDs($strBasURL);

    # getOAIIDs() returns nothing at all when the server gave no answer.
    if (!defined $strIDs || $strIDs eq "")
    {
        print STDERR "Error: No ID being found\n";
        return 0;
    }

    my $aryIDs = $self->parseOAIIDs($strIDs);

    # Number of records that will be fetched: every identifier found,
    # capped at max_records.
    my $intIDs = scalar(@$aryIDs);
    if ($intMaxRecords < $intIDs)
    {
        $intIDs = $intMaxRecords;
    }
    print STDERR "<<Total number of record(s):$intIDs>>\n";

    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);

    # Remove the scratch file that parse_xml() writes.
    my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";
    &util::rm($tmp_file);

    return 1;
}
156
# Ask the repository for its identifier list (verb=ListIdentifiers with
# metadataPrefix=oai_dc, limited to $self->{'set'} when that is non-empty).
#
# Parameters:
#   $strBasURL - base URL of the repository
#
# Returns the raw response text.  Returns nothing (bare return) when the
# response is undefined or empty.  A non-empty response is also passed to
# parse_xml() before being returned.
sub getOAIIDs
{
    my ($self,$strBasURL) = @_;

    my $cmdWget = $self->getWgetOptions();

    print STDERR "Gathering OAI identifiers.....\n";

    # Build the query string once; the set restriction is optional.
    my $strQuery = "verb=ListIdentifiers&metadataPrefix=oai_dc";
    if ($self->{'set'} ne "")
    {
        $strQuery .= "&set=$self->{'set'}";
    }
    $cmdWget .= " -q -O - \"${strBasURL}?${strQuery}\" ";

    my $strIDs = $self->useWget($cmdWget);

    unless (defined $strIDs and $strIDs ne "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "<<Download Information>>\n";

    $self->parse_xml($strIDs);

    return $strIDs;
}
192
# Extract the text of every <identifier>...</identifier> element from a
# ListIdentifiers response.
#
# Parameters:
#   $strIDs - response text
#
# Returns a reference to an array holding the identifiers in document order
# (an empty array when there are none).
sub parseOAIIDs
{
    my ($self,$strIDs) = @_;

    print STDERR "Parsing OAI identifiers.....\n";

    # A global non-greedy match in list context yields each element body in
    # turn; /s lets an element body span line breaks.
    my @aryFound = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);

    return \@aryFound;
}
211
# Split a path or URL into its directory part and its last component.
#
# Parameters:
#   $strFile - path or URL; both "/" and "\" are treated as separators
#
# Returns a two-element list: the leading components re-joined with "/",
# and the last component.
sub dirFileSplit
{
    my ($self,$strFile) = @_;

    # A regex literal is used on purpose: written as the double-quoted
    # string "[/\]" the backslash is consumed by string quoting and only
    # "/" would be recognised as a separator.
    my @aryDirs = split(/[\/\\]/, $strFile);

    my $strLocalFile = pop(@aryDirs);
    my $strSubDirs = join("/",@aryDirs);

    return ($strSubDirs,$strLocalFile);
}
223
# Private helper: true when $strValue is non-empty and contains none of the
# characters that stay special between double quotes on a POSIX shell
# command line (double quote, backtick, dollar, backslash) and no control
# characters.  Identifiers and document URLs come from the remote server
# and are pasted into the command string handed to useWget(), so they are
# checked first.
sub _isSafeCommandArg
{
    my ($strValue) = @_;

    return 0 if (!defined $strValue || $strValue eq "");
    return ($strValue =~ m/["`\$\\\x00-\x1f]/) ? 0 : 1;
}

# Download the source document that an OAI record points at.
#
# Parameters:
#   $strRecord     - text of the OAI record
#   $strSubDirPath - directory of the record; the document is stored in its
#                    "srcdocs" subdirectory
#
# Returns the record text.  When a document was downloaded, the first
# (dc:)identifier element inside <metadata> is replaced by an <OrigURL>
# element holding the original URL plus an <identifier> pointing at the
# local copy; otherwise the record is returned unchanged.  The caller must
# use the returned text, since the argument itself is not modified.
# Exits the program with -1 when wget reports an error for the document.
sub getOAIDoc
{
    my ($self,$strRecord, $strSubDirPath) = @_;

    print STDERR "Gathering source documents.....\n";

    # look out for identifier tag in metadata section
    my ($strMetaTag) = (defined $strRecord)
        ? ($strRecord =~ m/<metadata>(.*)<\/metadata>/s)
        : ();
    if (!defined $strMetaTag)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n";
        return $strRecord;
    }

    my ($strDocURL) = ($strMetaTag =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/s);
    if (!defined $strDocURL)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
        return $strRecord;
    }

    # The URL ends up inside a shell command; refuse anything that could
    # break out of the surrounding double quotes.
    if (!_isSafeCommandArg($strDocURL))
    {
        print STDERR "\tSkipping source document: the identifier is empty or contains characters that are unsafe on a command line\n";
        return $strRecord;
    }

    my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL);

    my $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs");
    &util::mk_dir($strSoureDirPath) if (!-e $strSoureDirPath);

    my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile);

    my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strDocURL\"";
    my $strResponse = $self->useWget($wget_cmd,1);

    if (defined $strResponse && $strResponse ne "")
    {
        print STDERR "Error occured while retriving OAI souce documents: $strResponse\n";
        exit(-1);
    }

    # \Q..\E: the URL is matched literally, so "?", "." or "+" in it cannot
    # be taken as regex syntax.
    $strRecord =~ s{<metadata>(.*?)<(dc:)?identifier>\Q$strDocURL\E</(dc:)?identifier>(.*?)</metadata>}
                   {<metadata>$1<OrigURL>$strDocURL</OrigURL>\n <identifier>srcdocs/$strDocFile</identifier>$4</metadata>}s;

    return $strRecord;
}

# Fetch and save the records named in $aryIDs, stopping after
# $intMaxRecords of them.
#
# Parameters:
#   $aryIDs         - array ref of OAI identifiers
#   $strOutputDir   - directory to save under (double quotes are stripped)
#   $strBasURL      - base URL used for the GetRecord requests
#   $intMaxRecords  - maximum number of records to save
#   $blnDownloadDoc - when true, getOAIDoc() is called for every record
#
# Each record is written to
#   <output dir>/<host>/<identifier with ":" turned into "/">.oai
# where <host> is $self->{'url'} without its http:// or https:// prefix and
# without anything from the first remaining ":" onwards.  Dies when a
# record file cannot be written.
sub getOAIRecords
{
    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;

    my $intDocCounter = 0;

    # These do not depend on the record, so compute them once.
    $strOutputDir =~ s/"//g; #"

    my $host = $self->{'url'};
    # Strip the scheme; https must be covered too, otherwise the next
    # substitution would reduce an https URL to the bare word "https".
    $host =~ s/https?:\/\///g;
    $host =~ s/:.*//g;

    foreach my $strID (@$aryIDs)
    {
        # The identifier is pasted into a shell command below.
        if (!_isSafeCommandArg($strID))
        {
            print STDERR "Skipping an OAI identifier that is empty or contains characters that are unsafe on a command line\n";
            next;
        }

        print STDERR "Gathering OAI record with ID:$strID.....\n";

        my $cmdWget = $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=oai_dc&identifier=$strID\"";
        my $strRecord = $self->useWget($cmdWget);

        # setup directories
        my $midDir = join("/", split(":",$strID));
        my $strFileURL = "$strOutputDir/$host/".$midDir.".oai";

        # prepare subdirectory for record (if needed)
        my ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);
        &util::mk_all_dir($strSubDirPath);

        if ($blnDownloadDoc)
        {
            # Keep the returned text: it carries the rewritten identifier.
            $strRecord = $self->getOAIDoc($strRecord,$strSubDirPath);
        }

        # save record
        open(my $fhRecord, '>', $strFileURL)
            or die "Unable to save oai metadata record: $!\n";
        print {$fhRecord} (defined $strRecord ? $strRecord : "");
        close($fhRecord)
            or die "Unable to save oai metadata record: $!\n";

        print STDERR "Saving records to $strFileURL\n";
        print STDERR "<<Done>>\n";
        $intDocCounter ++;
        last if ($intDocCounter >= $intMaxRecords);
    }

    if ($intDocCounter >= $intMaxRecords)
    {
        print STDERR "Reach maximum download records, use -max_records to set the maximum.\n";
    }
    else
    {
        print STDERR "Complete download meta record from $strBasURL\n";
    }

    print STDERR "<<Finished>>\n";
}
335
# Print general information about the repository at $self->{'url'}: first
# the answer to verb=Identify, then the answer to verb=ListSets.  Both
# answers are passed to parse_xml().  When the Identify answer is undefined
# or empty, a notice and the <<Finished>> marker are printed and the sub
# returns early.
sub url_information
{
    my ($self) = shift (@_);
    die "System Error: No \$self defined for url_information in OAIDownload\n"
        unless (defined $self);

    # Command template; _OPTS_ is swapped for the actual query below.
    my $strTemplate = $self->getWgetOptions()." -q -O - \"$self->{'url'}?_OPTS_\"";

    (my $strIdentifyCMD = $strTemplate) =~ s/_OPTS_/verb=Identify/;
    my $strIdentifyText = $self->useWget($strIdentifyCMD);

    unless (defined $strIdentifyText and $strIdentifyText ne "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "General information:\n";
    $self->parse_xml($strIdentifyText);

    (my $strListSetCMD = $strTemplate) =~ s/_OPTS_/verb=ListSets/;
    my $strListSetsText = $self->useWget($strListSetCMD);

    print STDERR "List Information:\n";
    $self->parse_xml($strListSetsText);
}
369
# Write $strOutputText to the scratch file $GSDLHOME/tmp/oai.tmp and run
# $self->{'parser'} over that file.
#
# Parameters:
#   $strOutputText - XML text to parse (undefined is written as empty)
#
# Side effects: records the scratch file name in $self->{'temp_file_name'}.
# Dies when the scratch file cannot be written, or when the parser rejects
# its contents.
sub parse_xml
{
    my ($self) = shift (@_);
    my ($strOutputText) = @_;

    #Open a temporary file to store OAI information, and store the information to the temp file
    my $name = "$ENV{GSDLHOME}/tmp/oai.tmp";

    # Check the write explicitly: otherwise a failure here would only show
    # up later as a misleading "not well formed" parser error.
    open(my $fhTmp, '>', $name)
        or die "OAI: unable to write $name: $!\n";
    print {$fhTmp} (defined $strOutputText ? $strOutputText : "");
    close($fhTmp)
        or die "OAI: unable to write $name: $!\n";

    $self->{'temp_file_name'} = $name;

    eval {
        $self->{'parser'}->parsefile("$name");
    };

    if ($@) {
        die "OAI: $name is not a well formed XML file ($@)\n";
    }
}
393
# At program exit, when the object was run with its 'info' entry set,
# remove the scratch file recorded by parse_xml().  Nothing is done when no
# object was ever created, no scratch file was recorded, or the file is
# already gone; a real failure to delete an existing file is still fatal.
END{
    if (defined $self && $self->{'info'})
    {
        my $strTmpFile = $self->{'temp_file_name'};
        if (defined $strTmpFile && -e $strTmpFile)
        {
            unlink($strTmpFile) or die "Could not unlink $strTmpFile: $!";
        }
    }
}
400
# Character-data callback for the XML parser.  It replaces the Char handler
# of XML::Parser::Stream to avoid a slowdown seen when $expat->{Text} ends
# up being treated as the return value; hence the explicit undef return.
#
# The text is appended to $expat->{'Text'}.  While a tag is open (the
# file-level $self has a non-empty 'subfield'), the text is also added to
# $self->{'text'}, newlines and runs of two or more spaces are removed, and
# a non-empty result is printed to STDERR as " tag:(text)".
sub Char {
    use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
    my ($expat, $strChunk) = @_;

    $expat->{'Text'} .= $strChunk;

    my $strField = $self->{'subfield'};
    if (defined $strField && $strField ne "") {
        $self->{'text'} .= $strChunk;
        $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
        print STDERR " ${strField}:($self->{'text'})\n" if ($self->{'text'} ne "");
    }

    return undef;
}
417
# Start-tag callback for the XML parser: remember the name of the element
# that has just been opened in the file-level $self, so that Char() knows
# which field the following text belongs to.  Attributes are ignored.
sub OAI_StartTag
{
    my ($expat, $strTagName) = @_;

    $self->{'subfield'} = $strTagName;
}
425
# End-tag callback for the XML parser: forget the current field name and
# the text collected for it in the file-level $self.
sub OAI_EndTag
{
    my ($expat, $strTagName) = @_;

    $self->{'subfield'} = "";
    $self->{'text'} = "";
}
432
# Report a fatal error on standard output and terminate the program with
# exit status -1.
#
# Parameters:
#   $strFunctionName - name of the function the error occurred in
#   $strError        - error message
sub error
{
    my ($self,$strFunctionName,$strError) = @_;

    my $strReport = join("",
                         "Error occoured in OAIDownload.pm\n",
                         "In Function:", $strFunctionName, "\n",
                         "Error Message:", $strError, "\n");
    print $strReport;
    exit(-1);
}
443
444
445
4461;
Note: See TracBrowser for help on using the repository browser.