source: trunk/gsdl/perllib/downloaders/OAIDownload.pm@ 12465

Last change on this file since 12465 was 12465, checked in by shaoqun, 18 years ago

fixed the bugs on Windows

  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1###########################################################################
2#
3# OAIDownload.pm -- downloader that harvests records from an OAI repository via wget
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package OAIDownload;
27
# Load the bytes pragma module when this perl provides it; a failure to
# load is deliberately ignored.
eval { require bytes; };

# Process-wide warning filter: the "Subroutine ... redefined" warnings that
# various plugins cause under perl 5.6 are dropped, every other warning is
# passed on unchanged.
$SIG{__WARN__} = sub {
    my ($message) = @_;
    return if $message =~ /Subroutine\s+\S+\sredefined/;
    warn($message);
};
33
34use strict;
35
36use WgetDownload;
37use XMLParser;
38
39use POSIX qw(tmpnam);
40use util;
41
# Set up inheritance at compile time: OAIDownload is a WgetDownload.
BEGIN {
    @OAIDownload::ISA = qw(WgetDownload);
}
45
# Argument descriptions contributed by this downloader.  new() appends them
# to the shared "ArgList" before the parent constructor runs.
my $arguments = [
    # url: string, required
    {
        'name' => "url",
        'disp' => "{OAIDownload.url_disp}",
        'desc' => "{OAIDownload.url}",
        'type' => "string",
        'reqd' => "yes",
    },
    # set: string, optional; appended to the ListIdentifiers request
    {
        'name' => "set",
        'disp' => "{OAIDownload.set_disp}",
        'desc' => "{OAIDownload.set}",
        'type' => "string",
        'reqd' => "no",
    },
    # get_doc: flag, optional; when set the source documents are fetched too
    {
        'name' => "get_doc",
        'disp' => "{OAIDownload.get_doc_disp}",
        'desc' => "{OAIDownload.get_doc}",
        'type' => "flag",
        'reqd' => "no",
    },
    # max_records: int, optional, default 500
    {
        'name'  => "max_records",
        'disp'  => "{OAIDownload.max_records_disp}",
        'desc'  => "{OAIDownload.max_records}",
        'type'  => "int",
        'deft'  => "500",
        'range' => "1,",
        'reqd'  => "no",
    },
];

# Option description for this class; new() appends it to the shared "OptList".
my $options = {
    'name'     => "OAIDownload",
    'desc'     => "{OAIDownload.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};

# The object most recently built by new().  The XML::Parser callbacks
# (Char, OAI_StartTag, OAI_EndTag) and the END block read it from here
# because they are not invoked as methods.
my $self;

# wget option string cached by download() and reused by getOAIRecords() and
# getOAIDoc().
my $strWgetOptions = "";
79
# Constructor.
#   $getlist          - array ref; this class name is appended to it
#   $inputargs        - passed through to the WgetDownload constructor
#   $hashArgOptLists  - hash ref whose "ArgList"/"OptList" entries receive
#                       this class's argument and option descriptions
# The new object is also stored in the file-scoped $self so that the XML
# callbacks can reach it.  Unless 'info_only' is set on the object, an
# XML::Parser wired to those callbacks is stored under 'parser'.
sub new
{
    my $class = shift(@_);
    my ($getlist, $inputargs, $hashArgOptLists) = @_;

    push(@$getlist, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments}) if defined $arguments;
    push(@{ $hashArgOptLists->{"OptList"} }, $options) if defined $options;

    if (defined $hashArgOptLists) {
        $self = WgetDownload->new($getlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = WgetDownload->new($getlist, $inputargs);
    }

    # When only information is requested there is no need for a parser
    # (don't worry about any options etc).
    unless ($self->{'info_only'}) {
        $self->{'parser'} = XML::Parser->new(
            'Style'    => 'Stream',
            'Handlers' => {
                'Char'  => \&Char,
                'Start' => \&OAI_StartTag,
                'End'   => \&OAI_EndTag,
            },
        );
    }

    return bless $self, $class;
}
105
# Harvest the repository configured on this object.
#   $hashGeneralOptions - hash ref; its "cache_dir" entry is the directory
#                         the records are saved under
# Reads 'url', 'max_records' and 'get_doc' from the object, lists the
# record identifiers, fetches the records and finally removes the
# temporary XML file.  The "<<...>>" lines written to STDERR are progress
# markers and are kept exactly as they were.
# Returns 1 on success, 0 when no identifiers could be obtained.
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Cached at file scope because getOAIRecords() and getOAIDoc() build
    # their wget commands from it.
    $strWgetOptions = $self->getWgetOptions();

    my $strOutputDir   = $hashGeneralOptions->{"cache_dir"};
    my $strBasURL      = $self->{'url'};
    my $intMaxRecords  = $self->{'max_records'};
    my $blnDownloadDoc = $self->{'get_doc'};

    print STDERR "<<Defined Maximum>>\n";

    my $strIDs = $self->getOAIIDs($strBasURL);

    # getOAIIDs() returns nothing at all when the server cannot be reached,
    # so test for undef before comparing strings.
    if (!defined $strIDs || $strIDs eq "")
    {
        print STDERR "Error: No ID being found\n";
        return 0;
    }

    my $aryIDs = $self->parseOAIIDs($strIDs);

    # Report how many records will actually be fetched: the number found,
    # capped at max_records.
    my $intFound = scalar(@$aryIDs);
    my $intIDs = ($intMaxRecords < $intFound) ? $intMaxRecords : $intFound;
    print STDERR "<<Total number of record(s):$intIDs>>\n";

    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);

    # Remove the temporary file written while parsing the identifier list.
    my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";
    &util::rm($tmp_file);

    return 1;
}
150
# Ask the repository at $strBasURL for its identifier list
# (verb=ListIdentifiers, metadataPrefix=oai_dc, plus the configured 'set'
# when one is given) and feed the response through parse_xml().
# Returns the raw response text, or nothing when the request produced no
# output.
sub getOAIIDs
{
    my ($self,$strBasURL) = @_;

    my $strCommand = $self->getWgetOptions();

    print STDERR "Gathering OAI identifiers.....\n";

    # Build the query once and add the set restriction only when requested.
    my $strQuery = "verb=ListIdentifiers&metadataPrefix=oai_dc";
    $strQuery .= "&set=$self->{'set'}" unless $self->{'set'} eq "";

    $strCommand .= " -q -O - \"$strBasURL?$strQuery\" ";

    my $strResponse = $self->useWget($strCommand);

    unless (defined $strResponse and $strResponse ne "") {
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "<<Download Information>>\n";

    $self->parse_xml($strResponse);

    return $strResponse;
}
186
# Extract the text of every <identifier>...</identifier> element from the
# given response text, in document order.
# Returns a reference to the (possibly empty) array of identifiers.
sub parseOAIIDs
{
    my ($self,$strIDs) = @_;

    print STDERR "Parsing OAI identifiers.....\n";

    # One global match in list context collects every capture in a single
    # pass over the text.
    my @aryFound = ($strIDs =~ m/<identifier>(.*?)<\/identifier>/sg);

    return \@aryFound;
}
205
# Split a path or URL into its directory part and its final component.
#   $strFile - the path; both "/" and "\" are accepted as separators
# Returns ($strSubDirs, $strLocalFile), where $strSubDirs is the leading
# components joined with "/" (empty when there are none).
sub dirFileSplit
{
    my ($self,$strFile) = @_;

    # A regex literal is required here: written as the string "[/\]" the
    # backslash is consumed by the string parser and the class only
    # contains "/", so Windows-style paths were never split.
    my @aryDirs = split(/[\/\\]/, $strFile);

    my $strLocalFile = pop(@aryDirs);
    my $strSubDirs = join("/",@aryDirs);

    return ($strSubDirs,$strLocalFile);
}
217
# Download the source document a record points at and rewrite the record
# so that it refers to the local copy.
#   $strRecord     - text of the OAI record
#   $strSubDirPath - directory of the record; the document is stored in its
#                    "srcdocs" sub-directory
# The document URL is taken from the first (dc:)identifier element inside
# <metadata>.  After a successful download that element is replaced by an
# <OrigURL> element holding the URL and an <identifier> element holding the
# local "srcdocs/..." path.
# Returns the (possibly rewritten) record.  The rewritten text is also
# stored back into the caller's second argument, so a caller that passes a
# variable and ignores the return value still saves the rewritten record.
# Exits the program when wget reports an error.
sub getOAIDoc
{
    my ($self,$strRecord, $strSubDirPath) = @_;

    print STDERR "Gathering source documents.....\n";

    # look out for identifier tag in metadata section
    my ($strMetaTag) = ($strRecord =~ m/<metadata>(.*)<\/metadata>/s);
    unless (defined $strMetaTag)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n";
        return $strRecord;
    }

    my ($strDocURL) = ($strMetaTag =~ m/<(?:dc:)?identifier>(.*?)<\/(?:dc:)?identifier>/s);
    unless (defined $strDocURL)
    {
        print STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
        return $strRecord;
    }

    my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL);

    my $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs");
    &util::mk_dir($strSoureDirPath) if (!-e "$strSoureDirPath");

    my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile);

    # The target path is quoted like the URL so that a cache directory
    # containing spaces does not break the command line.
    my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strDocURL\"";

    my $strResponse = $self->useWget($wget_cmd,1);

    if (defined $strResponse && $strResponse ne "")
    {
        print STDERR "Error occured while retriving OAI souce documents: $strResponse\n";
        exit(-1);
    }

    # \Q...\E: the URL is literal text, not a pattern; without it a URL
    # containing "?", "+" or similar never matched and nothing was rewritten.
    $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>\Q$strDocURL\E<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$strDocURL<\/OrigURL>\n <identifier>srcdocs\/$strDocFile<\/identifier>$4<\/metadata>/s;

    # Hand the rewritten record back through the argument alias as well.
    # The eval keeps a read-only argument (e.g. a literal) from being fatal.
    {
        local $@;
        eval { $_[1] = $strRecord; };
    }

    return $strRecord;
}
266
# Fetch the records for the given identifiers and save each one as a
# ".oai" file below the output directory.
#   $aryIDs         - array ref of record identifiers
#   $strOutputDir   - target directory (surrounding double quotes are removed)
#   $strBasURL      - base URL of the repository
#   $intMaxRecords  - stop after this many records have been saved
#   $blnDownloadDoc - when true, getOAIDoc() is called for every record
# The file location is derived from the identifier: its colon-separated
# parts become directories and the last part becomes the file name, so
# "a:b" is saved as a/b.oai and "oai:host:id" as oai/host/id.oai.
# Dies when a record file cannot be written.
sub getOAIRecords
{
    my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;

    my $intDocCounter = 0;

    # The directory may arrive wrapped in double quotes; remove them once.
    $strOutputDir =~ s/"//g;

    foreach my $strID ( @$aryIDs)
    {
        print STDERR "Gathering OAI record with ID:$strID.....\n";

        # The identifier comes from the remote server and ends up inside a
        # quoted shell argument and a URL query, so it is percent-encoded.
        my $strEscapedID = _escapeOAIIdentifier($strID);
        my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=oai_dc&identifier=$strEscapedID\"";

        my $strRecord = $self->useWget($cmdWget);

        # Nothing came back: report it and carry on rather than saving an
        # empty record file.
        if (!defined $strRecord || $strRecord eq "")
        {
            print STDERR "Error: no record was returned for ID:$strID, skipping it\n";
            next;
        }

        # setup directories
        # Using only the first two parts of the identifier made every
        # "oai:host:id" record overwrite the same file, so all parts are
        # used: leading parts are directories, the last one is the file.
        my @fileDirs = split(":",$strID);
        my $strLocalID = pop(@fileDirs);
        $strLocalID = "" unless defined $strLocalID;

        my $strFileURL = join("/", $strOutputDir, @fileDirs, "$strLocalID.oai");

        # prepare subdirectory for record (if needed)
        my ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL);

        &util::mk_all_dir($strSubDirPath);

        if($blnDownloadDoc)
        {
            $self->getOAIDoc($strRecord,$strSubDirPath);
        }

        # save record
        open(my $fhRecord, '>', $strFileURL)
            || die "Unable to save oai metadata record: $!\n";
        print $fhRecord $strRecord;
        close($fhRecord)
            || die "Unable to save oai metadata record: $!\n";

        print STDERR "Saving records to $strFileURL\n";
        print STDERR "<<Done>>\n";
        $intDocCounter ++;
        last if ($intDocCounter >= $intMaxRecords);
    }

    if ($intDocCounter >= $intMaxRecords)
    {
        print STDERR "Reach maximum download records, use -max_records to set the maximum.\n";
    }
    else
    {
        print STDERR "Complete download meta record from $strBasURL\n";
    }

    print STDERR "<<Finished>>\n";
}

# Percent-encode every byte of an identifier except letters, digits and
# "-._~:/" so that it can be placed in the request URL.
sub _escapeOAIIdentifier
{
    my ($strID) = @_;

    return "" unless defined $strID;

    # Work on bytes so that every escape is a two-digit hex value.
    utf8::encode($strID) if utf8::is_utf8($strID);
    $strID =~ s/([^A-Za-z0-9\-._~:\/])/sprintf("%%%02X", ord($1))/ge;

    return $strID;
}
322
# Print general information about the repository configured in 'url':
# the Identify response followed by the ListSets response, each run through
# parse_xml().  Nothing is parsed for a request that returned no output.
# Dies when called without an object.
sub url_information
{
    my ($self) = shift (@_);
    if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";}

    my $wgetOptions = $self->getWgetOptions();
    my $strURL = $self->{'url'};

    # Build each command directly; substituting into a placeholder could
    # hit the same text inside the wget options or the URL.
    my $strIdentifyCMD = $wgetOptions." -q -O - \"$strURL?verb=Identify\"";
    my $strListSetCMD  = $wgetOptions." -q -O - \"$strURL?verb=ListSets\"";

    my $strIdentifyText = $self->useWget($strIdentifyCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq "" ){
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    print STDERR "General information:\n";
    $self->parse_xml($strIdentifyText);

    my $strListSetsText = $self->useWget($strListSetCMD);

    # An empty answer is not XML; parsing it would abort with a misleading
    # "not well formed" error after the general information was printed.
    if (!defined $strListSetsText or $strListSetsText eq "" ){
        print STDERR "Set information is unavailable.\n";
        return;
    }

    print STDERR "List Information:\n";
    $self->parse_xml($strListSetsText);
}
356
# Write the given XML text to the temporary file
# $ENV{GSDLHOME}/tmp/oai.tmp, remember that path in 'temp_file_name' and
# run the object's parser over the file.
#   $strOutputText - the XML text to parse
# Dies when the temporary file cannot be written or when the parser
# rejects the file.
sub parse_xml
{
    my ($self) = shift (@_);
    my ($strOutputText) = @_;

    #Open a temporary file to store OAI information, and store the information to the temp file
    my $name = "$ENV{GSDLHOME}/tmp/oai.tmp";

    # Check the write: if it fails silently the parser would read a stale
    # or missing file and report a misleading error.
    open(my $fhTemp, '>', $name)
        or die "OAI: unable to write temporary file $name: $!\n";
    print $fhTemp $strOutputText;
    close($fhTemp)
        or die "OAI: unable to write temporary file $name: $!\n";

    $self->{'temp_file_name'} = $name;

    eval {
        $self->{'parser'}->parsefile("$name");
    };

    if ($@) {
        die "OAI: $name is not a well formed XML file ($@)\n";
    }
}
380
# At program exit, remove the temporary XML file when the object was
# created with 'info' set.  Nothing is done when no object exists, when no
# temporary file was ever recorded, or when the file is already gone, so a
# failed information request no longer ends with a spurious unlink error.
END{
    if(defined $self && $self->{'info'})
    {
        my $strTempFile = $self->{'temp_file_name'};
        if(defined $strTempFile && -e $strTempFile)
        {
            unlink($strTempFile) or die "Could not unlink $strTempFile: $!";
        }
    }
}
387
# Character-data handler; replaces the one in XML::Parser::Stream to avoid
# a problem where $expat->{Text} is treated as the return value, which
# slows things down significantly in some cases.
# The data is appended to the parser's 'Text' buffer.  While an element is
# open (a non-empty 'subfield' on the file-scoped $self) it is also
# appended to 'text', newlines and runs of two or more spaces are removed,
# and a non-empty result is printed to STDERR as "subfield:(text)".
sub Char {
    use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
    my ($expat, $strData) = @_;

    $expat->{'Text'} .= $strData;

    my $strField = $self->{'subfield'};
    if (defined $strField && $strField ne "") {
        $self->{'text'} .= $strData;
        $self->{'text'} =~ s/[\n]|([ ]{2,})//g;
        print STDERR " $strField:($self->{'text'})\n" if $self->{'text'} ne "";
    }

    return undef;
}
404
# Start-tag handler: remember the name of the element that was just opened
# in 'subfield' on the file-scoped $self, where Char() picks it up.  The
# parser object and the element's attributes are not used.
sub OAI_StartTag
{
    my (undef, $strElement) = @_;

    $self->{'subfield'} = $strElement;
}
412
# End-tag handler: the element is closed, so clear the text collected for
# it and the current element name on the file-scoped $self.  The handler's
# own arguments are not used.
sub OAI_EndTag
{
    $self->{'text'}     = "";
    $self->{'subfield'} = "";
}
419
# Report a fatal error on standard output and terminate the program.
#   $strFunctionName - name of the function the error occurred in
#   $strError        - the error message
# Never returns: the program exits with status -1.
sub error
{
    my ($self,$strFunctionName,$strError) = @_;

    my $strReport = "Error occoured in OAIDownload.pm\n";
    $strReport .= "In Function:" . $strFunctionName . "\n";
    $strReport .= "Error Message:" . $strError . "\n";

    print $strReport;
    exit(-1);
}
430
431
432
4331;
Note: See TracBrowser for help on using the repository browser.