source: main/trunk/greenstone2/perllib/downloaders/Z3950Download.pm@ 24379

Last change on this file since 24379 was 24379, checked in by ak19, 13 years ago

Fixed several bugs in Z3950 download. The main bug had to do with how the Library of Congress is returning records in XML format, instead of whichever format other sources have been returning and for which the code so far was set up to work with. Now it works with the XML returned, however, the MARC record returned (which is in XML) does not explode properly. Other bugs have to do with the Server Information button in GLI failing for Z3950 and an ugly 2-level nested folder structure being created to store the files containing records returned: we don't want a folder called http.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package Z3950Download;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use BaseDownload;
37use IPC::Open2;
38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
39
40sub BEGIN {
41 @Z3950Download::ISA = ('BaseDownload');
42}
43
44my $arguments =
45 [ { 'name' => "host",
46 'disp' => "{Z3950Download.host_disp}",
47 'desc' => "{Z3950Download.host}",
48 'type' => "string",
49 'reqd' => "yes"},
50 { 'name' => "port",
51 'disp' => "{Z3950Download.port_disp}",
52 'desc' => "{Z3950Download.port}",
53 'type' => "string",
54 'reqd' => "yes"},
55 { 'name' => "database",
56 'disp' => "{Z3950Download.database_disp}",
57 'desc' => "{Z3950Download.database}",
58 'type' => "string",
59 'reqd' => "yes"},
60 { 'name' => "find",
61 'disp' => "{Z3950Download.find_disp}",
62 'desc' => "{Z3950Download.find}",
63 'type' => "string",
64 'deft' => "",
65 'reqd' => "yes"},
66 { 'name' => "max_records",
67 'disp' => "{Z3950Download.max_records_disp}",
68 'desc' => "{Z3950Download.max_records}",
69 'type' => "int",
70 'deft' => "500",
71 'reqd' => "no"}];
72
73my $options = { 'name' => "Z3950Download",
74 'desc' => "{Z3950Download.desc}",
75 'abstract' => "no",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80sub new
81{
82 my ($class) = shift (@_);
83 my ($getlist,$inputargs,$hashArgOptLists) = @_;
84 push(@$getlist, $class);
85
86 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
87 push(@{$hashArgOptLists->{"OptList"}},$options);
88
89 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
90
91 if ($self->{'info_only'}) {
92 # don't worry about any options etc
93 return bless $self, $class;
94 }
95
96 # Must set $self->{'url'}, since GLI use $self->{'url'} to calculate the log file name!
97 $self->{'url'} = $self->{'host'}.":".$self->{'port'};
98
99 $self->{'yaz'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "yaz-client");
100
101 return bless $self, $class;
102
103}
104
105sub download
106{
107 my ($self) = shift (@_);
108 my ($hashGeneralOptions) = @_;
109 my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords);
110
111 my $url = $self->{'url'};
112
113 print STDERR "<<Defined Maximum>>\n";
114
115 $strOpen = $self->start_yaz($url);
116
117 print STDERR "Access database: \"$self->{'database'}\"\n";
118 $self->run_command_without_output("base $self->{'database'}");
119 print STDERR "Searching for keyword: \"$self->{'find'}\"\n";
120 $intAmount = $self->findAmount($self->{'find'});
121
122 if($intAmount <= 0)
123 {
124 ($intAmount == -1)?
125 print STDERR "Something wrong with the arguments,downloading can not be performed\n":
126 print STDERR "No Record is found\n";
127 print STDERR "<<Finished>>\n";
128 return 0;
129 }
130 $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'};
131 print STDERR "<<Total number of record(s):$intMaxRecords>>\n";
132 $strRecords = "Records: $intMaxRecords\n".$self->getRecords($intMaxRecords);
133
134 $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords);
135 print STDERR "Closing connection...\n";
136
137 $self->quit_yaz();
138 return 1;
139}
140
141
142sub start_yaz
143{
144 my ($self, $url) = @_;
145
146 print STDERR "Opening connection to $url\n";
147
148 my $yaz = $self->{'yaz'};
149
150 my $childpid = open2(*YAZOUT, *YAZIN, $yaz)
151 or (print STDERR "<<Finished>>\n" and die "can't open pipe to yaz-client: $!");
152 $self->{'pid'} = $childpid;
153 $self->{'YAZOUT'} = *YAZOUT;
154 $self->{'YAZIN'} = *YAZIN;
155
156 my $strOpen = $self->open_connection("open $url");
157
158 if (!$strOpen) {
159 print STDERR "Cannot connect to $url\n";
160 print STDERR "<<Finished>>\n";
161 return 0;
162 }
163 return $strOpen;
164}
165
166sub quit_yaz
167{
168 my ($self) = shift (@_);
169
170 # can't send a "close" cmd to close the database here, since the close command is only
171 # recognised by Z3950, not by SRW. (This method is also used by the subclass for SRW.)
172
173 print STDERR "<<Finished>>\n";
174
175 # need to send the quit command, else yaz-client is still running in the background
176 $self->run_command_without_output("quit");
177 close($self->{'YAZIN'}); # close the input to yaz. It also flushes quit command to yaz.
178
179 # make sure nothing is being output by yaz
180 # flush the yaz-client process' outputstream, else we'll be stuck in an infinite
181 # loop waiting for the process to quit.
182 my $output = $self->{'YAZOUT'};
183 my $line;
184 while (defined ($line = <$output>)) {
185 if($line !~ m/\w/s) { # print anything other than plain whitespace in case it is important
186 print STDERR "***### $line";
187 }
188 }
189
190 close($self->{'YAZOUT'});
191
192 # Is the following necessary? The PerlDoc on open2 (http://perldoc.perl.org/IPC/Open2.html)
193 # says that waitpid must be called to "reap the child process", or otherwise it will hang
194 # around like a zombie process in the background. Adding it here makes the code work as
195 # before, but it is certainly necessary to call waitpid on wget (see WgetDownload.pm).
196 # http://perldoc.perl.org/functions/waitpid.html
197 my $kidpid;
198 do {
199 $kidpid = waitpid($self->{'pid'}, WNOHANG);
200 } while $kidpid > 0; # waiting for pid to become -1
201}
202
203sub open_connection{
204 my ($self,$strCommand) = (@_);
205
206 $self->run_command($strCommand);
207
208 my $out = $self->{'YAZOUT'};
209
210 my $opening_line = <$out>;
211
212 return ($opening_line =~ m/Connecting...OK/i)? 1: 0;
213
214}
215
216sub findAmount
217{
218 my ($self) = shift (@_);
219 my($strFindTarget) = @_;
220 my $strResponse = $self->run_command_with_output("find $strFindTarget","^Number of hits:");
221 return ($strResponse =~ m/^Number of hits: (\d+)/m)? $1:-1;
222}
223
224sub getRecords
225{
226 my ($self) = shift (@_);
227 my ($intMaxRecords) = @_;
228 my ($strShow,$intStartNumber,$numRecords,$strResponse,$strRecords,$intRecordsLeft);
229
230 $intStartNumber = 1;
231 $intRecordsLeft = $intMaxRecords;
232 $numRecords = 0;
233 $strResponse ="";
234
235 while ($intRecordsLeft > 0)
236 {
237 if($intRecordsLeft > 50)
238 {
239
240 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+49)."\n";
241 $numRecords = 50;
242 $strShow = "show $intStartNumber+50";
243 $intStartNumber = $intStartNumber + 50;
244 $intRecordsLeft = $intRecordsLeft - 50;
245
246 }
247 else
248 {
249 $numRecords = $intRecordsLeft;
250 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+$intRecordsLeft-1)."\n";
251 $strShow = "show $intStartNumber+$intRecordsLeft";
252 $intRecordsLeft = 0;
253
254 }
255
256 $strResponse .= $self->get($strShow,$numRecords);
257
258 if ($strResponse eq ""){
259 print STDERR "<<ERROR: failed to get $numRecords records>>\n";
260 }
261 else{
262 print STDERR "<<Done:$numRecords>>\n";
263 }
264 }
265
266 return "$strResponse\n";
267
268}
269
270sub saveRecords
271{
272 my ($self,$strRecords,$strOutputDir,$intMaxRecords) = @_;
273
274 # setup directories
275 # Currently only gather the MARC format
276 my $strFileName = $self->generateFileName($intMaxRecords);
277
278 $strOutputDir =~ s/"//g; #"
279
280 # remove any http:// prefix from the hostname to generate folder-name to store records in
281 my $foldername = $self->{'host'};
282 $foldername =~ s@^http:\/\/(.*)@$1@;
283
284 my $strOutputFile = &util::filename_cat($strOutputDir,$foldername,"$strFileName.marc");
285 # prepare subdirectory for record (if needed)
286 my ($strSubDirPath,$unused) = $self->dirFileSplit($strOutputFile);
287
288 &util::mk_all_dir($strSubDirPath);
289
290 print STDERR "Saving records to \"$strOutputFile\"\n";
291
292 # save record
293 open (ZOUT,">$strOutputFile")
294 || die "Unable to save Z3950 record: $!\n";
295 print ZOUT $strRecords;
296 close(ZOUT);
297}
298
299
300sub run_command_with_output
301{
302 my ($self,$strCMD,$strStopRE) =@_;
303
304 $self->run_command($strCMD);
305
306 return $self->get_output($strStopRE);
307
308}
309
310sub get{
311 my ($self,$strShow,$numRecord) = @_;
312
313 $self->run_command($strShow);
314
315 my $strFullOutput="";
316 my $count=0;
317 my $readRecord = 0;
318
319 my $output = $self->{'YAZOUT'};
320 while (my $strLine = <$output>)
321 {
322 if ($strLine =~ m/Records: ([\d]*)/i ){
323 $readRecord = 1;
324 next;
325 } elsif ($strLine =~ m/<record/i){ # XML output such as from Library of Congress
326 $strFullOutput .= $strLine;
327 $readRecord = 1;
328 next;
329 }
330
331 if ($strLine =~ m/\<\/record\>/i) { # end of XML, include the closing tag and then terminate below
332 $strFullOutput .= $strLine;
333 }
334
335 return $strFullOutput if ($strLine =~ m/nextResultSetPosition|Not connected|\<\/record\>/i);
336
337 next if(!$readRecord);
338
339 $strFullOutput .= $strLine;
340 }
341
342}
343
344sub run_command_without_output
345{
346 my ($self) = shift (@_);
347 my ($strCMD) = @_;
348
349 $self->run_command($strCMD);
350}
351
352sub run_command
353{
354 my ($self,$strCMD) = @_;
355
356 my $input = $self->{'YAZIN'};
357
358 print $input "$strCMD\n";
359}
360
361sub get_output{
362 my ($self,$strStopRE) = @_;
363
364 if (!defined $strStopRE){return "";}
365 else
366 {
367 my $strFullOutput;
368 my $output = $self->{'YAZOUT'};
369 while (my $strLine = <$output>)
370 {
371 $strFullOutput .= $strLine;
372 if($strLine =~ m/^$strStopRE|Not connected/i){return $strFullOutput;}
373 }
374 }
375}
376
377sub generateFileName
378{
379 my ($self,$intMaxRecords) = @_;
380 my $strFileName = ($self->{'database'})."_".($self->{'find'})."_".($intMaxRecords);
381
382}
383
384sub dirFileSplit
385{
386 my ($self,$strFile) = @_;
387
388 my @aryDirs = split(/[\/\\]/,$strFile);
389
390 my $strLocalFile = pop(@aryDirs);
391 my $strSubDirs = join("/",@aryDirs);
392
393 return ($strSubDirs,$strLocalFile);
394}
395
396sub url_information
397{
398 my ($self,$url) = @_;
399
400 $url = $self->{'url'} unless defined $url;
401
402 my $strOpen = $self->start_yaz($url);
403
404 $strOpen = $self->run_command_with_output("open $url","^Options|Connecting...OK.");
405
406 $strOpen =~ s/Z> //g;
407 $strOpen =~ s/Elapsed:.*//g;
408
409 print STDERR $strOpen;
410
411 $self->quit_yaz();
412
413 return 0;
414
415}
416
417sub error
418{
419 my ($self,$strFunctionName,$strError) = @_;
420 {
421 print STDERR "Error occoured in Z3950Download.pm\n".
422 "In Function:".$strFunctionName."\n".
423 "Error Message:".$strError."\n";
424 exit(-1);
425 }
426}
427
4281;
Note: See TracBrowser for help on using the repository browser.