source: main/trunk/greenstone2/perllib/downloaders/Z3950Download.pm@ 20999

Last change on this file since 20999 was 17284, checked in by ak19, 16 years ago

The PerlDoc seems to indicate that it is necessary to call waitpid after launching a childprocess with open2(), in that otherwise we might be stuck with zombie child processes. In any case, this was necessary in a standalone wget script that used open2(), and the addition of new code to do this in Z3940Download.pm seems to make the code work as before.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package Z3950Download;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use BaseDownload;
37use IPC::Open2;
38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
39
40sub BEGIN {
41 @Z3950Download::ISA = ('BaseDownload');
42}
43
44my $arguments =
45 [ { 'name' => "host",
46 'disp' => "{Z3950Download.host_disp}",
47 'desc' => "{Z3950Download.host}",
48 'type' => "string",
49 'reqd' => "yes"},
50 { 'name' => "port",
51 'disp' => "{Z3950Download.port_disp}",
52 'desc' => "{Z3950Download.port}",
53 'type' => "string",
54 'reqd' => "yes"},
55 { 'name' => "database",
56 'disp' => "{Z3950Download.database_disp}",
57 'desc' => "{Z3950Download.database}",
58 'type' => "string",
59 'reqd' => "yes"},
60 { 'name' => "find",
61 'disp' => "{Z3950Download.find_disp}",
62 'desc' => "{Z3950Download.find}",
63 'type' => "string",
64 'deft' => "",
65 'reqd' => "yes"},
66 { 'name' => "max_records",
67 'disp' => "{Z3950Download.max_records_disp}",
68 'desc' => "{Z3950Download.max_records}",
69 'type' => "int",
70 'deft' => "500",
71 'reqd' => "no"}];
72
73my $options = { 'name' => "Z3950Download",
74 'desc' => "{Z3950Download.desc}",
75 'abstract' => "no",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80sub new
81{
82 my ($class) = shift (@_);
83 my ($getlist,$inputargs,$hashArgOptLists) = @_;
84 push(@$getlist, $class);
85
86 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
87 push(@{$hashArgOptLists->{"OptList"}},$options);
88
89 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
90
91 if ($self->{'info_only'}) {
92 # don't worry about any options etc
93 return bless $self, $class;
94 }
95
96 # Must set $self->{'url'}, since GLI use $self->{'url'} to calculate the log file name!
97 $self->{'url'} = $self->{'host'}.":".$self->{'port'};
98
99 $self->{'yaz'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "yaz-client");
100
101 return bless $self, $class;
102
103}
104
105sub download
106{
107 my ($self) = shift (@_);
108 my ($hashGeneralOptions) = @_;
109 my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords);
110
111 my $url = $self->{'url'};
112
113 print STDERR "<<Defined Maximum>>\n";
114
115 $strOpen = $self->start_yaz($url);
116
117 print STDERR "Access database: \"$self->{'database'}\"\n";
118 $self->run_command_without_output("base $self->{'database'}");
119 print STDERR "Searching for keyword: \"$self->{'find'}\"\n";
120 $intAmount = $self->findAmount($self->{'find'});
121
122 if($intAmount <= 0)
123 {
124 ($intAmount == -1)?
125 print STDERR "Something wrong with the arguments,downloading can not be performed\n":
126 print STDERR "No Record is found\n";
127 print STDERR "<<Finished>>\n";
128 return 0;
129 }
130 $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'};
131 print STDERR "<<Total number of record(s):$intMaxRecords>>\n";
132 $strRecords = "Records: $intMaxRecords\n".$self->getRecords($intMaxRecords);
133
134 $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords);
135 print STDERR "Closing connection...\n";
136
137 $self->quit_yaz();
138 return 1;
139}
140
141
142sub start_yaz
143{
144 my ($self, $url) = @_;
145
146 print STDERR "Opening connection to $url\n";
147
148 my $yaz = $self->{'yaz'};
149
150 my $childpid = open2(*YAZOUT, *YAZIN, $yaz)
151 or (print STDERR "<<Finished>>\n" and die "can't open pipe to yaz-client: $!");
152 $self->{'pid'} = $childpid;
153 $self->{'YAZOUT'} = *YAZOUT;
154 $self->{'YAZIN'} = *YAZIN;
155
156 my $strOpen = $self->open_connection("open $url");
157
158 if (!$strOpen) {
159 print STDERR "Cannot connect to $url\n";
160 print STDERR "<<Finished>>\n";
161 return 0;
162 }
163 return $strOpen;
164}
165
166sub quit_yaz
167{
168 my ($self) = shift (@_);
169
170 # can't send a "close" cmd to close the database here, since the close command is only
171 # recognised by Z3950, not by SRW. (This method is also used by the subclass for SRW.)
172
173 print STDERR "<<Finished>>\n";
174
175 # need to send the quit command, else yaz-client is still running in the background
176 $self->run_command_without_output("quit");
177 close($self->{'YAZIN'}); # close the input to yaz. It also flushes quit command to yaz.
178
179 # make sure nothing is being output by yaz
180 # flush the yaz-client process' outputstream, else we'll be stuck in an infinite
181 # loop waiting for the process to quit.
182 my $output = $self->{'YAZOUT'};
183 my $line;
184 while (defined ($line = <$output>)) {
185 if($line !~ m/\w/s) { # print anything other than plain whitespace in case it is important
186 print STDERR "***### $line";
187 }
188 }
189
190 close($self->{'YAZOUT'});
191
192 # Is the following necessary? The PerlDoc on open2 (http://perldoc.perl.org/IPC/Open2.html)
193 # says that waitpid must be called to "reap the child process", or otherwise it will hang
194 # around like a zombie process in the background. Adding it here makes the code work as
195 # before, but it is certainly necessary to call waitpid on wget (see WgetDownload.pm).
196 # http://perldoc.perl.org/functions/waitpid.html
197 my $kidpid;
198 do {
199 $kidpid = waitpid($self->{'pid'}, WNOHANG);
200 } while $kidpid > 0; # waiting for pid to become -1
201}
202
203sub open_connection{
204 my ($self,$strCommand) = (@_);
205
206 $self->run_command($strCommand);
207
208 my $out = $self->{'YAZOUT'};
209
210 my $opening_line = <$out>;
211
212 return ($opening_line =~ m/Connecting...OK/i)? 1: 0;
213
214}
215
216sub findAmount
217{
218 my ($self) = shift (@_);
219 my($strFindTarget) = @_;
220 my $strResponse = $self->run_command_with_output("find $strFindTarget","^Number of hits:");
221 return ($strResponse =~ m/^Number of hits: (\d+)/m)? $1:-1;
222}
223
224sub getRecords
225{
226 my ($self) = shift (@_);
227 my ($intMaxRecords) = @_;
228 my ($strShow,$intStartNumber,$numRecords,$strResponse,$strRecords,$intRecordsLeft);
229
230 $intStartNumber = 1;
231 $intRecordsLeft = $intMaxRecords;
232 $numRecords = 0;
233 $strResponse ="";
234
235 while ($intRecordsLeft > 0)
236 {
237 if($intRecordsLeft > 50)
238 {
239
240 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+49)."\n";
241 $numRecords = 50;
242 $strShow = "show $intStartNumber+50";
243 $intStartNumber = $intStartNumber + 50;
244 $intRecordsLeft = $intRecordsLeft - 50;
245
246 }
247 else
248 {
249 $numRecords = $intRecordsLeft;
250 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+$intRecordsLeft-1)."\n";
251 $strShow = "show $intStartNumber+$intRecordsLeft";
252 $intRecordsLeft = 0;
253
254 }
255
256 $strResponse .= $self->get($strShow,$numRecords);
257
258 if ($strResponse eq ""){
259 print STDERR "<<ERROR: failed to get $numRecords records>>\n";
260 }
261 else{
262 print STDERR "<<Done:$numRecords>>\n";
263 }
264 }
265
266 return "$strResponse\n";
267
268}
269
270sub saveRecords
271{
272 my ($self,$strRecords,$strOutputDir,$intMaxRecords) = @_;
273
274 # setup directories
275 # Currently only gather the MARC format
276 my $strFileName = $self->generateFileName($intMaxRecords);
277
278 $strOutputDir =~ s/"//g; #"
279
280 my $strOutputFile = &util::filename_cat($strOutputDir,$self->{'host'},"$strFileName.marc");
281 # prepare subdirectory for record (if needed)
282 my ($strSubDirPath,$unused) = $self->dirFileSplit($strOutputFile);
283
284 &util::mk_all_dir($strSubDirPath);
285
286 print STDERR "Saving records to \"$strOutputFile\"\n";
287
288 # save record
289 open (ZOUT,">$strOutputFile")
290 || die "Unable to save Z3950 record: $!\n";
291 print ZOUT $strRecords;
292 close(ZOUT);
293}
294
295
296sub run_command_with_output
297{
298 my ($self,$strCMD,$strStopRE) =@_;
299
300 $self->run_command($strCMD);
301
302 return $self->get_output($strStopRE);
303
304}
305
306sub get{
307 my ($self,$strShow,$numRecord) = @_;
308
309 $self->run_command($strShow);
310
311 my $strFullOutput="";
312 my $count=0;
313 my $readRecord = 0;
314
315 my $output = $self->{'YAZOUT'};
316 while (my $strLine = <$output>)
317 {
318
319 if ($strLine =~ m/Records: ([\d]*)/i ){
320 $readRecord = 1;
321 next;
322 }
323
324 return $strFullOutput if ($strLine =~ m/nextResultSetPosition|Not connected/i);
325
326 next if(!$readRecord);
327
328 $strFullOutput .= $strLine;
329 }
330
331}
332
333sub run_command_without_output
334{
335 my ($self) = shift (@_);
336 my ($strCMD) = @_;
337
338 $self->run_command($strCMD);
339}
340
341sub run_command
342{
343 my ($self,$strCMD) = @_;
344
345 my $input = $self->{'YAZIN'};
346
347 print $input "$strCMD\n";
348}
349
350sub get_output{
351 my ($self,$strStopRE) = @_;
352
353 if (!defined $strStopRE){return "";}
354 else
355 {
356 my $strFullOutput;
357 my $output = $self->{'YAZOUT'};
358 while (my $strLine = <$output>)
359 {
360 $strFullOutput .= $strLine;
361 if($strLine =~ m/^$strStopRE|Not connected/i){return $strFullOutput;}
362 }
363 }
364}
365
366sub generateFileName
367{
368 my ($self,$intMaxRecords) = @_;
369 my $strFileName = ($self->{'database'})."_".($self->{'find'})."_".($intMaxRecords);
370
371}
372
373sub dirFileSplit
374{
375 my ($self,$strFile) = @_;
376
377 my @aryDirs = split(/[\/\\]/,$strFile);
378
379 my $strLocalFile = pop(@aryDirs);
380 my $strSubDirs = join("/",@aryDirs);
381
382 return ($strSubDirs,$strLocalFile);
383}
384
385sub url_information
386{
387 my ($self,$url) = @_;
388
389 $url = $self->{'url'} unless defined $url;
390
391 my $strOpen = $self->start_yaz();
392
393 $strOpen = $self->run_command_with_output("open $url","^Options");
394
395 $strOpen =~ s/Z> //g;
396 $strOpen =~ s/Elapsed:.*//g;
397
398 print STDERR $strOpen;
399
400 $self->quit_yaz();
401
402 return 0;
403
404}
405
406sub error
407{
408 my ($self,$strFunctionName,$strError) = @_;
409 {
410 print STDERR "Error occoured in Z3950Download.pm\n".
411 "In Function:".$strFunctionName."\n".
412 "Error Message:".$strError."\n";
413 exit(-1);
414 }
415}
416
4171;
Note: See TracBrowser for help on using the repository browser.