source: gsdl/trunk/perllib/downloaders/Z3950Download.pm@ 17284

Last change on this file since 17284 was 17284, checked in by ak19, 16 years ago

The PerlDoc seems to indicate that it is necessary to call waitpid after launching a childprocess with open2(), in that otherwise we might be stuck with zombie child processes. In any case, this was necessary in a standalone wget script that used open2(), and the addition of new code to do this in Z3940Download.pm seems to make the code work as before.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
RevLine 
[11783]1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package Z3950Download;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
[17207]36use BaseDownload;
[11783]37use IPC::Open2;
[17284]38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
[11783]39
40sub BEGIN {
[17207]41 @Z3950Download::ISA = ('BaseDownload');
[11783]42}
43
44my $arguments =
45 [ { 'name' => "host",
46 'disp' => "{Z3950Download.host_disp}",
47 'desc' => "{Z3950Download.host}",
48 'type' => "string",
49 'reqd' => "yes"},
50 { 'name' => "port",
51 'disp' => "{Z3950Download.port_disp}",
52 'desc' => "{Z3950Download.port}",
53 'type' => "string",
54 'reqd' => "yes"},
55 { 'name' => "database",
56 'disp' => "{Z3950Download.database_disp}",
57 'desc' => "{Z3950Download.database}",
58 'type' => "string",
59 'reqd' => "yes"},
60 { 'name' => "find",
61 'disp' => "{Z3950Download.find_disp}",
62 'desc' => "{Z3950Download.find}",
63 'type' => "string",
64 'deft' => "",
65 'reqd' => "yes"},
66 { 'name' => "max_records",
67 'disp' => "{Z3950Download.max_records_disp}",
68 'desc' => "{Z3950Download.max_records}",
69 'type' => "int",
70 'deft' => "500",
71 'reqd' => "no"}];
72
73my $options = { 'name' => "Z3950Download",
74 'desc' => "{Z3950Download.desc}",
75 'abstract' => "no",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80sub new
81{
82 my ($class) = shift (@_);
83 my ($getlist,$inputargs,$hashArgOptLists) = @_;
84 push(@$getlist, $class);
85
[17207]86 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
87 push(@{$hashArgOptLists->{"OptList"}},$options);
[11783]88
[17207]89 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
[11783]90
91 if ($self->{'info_only'}) {
92 # don't worry about any options etc
93 return bless $self, $class;
94 }
95
96 # Must set $self->{'url'}, since GLI use $self->{'url'} to calculate the log file name!
97 $self->{'url'} = $self->{'host'}.":".$self->{'port'};
[12465]98
[13019]99 $self->{'yaz'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "yaz-client");
[12465]100
[11783]101 return bless $self, $class;
102
103}
104
105sub download
106{
107 my ($self) = shift (@_);
108 my ($hashGeneralOptions) = @_;
109 my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords);
[12465]110
111 my $url = $self->{'url'};
112
[11783]113 print STDERR "<<Defined Maximum>>\n";
114
[17229]115 $strOpen = $self->start_yaz($url);
[12465]116
[11783]117 print STDERR "Access database: \"$self->{'database'}\"\n";
[12465]118 $self->run_command_without_output("base $self->{'database'}");
[11783]119 print STDERR "Searching for keyword: \"$self->{'find'}\"\n";
[12465]120 $intAmount = $self->findAmount($self->{'find'});
[11783]121
122 if($intAmount <= 0)
123 {
124 ($intAmount == -1)?
[12465]125 print STDERR "Something wrong with the arguments,downloading can not be performed\n":
126 print STDERR "No Record is found\n";
127 print STDERR "<<Finished>>\n";
[11783]128 return 0;
129 }
130 $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'};
131 print STDERR "<<Total number of record(s):$intMaxRecords>>\n";
[12465]132 $strRecords = "Records: $intMaxRecords\n".$self->getRecords($intMaxRecords);
133
134 $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords);
[11783]135 print STDERR "Closing connection...\n";
[17229]136
137 $self->quit_yaz();
138 return 1;
139}
140
141
142sub start_yaz
143{
144 my ($self, $url) = @_;
145
146 print STDERR "Opening connection to $url\n";
147
148 my $yaz = $self->{'yaz'};
149
150 my $childpid = open2(*YAZOUT, *YAZIN, $yaz)
151 or (print STDERR "<<Finished>>\n" and die "can't open pipe to yaz-client: $!");
[17284]152 $self->{'pid'} = $childpid;
[17229]153 $self->{'YAZOUT'} = *YAZOUT;
154 $self->{'YAZIN'} = *YAZIN;
155
156 my $strOpen = $self->open_connection("open $url");
157
158 if (!$strOpen) {
159 print STDERR "Cannot connect to $url\n";
160 print STDERR "<<Finished>>\n";
161 return 0;
162 }
163 return $strOpen;
164}
165
166sub quit_yaz
167{
168 my ($self) = shift (@_);
169
[17232]170 # can't send a "close" cmd to close the database here, since the close command is only
171 # recognised by Z3950, not by SRW. (This method is also used by the subclass for SRW.)
172
[12465]173 print STDERR "<<Finished>>\n";
174
[17220]175 # need to send the quit command, else yaz-client is still running in the background
[17218]176 $self->run_command_without_output("quit");
[17229]177 close($self->{'YAZIN'}); # close the input to yaz. It also flushes quit command to yaz.
178
179 # make sure nothing is being output by yaz
[17231]180 # flush the yaz-client process' outputstream, else we'll be stuck in an infinite
181 # loop waiting for the process to quit.
[17229]182 my $output = $self->{'YAZOUT'};
183 my $line;
[17231]184 while (defined ($line = <$output>)) {
185 if($line !~ m/\w/s) { # print anything other than plain whitespace in case it is important
186 print STDERR "***### $line";
187 }
[17229]188 }
189
190 close($self->{'YAZOUT'});
[17284]191
192 # Is the following necessary? The PerlDoc on open2 (http://perldoc.perl.org/IPC/Open2.html)
193 # says that waitpid must be called to "reap the child process", or otherwise it will hang
194 # around like a zombie process in the background. Adding it here makes the code work as
195 # before, but it is certainly necessary to call waitpid on wget (see WgetDownload.pm).
196 # http://perldoc.perl.org/functions/waitpid.html
197 my $kidpid;
198 do {
199 $kidpid = waitpid($self->{'pid'}, WNOHANG);
200 } while $kidpid > 0; # waiting for pid to become -1
[11783]201}
202
[12465]203sub open_connection{
204 my ($self,$strCommand) = (@_);
205
206 $self->run_command($strCommand);
207
208 my $out = $self->{'YAZOUT'};
209
[17229]210 my $opening_line = <$out>;
[12465]211
[17229]212 return ($opening_line =~ m/Connecting...OK/i)? 1: 0;
[12465]213
214}
215
[11783]216sub findAmount
217{
[12465]218 my ($self) = shift (@_);
[11783]219 my($strFindTarget) = @_;
[12465]220 my $strResponse = $self->run_command_with_output("find $strFindTarget","^Number of hits:");
221 return ($strResponse =~ m/^Number of hits: (\d+)/m)? $1:-1;
[11783]222}
223
224sub getRecords
225{
[12465]226 my ($self) = shift (@_);
[11783]227 my ($intMaxRecords) = @_;
[12465]228 my ($strShow,$intStartNumber,$numRecords,$strResponse,$strRecords,$intRecordsLeft);
[11783]229
230 $intStartNumber = 1;
231 $intRecordsLeft = $intMaxRecords;
[12465]232 $numRecords = 0;
233 $strResponse ="";
234
[11783]235 while ($intRecordsLeft > 0)
236 {
237 if($intRecordsLeft > 50)
238 {
[12465]239
[11783]240 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+49)."\n";
[12465]241 $numRecords = 50;
[11783]242 $strShow = "show $intStartNumber+50";
243 $intStartNumber = $intStartNumber + 50;
244 $intRecordsLeft = $intRecordsLeft - 50;
[12465]245
[11783]246 }
247 else
248 {
[12465]249 $numRecords = $intRecordsLeft;
[11783]250 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+$intRecordsLeft-1)."\n";
251 $strShow = "show $intStartNumber+$intRecordsLeft";
252 $intRecordsLeft = 0;
[12465]253
254 }
[11783]255
[12465]256 $strResponse .= $self->get($strShow,$numRecords);
257
258 if ($strResponse eq ""){
259 print STDERR "<<ERROR: failed to get $numRecords records>>\n";
[11783]260 }
[12465]261 else{
262 print STDERR "<<Done:$numRecords>>\n";
263 }
[11783]264 }
[12465]265
266 return "$strResponse\n";
267
[11783]268}
269
270sub saveRecords
271{
272 my ($self,$strRecords,$strOutputDir,$intMaxRecords) = @_;
273
274 # setup directories
275 # Currently only gather the MARC format
[12465]276 my $strFileName = $self->generateFileName($intMaxRecords);
277
[12587]278 $strOutputDir =~ s/"//g; #"
[12465]279
[12902]280 my $strOutputFile = &util::filename_cat($strOutputDir,$self->{'host'},"$strFileName.marc");
281 # prepare subdirectory for record (if needed)
282 my ($strSubDirPath,$unused) = $self->dirFileSplit($strOutputFile);
[12465]283
[11783]284 &util::mk_all_dir($strSubDirPath);
[12902]285
[11783]286 print STDERR "Saving records to \"$strOutputFile\"\n";
287
288 # save record
289 open (ZOUT,">$strOutputFile")
[12465]290 || die "Unable to save Z3950 record: $!\n";
[11783]291 print ZOUT $strRecords;
292 close(ZOUT);
293}
294
[12465]295
[11783]296sub run_command_with_output
297{
[12465]298 my ($self,$strCMD,$strStopRE) =@_;
299
300 $self->run_command($strCMD);
301
302 return $self->get_output($strStopRE);
303
[11783]304}
305
[12465]306sub get{
307 my ($self,$strShow,$numRecord) = @_;
308
309 $self->run_command($strShow);
310
311 my $strFullOutput="";
312 my $count=0;
313 my $readRecord = 0;
314
[17229]315 my $output = $self->{'YAZOUT'};
316 while (my $strLine = <$output>)
[12465]317 {
318
319 if ($strLine =~ m/Records: ([\d]*)/i ){
320 $readRecord = 1;
321 next;
322 }
323
324 return $strFullOutput if ($strLine =~ m/nextResultSetPosition|Not connected/i);
325
326 next if(!$readRecord);
327
328 $strFullOutput .= $strLine;
329 }
330
331}
332
[11783]333sub run_command_without_output
334{
[12465]335 my ($self) = shift (@_);
[11783]336 my ($strCMD) = @_;
337
[12465]338 $self->run_command($strCMD);
[11783]339}
340
341sub run_command
342{
[12465]343 my ($self,$strCMD) = @_;
344
345 my $input = $self->{'YAZIN'};
346
347 print $input "$strCMD\n";
348}
349
350sub get_output{
351 my ($self,$strStopRE) = @_;
352
[11783]353 if (!defined $strStopRE){return "";}
354 else
355 {
356 my $strFullOutput;
[12465]357 my $output = $self->{'YAZOUT'};
358 while (my $strLine = <$output>)
[11783]359 {
[12465]360 $strFullOutput .= $strLine;
361 if($strLine =~ m/^$strStopRE|Not connected/i){return $strFullOutput;}
[11783]362 }
363 }
364}
365
366sub generateFileName
367{
368 my ($self,$intMaxRecords) = @_;
369 my $strFileName = ($self->{'database'})."_".($self->{'find'})."_".($intMaxRecords);
[12465]370
[11783]371}
372
373sub dirFileSplit
374{
[12465]375 my ($self,$strFile) = @_;
[11783]376
[13010]377 my @aryDirs = split(/[\/\\]/,$strFile);
[12465]378
[11783]379 my $strLocalFile = pop(@aryDirs);
380 my $strSubDirs = join("/",@aryDirs);
381
382 return ($strSubDirs,$strLocalFile);
383}
384
[12465]385sub url_information
386{
387 my ($self,$url) = @_;
388
389 $url = $self->{'url'} unless defined $url;
390
[17229]391 my $strOpen = $self->start_yaz();
[12465]392
393 $strOpen = $self->run_command_with_output("open $url","^Options");
394
395 $strOpen =~ s/Z> //g;
396 $strOpen =~ s/Elapsed:.*//g;
397
398 print STDERR $strOpen;
399
[17229]400 $self->quit_yaz();
[12465]401
402 return 0;
403
404}
405
[11783]406sub error
407{
[12465]408 my ($self,$strFunctionName,$strError) = @_;
[11783]409 {
410 print STDERR "Error occoured in Z3950Download.pm\n".
411 "In Function:".$strFunctionName."\n".
412 "Error Message:".$strError."\n";
413 exit(-1);
414 }
415}
416
4171;
Note: See TracBrowser for help on using the repository browser.