source: main/trunk/greenstone2/perllib/downloaders/Z3950Download.pm@ 24379

Last change on this file since 24379 was 24379, checked in by ak19, 13 years ago

Fixed several bugs in Z3950 download. The main bug had to do with how the Library of Congress is returning records in XML format, instead of whichever format other sources have been returning and for which the code so far was set up to work with. Now it works with the XML returned, however, the MARC record returned (which is in XML) does not explode properly. Other bugs have to do with the Server Information button in GLI failing for Z3950 and an ugly 2-level nested folder structure being created to store the files containing records returned: we don't want a folder called http.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
RevLine 
[11783]1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package Z3950Download;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
[17207]36use BaseDownload;
[11783]37use IPC::Open2;
[17284]38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
[11783]39
40sub BEGIN {
[17207]41 @Z3950Download::ISA = ('BaseDownload');
[11783]42}
43
44my $arguments =
45 [ { 'name' => "host",
46 'disp' => "{Z3950Download.host_disp}",
47 'desc' => "{Z3950Download.host}",
48 'type' => "string",
49 'reqd' => "yes"},
50 { 'name' => "port",
51 'disp' => "{Z3950Download.port_disp}",
52 'desc' => "{Z3950Download.port}",
53 'type' => "string",
54 'reqd' => "yes"},
55 { 'name' => "database",
56 'disp' => "{Z3950Download.database_disp}",
57 'desc' => "{Z3950Download.database}",
58 'type' => "string",
59 'reqd' => "yes"},
60 { 'name' => "find",
61 'disp' => "{Z3950Download.find_disp}",
62 'desc' => "{Z3950Download.find}",
63 'type' => "string",
64 'deft' => "",
65 'reqd' => "yes"},
66 { 'name' => "max_records",
67 'disp' => "{Z3950Download.max_records_disp}",
68 'desc' => "{Z3950Download.max_records}",
69 'type' => "int",
70 'deft' => "500",
71 'reqd' => "no"}];
72
73my $options = { 'name' => "Z3950Download",
74 'desc' => "{Z3950Download.desc}",
75 'abstract' => "no",
76 'inherits' => "yes",
77 'args' => $arguments };
78
79
80sub new
81{
82 my ($class) = shift (@_);
83 my ($getlist,$inputargs,$hashArgOptLists) = @_;
84 push(@$getlist, $class);
85
[17207]86 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
87 push(@{$hashArgOptLists->{"OptList"}},$options);
[11783]88
[17207]89 my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
[11783]90
91 if ($self->{'info_only'}) {
92 # don't worry about any options etc
93 return bless $self, $class;
94 }
95
96 # Must set $self->{'url'}, since GLI use $self->{'url'} to calculate the log file name!
97 $self->{'url'} = $self->{'host'}.":".$self->{'port'};
[12465]98
[13019]99 $self->{'yaz'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "yaz-client");
[12465]100
[11783]101 return bless $self, $class;
102
103}
104
105sub download
106{
107 my ($self) = shift (@_);
108 my ($hashGeneralOptions) = @_;
109 my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords);
[12465]110
111 my $url = $self->{'url'};
112
[11783]113 print STDERR "<<Defined Maximum>>\n";
114
[17229]115 $strOpen = $self->start_yaz($url);
[12465]116
[11783]117 print STDERR "Access database: \"$self->{'database'}\"\n";
[12465]118 $self->run_command_without_output("base $self->{'database'}");
[11783]119 print STDERR "Searching for keyword: \"$self->{'find'}\"\n";
[12465]120 $intAmount = $self->findAmount($self->{'find'});
[11783]121
122 if($intAmount <= 0)
123 {
124 ($intAmount == -1)?
[12465]125 print STDERR "Something wrong with the arguments,downloading can not be performed\n":
126 print STDERR "No Record is found\n";
127 print STDERR "<<Finished>>\n";
[11783]128 return 0;
129 }
130 $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'};
131 print STDERR "<<Total number of record(s):$intMaxRecords>>\n";
[12465]132 $strRecords = "Records: $intMaxRecords\n".$self->getRecords($intMaxRecords);
133
134 $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords);
[11783]135 print STDERR "Closing connection...\n";
[17229]136
137 $self->quit_yaz();
138 return 1;
139}
140
141
142sub start_yaz
143{
144 my ($self, $url) = @_;
145
146 print STDERR "Opening connection to $url\n";
147
148 my $yaz = $self->{'yaz'};
149
150 my $childpid = open2(*YAZOUT, *YAZIN, $yaz)
151 or (print STDERR "<<Finished>>\n" and die "can't open pipe to yaz-client: $!");
[17284]152 $self->{'pid'} = $childpid;
[17229]153 $self->{'YAZOUT'} = *YAZOUT;
154 $self->{'YAZIN'} = *YAZIN;
155
156 my $strOpen = $self->open_connection("open $url");
157
158 if (!$strOpen) {
159 print STDERR "Cannot connect to $url\n";
160 print STDERR "<<Finished>>\n";
161 return 0;
162 }
163 return $strOpen;
164}
165
166sub quit_yaz
167{
168 my ($self) = shift (@_);
169
[17232]170 # can't send a "close" cmd to close the database here, since the close command is only
171 # recognised by Z3950, not by SRW. (This method is also used by the subclass for SRW.)
172
[12465]173 print STDERR "<<Finished>>\n";
174
[17220]175 # need to send the quit command, else yaz-client is still running in the background
[17218]176 $self->run_command_without_output("quit");
[17229]177 close($self->{'YAZIN'}); # close the input to yaz. It also flushes quit command to yaz.
178
179 # make sure nothing is being output by yaz
[17231]180 # flush the yaz-client process' outputstream, else we'll be stuck in an infinite
181 # loop waiting for the process to quit.
[17229]182 my $output = $self->{'YAZOUT'};
183 my $line;
[17231]184 while (defined ($line = <$output>)) {
185 if($line !~ m/\w/s) { # print anything other than plain whitespace in case it is important
186 print STDERR "***### $line";
187 }
[17229]188 }
189
190 close($self->{'YAZOUT'});
[17284]191
192 # Is the following necessary? The PerlDoc on open2 (http://perldoc.perl.org/IPC/Open2.html)
193 # says that waitpid must be called to "reap the child process", or otherwise it will hang
194 # around like a zombie process in the background. Adding it here makes the code work as
195 # before, but it is certainly necessary to call waitpid on wget (see WgetDownload.pm).
196 # http://perldoc.perl.org/functions/waitpid.html
197 my $kidpid;
198 do {
199 $kidpid = waitpid($self->{'pid'}, WNOHANG);
200 } while $kidpid > 0; # waiting for pid to become -1
[11783]201}
202
[12465]203sub open_connection{
204 my ($self,$strCommand) = (@_);
205
206 $self->run_command($strCommand);
207
208 my $out = $self->{'YAZOUT'};
209
[17229]210 my $opening_line = <$out>;
[12465]211
[17229]212 return ($opening_line =~ m/Connecting...OK/i)? 1: 0;
[12465]213
214}
215
[11783]216sub findAmount
217{
[12465]218 my ($self) = shift (@_);
[11783]219 my($strFindTarget) = @_;
[12465]220 my $strResponse = $self->run_command_with_output("find $strFindTarget","^Number of hits:");
221 return ($strResponse =~ m/^Number of hits: (\d+)/m)? $1:-1;
[11783]222}
223
224sub getRecords
225{
[12465]226 my ($self) = shift (@_);
[11783]227 my ($intMaxRecords) = @_;
[12465]228 my ($strShow,$intStartNumber,$numRecords,$strResponse,$strRecords,$intRecordsLeft);
[11783]229
230 $intStartNumber = 1;
231 $intRecordsLeft = $intMaxRecords;
[12465]232 $numRecords = 0;
233 $strResponse ="";
234
[11783]235 while ($intRecordsLeft > 0)
236 {
237 if($intRecordsLeft > 50)
238 {
[12465]239
[11783]240 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+49)."\n";
[12465]241 $numRecords = 50;
[11783]242 $strShow = "show $intStartNumber+50";
243 $intStartNumber = $intStartNumber + 50;
244 $intRecordsLeft = $intRecordsLeft - 50;
[12465]245
[11783]246 }
247 else
248 {
[12465]249 $numRecords = $intRecordsLeft;
[11783]250 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+$intRecordsLeft-1)."\n";
251 $strShow = "show $intStartNumber+$intRecordsLeft";
252 $intRecordsLeft = 0;
[24379]253
254 }
[11783]255
[12465]256 $strResponse .= $self->get($strShow,$numRecords);
[24379]257
[12465]258 if ($strResponse eq ""){
259 print STDERR "<<ERROR: failed to get $numRecords records>>\n";
[11783]260 }
[12465]261 else{
262 print STDERR "<<Done:$numRecords>>\n";
263 }
[11783]264 }
[12465]265
266 return "$strResponse\n";
267
[11783]268}
269
270sub saveRecords
271{
272 my ($self,$strRecords,$strOutputDir,$intMaxRecords) = @_;
273
274 # setup directories
275 # Currently only gather the MARC format
[12465]276 my $strFileName = $self->generateFileName($intMaxRecords);
277
[24379]278 $strOutputDir =~ s/"//g; #"
[12465]279
[24379]280 # remove any http:// prefix from the hostname to generate folder-name to store records in
281 my $foldername = $self->{'host'};
282 $foldername =~ s@^http:\/\/(.*)@$1@;
283
284 my $strOutputFile = &util::filename_cat($strOutputDir,$foldername,"$strFileName.marc");
[12902]285 # prepare subdirectory for record (if needed)
286 my ($strSubDirPath,$unused) = $self->dirFileSplit($strOutputFile);
[12465]287
[11783]288 &util::mk_all_dir($strSubDirPath);
[12902]289
[11783]290 print STDERR "Saving records to \"$strOutputFile\"\n";
291
292 # save record
293 open (ZOUT,">$strOutputFile")
[12465]294 || die "Unable to save Z3950 record: $!\n";
[11783]295 print ZOUT $strRecords;
296 close(ZOUT);
297}
298
[12465]299
[11783]300sub run_command_with_output
301{
[12465]302 my ($self,$strCMD,$strStopRE) =@_;
303
304 $self->run_command($strCMD);
305
306 return $self->get_output($strStopRE);
307
[11783]308}
309
[12465]310sub get{
311 my ($self,$strShow,$numRecord) = @_;
312
313 $self->run_command($strShow);
314
315 my $strFullOutput="";
316 my $count=0;
317 my $readRecord = 0;
318
[17229]319 my $output = $self->{'YAZOUT'};
320 while (my $strLine = <$output>)
[12465]321 {
322 if ($strLine =~ m/Records: ([\d]*)/i ){
323 $readRecord = 1;
324 next;
[24379]325 } elsif ($strLine =~ m/<record/i){ # XML output such as from Library of Congress
326 $strFullOutput .= $strLine;
327 $readRecord = 1;
328 next;
[12465]329 }
330
[24379]331 if ($strLine =~ m/\<\/record\>/i) { # end of XML, include the closing tag and then terminate below
332 $strFullOutput .= $strLine;
333 }
334
335 return $strFullOutput if ($strLine =~ m/nextResultSetPosition|Not connected|\<\/record\>/i);
[12465]336
337 next if(!$readRecord);
338
339 $strFullOutput .= $strLine;
340 }
341
342}
343
[11783]344sub run_command_without_output
345{
[12465]346 my ($self) = shift (@_);
[11783]347 my ($strCMD) = @_;
348
[12465]349 $self->run_command($strCMD);
[11783]350}
351
352sub run_command
353{
[12465]354 my ($self,$strCMD) = @_;
355
356 my $input = $self->{'YAZIN'};
357
358 print $input "$strCMD\n";
359}
360
361sub get_output{
362 my ($self,$strStopRE) = @_;
363
[11783]364 if (!defined $strStopRE){return "";}
365 else
366 {
367 my $strFullOutput;
[12465]368 my $output = $self->{'YAZOUT'};
369 while (my $strLine = <$output>)
[11783]370 {
[12465]371 $strFullOutput .= $strLine;
372 if($strLine =~ m/^$strStopRE|Not connected/i){return $strFullOutput;}
[11783]373 }
374 }
375}
376
377sub generateFileName
378{
379 my ($self,$intMaxRecords) = @_;
380 my $strFileName = ($self->{'database'})."_".($self->{'find'})."_".($intMaxRecords);
[12465]381
[11783]382}
383
384sub dirFileSplit
385{
[12465]386 my ($self,$strFile) = @_;
[11783]387
[13010]388 my @aryDirs = split(/[\/\\]/,$strFile);
[12465]389
[11783]390 my $strLocalFile = pop(@aryDirs);
391 my $strSubDirs = join("/",@aryDirs);
392
393 return ($strSubDirs,$strLocalFile);
394}
395
[12465]396sub url_information
397{
398 my ($self,$url) = @_;
399
400 $url = $self->{'url'} unless defined $url;
401
[24379]402 my $strOpen = $self->start_yaz($url);
[12465]403
[24379]404 $strOpen = $self->run_command_with_output("open $url","^Options|Connecting...OK.");
[12465]405
406 $strOpen =~ s/Z> //g;
407 $strOpen =~ s/Elapsed:.*//g;
408
409 print STDERR $strOpen;
410
[17229]411 $self->quit_yaz();
[12465]412
413 return 0;
414
415}
416
[11783]417sub error
418{
[12465]419 my ($self,$strFunctionName,$strError) = @_;
[11783]420 {
421 print STDERR "Error occoured in Z3950Download.pm\n".
422 "In Function:".$strFunctionName."\n".
423 "Error Message:".$strError."\n";
424 exit(-1);
425 }
426}
427
4281;
Note: See TracBrowser for help on using the repository browser.