root/main/trunk/greenstone2/perllib/downloaders/Z3950Download.pm @ 24379

Revision 24379, 11.1 KB (checked in by ak19, 9 years ago)

Fixed several bugs in Z3950 download. The main bug had to do with how the Library of Congress is returning records in XML format, instead of whichever format other sources have been returning and for which the code so far was set up to work with. Now it works with the XML returned, however, the MARC record returned (which is in XML) does not explode properly. Other bugs have to do with the Server Information button in GLI failing for Z3950 and an ugly 2-level nested folder structure being created to store the files containing records returned: we don't want a folder called http.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# WebDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package Z3950Download;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use BaseDownload;
37use IPC::Open2;
38use POSIX ":sys_wait_h"; # for waitpid, http://perldoc.perl.org/functions/waitpid.html
39
40sub BEGIN {
41    @Z3950Download::ISA = ('BaseDownload');
42}
43
44my $arguments =
45    [  { 'name' => "host",
46    'disp' => "{Z3950Download.host_disp}",
47    'desc' => "{Z3950Download.host}",
48    'type' => "string",
49    'reqd' => "yes"},
50      { 'name' => "port",
51    'disp' => "{Z3950Download.port_disp}",
52    'desc' => "{Z3950Download.port}",
53    'type' => "string",
54    'reqd' => "yes"},
55      { 'name' => "database",
56    'disp' => "{Z3950Download.database_disp}",
57    'desc' => "{Z3950Download.database}",
58        'type' => "string",
59    'reqd' => "yes"},
60      { 'name' => "find",
61    'disp' => "{Z3950Download.find_disp}",
62    'desc' => "{Z3950Download.find}",
63    'type' => "string",
64    'deft' => "",
65    'reqd' => "yes"},
66      { 'name' => "max_records",
67    'disp' => "{Z3950Download.max_records_disp}",
68    'desc' => "{Z3950Download.max_records}",
69    'type' => "int",
70    'deft' => "500",
71    'reqd' => "no"}];
72
73my $options = { 'name'     => "Z3950Download",
74        'desc'     => "{Z3950Download.desc}",
75        'abstract' => "no",
76        'inherits' => "yes",
77        'args'     => $arguments };
78
79
80sub new
81{
82    my ($class) = shift (@_);
83    my ($getlist,$inputargs,$hashArgOptLists) = @_;
84    push(@$getlist, $class);
85
86    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
87    push(@{$hashArgOptLists->{"OptList"}},$options);
88
89    my $self = new BaseDownload($getlist,$inputargs,$hashArgOptLists);
90
91    if ($self->{'info_only'}) {
92    # don't worry about any options etc
93    return bless $self, $class;
94    }
95
96    # Must set $self->{'url'}, since GLI use $self->{'url'} to calculate the log file name!
97    $self->{'url'} = $self->{'host'}.":".$self->{'port'};
98
99    $self->{'yaz'} = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "yaz-client");
100   
101    return bless $self, $class;
102
103}
104
105sub download
106{
107    my ($self) = shift (@_);
108    my ($hashGeneralOptions) = @_;
109    my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords);
110
111    my $url = $self->{'url'};
112 
113    print STDERR "<<Defined Maximum>>\n";
114 
115    $strOpen = $self->start_yaz($url);
116
117    print STDERR "Access database: \"$self->{'database'}\"\n";
118    $self->run_command_without_output("base $self->{'database'}");
119    print STDERR "Searching for keyword: \"$self->{'find'}\"\n";
120    $intAmount = $self->findAmount($self->{'find'});
121
122    if($intAmount <= 0)
123    {
124    ($intAmount == -1)?
125        print STDERR "Something wrong with the arguments,downloading can not be performed\n":
126        print STDERR "No Record is found\n";
127    print STDERR "<<Finished>>\n";
128    return 0;
129    }
130    $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'};
131    print STDERR "<<Total number of record(s):$intMaxRecords>>\n";
132    $strRecords = "Records: $intMaxRecords\n".$self->getRecords($intMaxRecords);
133   
134    $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords);
135    print STDERR "Closing connection...\n";
136
137    $self->quit_yaz();
138    return 1;
139}
140
141
142sub start_yaz
143{
144    my ($self, $url) = @_;   
145
146    print STDERR "Opening connection to $url\n";
147     
148    my $yaz = $self->{'yaz'};
149   
150    my $childpid = open2(*YAZOUT, *YAZIN, $yaz)
151    or (print STDERR "<<Finished>>\n" and die "can't open pipe to yaz-client: $!");
152    $self->{'pid'} = $childpid;
153    $self->{'YAZOUT'} = *YAZOUT;
154    $self->{'YAZIN'} = *YAZIN;
155
156    my $strOpen = $self->open_connection("open $url"); 
157
158    if (!$strOpen) {
159        print STDERR "Cannot connect to $url\n";
160        print STDERR "<<Finished>>\n"; 
161    return 0;
162    }
163    return $strOpen;
164}
165
166sub quit_yaz
167{
168    my ($self) = shift (@_);
169
170    # can't send a "close" cmd to close the database here, since the close command is only
171    # recognised by Z3950, not by SRW. (This method is also used by the subclass for SRW.)
172
173    print STDERR "<<Finished>>\n";
174
175    # need to send the quit command, else yaz-client is still running in the background
176    $self->run_command_without_output("quit");
177    close($self->{'YAZIN'}); # close the input to yaz. It also flushes quit command to yaz.
178
179    # make sure nothing is being output by yaz
180    # flush the yaz-client process' outputstream, else we'll be stuck in an infinite
181    # loop waiting for the process to quit.
182    my $output = $self->{'YAZOUT'};
183    my $line;
184    while (defined ($line = <$output>)) {
185    if($line !~ m/\w/s) { # print anything other than plain whitespace in case it is important
186        print STDERR "***### $line";
187    }
188    }
189
190    close($self->{'YAZOUT'});
191
192    # Is the following necessary? The PerlDoc on open2 (http://perldoc.perl.org/IPC/Open2.html)
193    # says that waitpid must be called to "reap the child process", or otherwise it will hang
194    # around like a zombie process in the background. Adding it here makes the code work as
195    # before, but it is certainly necessary to call waitpid on wget (see WgetDownload.pm).
196    # http://perldoc.perl.org/functions/waitpid.html
197    my $kidpid;
198    do {
199    $kidpid = waitpid($self->{'pid'}, WNOHANG);
200    } while $kidpid > 0; # waiting for pid to become -1
201}
202
203sub open_connection{
204  my ($self,$strCommand) =  (@_);
205 
206  $self->run_command($strCommand); 
207
208  my $out = $self->{'YAZOUT'};
209
210  my $opening_line = <$out>;
211 
212  return ($opening_line =~ m/Connecting...OK/i)? 1: 0;
213 
214}
215
216sub findAmount
217{
218    my ($self) = shift (@_);
219    my($strFindTarget) = @_;
220    my $strResponse = $self->run_command_with_output("find $strFindTarget","^Number of hits:");
221   return ($strResponse =~ m/^Number of hits: (\d+)/m)? $1:-1;   
222}
223
224sub getRecords
225{
226    my ($self) = shift (@_);
227    my ($intMaxRecords) = @_;
228    my ($strShow,$intStartNumber,$numRecords,$strResponse,$strRecords,$intRecordsLeft);
229
230    $intStartNumber = 1;
231    $intRecordsLeft = $intMaxRecords;
232    $numRecords = 0;
233    $strResponse ="";
234
235    while ($intRecordsLeft > 0)
236    {
237    if($intRecordsLeft > 50)
238    {
239       
240        print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+49)."\n";
241        $numRecords = 50;
242        $strShow = "show $intStartNumber+50";
243        $intStartNumber = $intStartNumber + 50;
244        $intRecordsLeft = $intRecordsLeft - 50;
245             
246    }
247    else
248    {
249        $numRecords = $intRecordsLeft;
250        print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+$intRecordsLeft-1)."\n";
251        $strShow = "show $intStartNumber+$intRecordsLeft";
252        $intRecordsLeft = 0;
253       
254    }
255   
256    $strResponse .= $self->get($strShow,$numRecords);
257
258    if ($strResponse eq ""){
259        print STDERR "<<ERROR: failed to get $numRecords records>>\n";
260    }
261    else{
262        print STDERR "<<Done:$numRecords>>\n";
263    }
264    }
265
266    return  "$strResponse\n";
267   
268}
269
270sub saveRecords
271{
272    my ($self,$strRecords,$strOutputDir,$intMaxRecords) = @_;
273
274    # setup directories
275    # Currently only gather the MARC format
276    my $strFileName = $self->generateFileName($intMaxRecords);
277
278    $strOutputDir =~ s/"//g; #"
279
280    # remove any http:// prefix from the hostname to generate folder-name to store records in
281    my $foldername = $self->{'host'};
282    $foldername =~ s@^http:\/\/(.*)@$1@;
283
284    my $strOutputFile = &util::filename_cat($strOutputDir,$foldername,"$strFileName.marc");
285     # prepare subdirectory for record (if needed)
286    my ($strSubDirPath,$unused) = $self->dirFileSplit($strOutputFile);
287 
288    &util::mk_all_dir($strSubDirPath);
289 
290    print STDERR "Saving records to \"$strOutputFile\"\n";
291
292    # save record
293    open (ZOUT,">$strOutputFile")
294    || die "Unable to save Z3950 record: $!\n";
295    print ZOUT $strRecords;
296    close(ZOUT);
297}
298
299
300sub run_command_with_output
301{
302    my ($self,$strCMD,$strStopRE) =@_;
303   
304    $self->run_command($strCMD);
305   
306    return $self->get_output($strStopRE);
307 
308}
309
310sub get{
311   my ($self,$strShow,$numRecord) = @_; 
312
313   $self->run_command($strShow); 
314   
315   my $strFullOutput="";
316   my $count=0;
317   my $readRecord = 0;
318   
319   my $output = $self->{'YAZOUT'};
320   while (my $strLine = <$output>)
321   {
322       if ($strLine =~ m/Records: ([\d]*)/i ){
323       $readRecord = 1;
324       next; 
325       } elsif ($strLine =~ m/<record/i){ # XML output such as from Library of Congress
326       $strFullOutput .= $strLine;
327       $readRecord = 1;
328       next;
329       }
330     
331       if ($strLine =~ m/\<\/record\>/i) { # end of XML, include the closing tag and then terminate below
332       $strFullOutput .= $strLine;
333       }
334
335      return $strFullOutput if ($strLine =~ m/nextResultSetPosition|Not connected|\<\/record\>/i);
336       
337      next if(!$readRecord);
338     
339      $strFullOutput .= $strLine;     
340  }
341   
342}
343
344sub run_command_without_output
345{
346     my ($self) = shift (@_);
347    my ($strCMD) = @_;
348
349    $self->run_command($strCMD);
350}
351
352sub run_command
353{
354    my ($self,$strCMD) = @_;
355 
356    my $input = $self->{'YAZIN'};
357
358    print $input "$strCMD\n"; 
359}
360
361sub get_output{
362    my ($self,$strStopRE) = @_; 
363
364    if (!defined $strStopRE){return "";}
365    else
366    {
367    my $strFullOutput;
368        my $output = $self->{'YAZOUT'};
369    while (my $strLine = <$output>)
370    {
371           $strFullOutput .= $strLine;   
372       if($strLine =~ m/^$strStopRE|Not connected/i){return $strFullOutput;}
373    }
374    }
375}
376
377sub generateFileName
378{
379    my ($self,$intMaxRecords) = @_;
380    my $strFileName = ($self->{'database'})."_".($self->{'find'})."_".($intMaxRecords);
381 
382}
383
384sub dirFileSplit
385{
386    my ($self,$strFile) = @_;
387
388    my @aryDirs = split(/[\/\\]/,$strFile);
389   
390    my $strLocalFile = pop(@aryDirs);
391    my $strSubDirs = join("/",@aryDirs);
392
393    return ($strSubDirs,$strLocalFile);
394}
395
396sub url_information
397{
398   my ($self,$url) = @_;
399
400   $url = $self->{'url'} unless defined $url;
401
402   my $strOpen = $self->start_yaz($url);
403
404   $strOpen = $self->run_command_with_output("open $url","^Options|Connecting...OK."); 
405
406   $strOpen =~ s/Z> //g;
407   $strOpen =~ s/Elapsed:.*//g;
408
409   print STDERR $strOpen;
410
411   $self->quit_yaz();
412
413   return 0;
414
415}
416
417sub error
418{
419    my ($self,$strFunctionName,$strError) = @_;
420    {
421    print STDERR "Error occoured in Z3950Download.pm\n".
422        "In Function:".$strFunctionName."\n".
423        "Error Message:".$strError."\n";
424    exit(-1);
425    }
426}
427
4281;
Note: See TracBrowser for help on using the browser.