source: main/trunk/greenstone2/perllib/downloaders/SRWDownload.pm@ 28250

Last change on this file since 28250 was 28250, checked in by ak19, 11 years ago
  1. Jenny and I fixed an oversight in OAIDownload, thanks to Kathy's suggestion, where an html page's contents used to be inspected for file types specified to be downloaded with GetDocument ONLY if the file extension was htm(l). Now the test is whether the header specifies Content-Type text/html. 2. Deprecated utils functions replaced with their FileUtils equivalents.
  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1###########################################################################
2#
3# SRWDownload.pm -- base class for all the import plugins
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package SRWDownload;
27
28eval {require bytes};
29
30# suppress the annoying "subroutine redefined" warning that various
31# plugins cause under perl 5.6
32$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};
33
34use strict;
35
36use Z3950Download;
37use IPC::Open2;
38
39sub BEGIN {
40 @SRWDownload::ISA = ('Z3950Download');
41}
42
43my $arguments = [
44 ];
45
46my $options = { 'name' => "SRWDownload",
47 'desc' => "{SRWDownload.desc}",
48 'abstract' => "no",
49 'inherits' => "yes"
50 };
51
52
53sub new
54{
55 my ($class) = shift (@_);
56 my ($getlist,$inputargs,$hashArgOptLists) = @_;
57 push(@$getlist, $class);
58
59 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
60 push(@{$hashArgOptLists->{"OptList"}},$options);
61
62 my $self = new Z3950Download($getlist,$inputargs,$hashArgOptLists);
63
64 if ($self->{'info_only'}) {
65 # don't worry about any options etc
66 return bless $self, $class;
67 }
68
69 # Must set $self->{'url'}, since GLI use $self->{'url'} to calculate the log file name!
70 $self->{'url'} = $self->{'host'}.":".$self->{'port'};
71 return bless $self, $class;
72}
73
74sub download
75{
76 my ($self) = shift (@_);
77 my ($hashGeneralOptions) = @_;
78 my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords);
79
80 # If the url contains just the host and port (as it would for Z39.50), then prepend
81 # the http protocol. Otherwise the download is stuck in an infinite loop for SRW/SRU
82 $self->{'url'} = "http://$self->{'url'}" if $self->{'url'} !~ m/^http/;
83 my $url = $self->{'url'};
84
85 print STDERR "<<Defined Maximum>>\n";
86
87 $strOpen = $self->start_yaz($url);
88
89 print STDERR "Opening connection to \"$self->{'url'}\"\n";
90 print STDERR "Access database: \"$self->{'database'}\"\n";
91 $self->run_command_without_output("base $self->{'database'}");
92 $self->run_command_without_output("querytype prefix");
93 print STDERR "Searching for keyword: \"$self->{'find'}\"\n";
94
95 $intAmount =$self->findAmount($self->{'find'});
96
97 if($intAmount <= 0)
98 {
99 ($intAmount == -1)?
100 print STDERR "Something wrong with the arguments,downloading can not be performed\n" :
101 print STDERR "No Record is found\n";
102 print STDERR "<<Finished>>\n";
103 return 0;
104 }
105 $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'};
106 print STDERR "<<Total number of record(s):$intMaxRecords>>\n";
107
108 $strRecords = $self->getRecords($intMaxRecords);
109
110 $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords);
111 print STDERR "Closing connection...\n";
112
113 $self->quit_yaz();
114 return 1;
115}
116
117
118sub saveRecords
119{
120 my ($self,$strRecords,$strOutputDir,$intMaxRecords) = @_;
121
122 # setup directories
123 # Currently only gather the MARC format
124 $strRecords ="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<collection>$strRecords</collection>";
125 my $strFileName = $self->generateFileName($intMaxRecords);
126 my $host = $self->{'host'};
127 $host =~ s/http:\/\///;
128 $strOutputDir =~ s/"//g; #"
129 my $strOutputFile = &FileUtils::filenameConcatenate($strOutputDir,$host,"$strFileName.xml");
130
131 # prepare subdirectory for record (if needed)
132
133 my ($strSubDirPath,$unused) = $self->dirFileSplit($strOutputFile);
134 &FileUtils::makeAllDirectories($strSubDirPath);
135
136 print STDERR "Saving records to \"$strOutputFile\"\n";
137
138 # save record
139 open (ZOUT,">$strOutputFile")
140 || die "Unable to save oai metadata record: $!\n";
141 print ZOUT $strRecords;
142 close(ZOUT);
143}
144
145sub get {
146 my ($self,$strShow,$numRecord) = @_;
147
148 $self->run_command_without_output($strShow);
149
150 my $strFullOutput="";
151 my $count=0;
152 my $readRecord = 0;
153 my $endRecord = 0;
154
155 my $output = $self->{'YAZOUT'};
156 my $strLine;
157
158 while ($strLine = <$output>) #while (defined ($strLine = <$output>))
159 {
160 last if ($count >= $numRecord && $endRecord); # done, if we've reached the end of the last record
161
162 last if($strLine =~ m/^HTTP ERROR/i);
163
164 if ($strLine =~ m/pos=[\d]*/i ) {
165 $count++;
166 $readRecord = 1;
167 $endRecord = 0;
168 next;
169 }
170
171 if ($strLine =~ m/<\/record>/i ) { # end tag of record
172 $endRecord = 1;
173 }
174
175 next if(!$readRecord);
176
177 $strFullOutput .= $strLine;
178 }
179
180 return $strFullOutput;
181}
182
183sub url_information{
184 my ($self) = @_;
185
186 my $url = $self->{'url'};
187
188 $url =~ s#http://##;
189
190 return $self->SUPER::url_information($url);
191
192}
193
194sub error
195{
196 my ($self, $strFunctionName,$strError) = @_;
197 {
198 print STDERR "Error occoured in SRWDownload.pm\n".
199 "In Function:".$strFunctionName."\n".
200 "Error Message:".$strError."\n";
201 exit(-1);
202 }
203}
204
2051;
Note: See TracBrowser for help on using the repository browser.