Changeset 12465
- Timestamp:
- 2006-08-18T09:24:29+12:00 (18 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/download.pm
r11784 r12465 36 36 sub load_download { 37 37 my ($download_name,$download_options) = @_; 38 38 39 my ($download_obj); 39 40 40 my $coldownloadname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 41 my $coldownloadname =""; 42 43 if ($ENV{'GSDLCOLLECTDIR'}){ 44 45 $coldownloadname = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, 41 46 "perllib/downloaders", 42 47 "${download_name}.pm"); 48 49 } 50 43 51 my $maindownloadname = &util::filename_cat($ENV{'GSDLHOME'}, 44 52 "perllib/downloaders", 45 53 "${download_name}.pm"); 54 46 55 if (-e $coldownloadname) { require $coldownloadname;} 47 56 elsif (-e $maindownloadname ) { require $maindownloadname; } -
trunk/gsdl/perllib/downloaders/OAIDownload.pm
r11783 r12465 37 37 use XMLParser; 38 38 39 use IO::File;40 39 use POSIX qw(tmpnam); 40 use util; 41 41 42 42 sub BEGIN { … … 56 56 'reqd' => "no"}, 57 57 { 'name' => "get_doc", 58 'disp' => "{OAIDownload. qet_doc_disp}",58 'disp' => "{OAIDownload.get_doc_disp}", 59 59 'desc' => "{OAIDownload.get_doc}", 60 60 'type' => "flag", … … 76 76 my $self; 77 77 78 my $strWgetOptions=""; 79 78 80 sub new 79 81 { … … 107 109 my ($hashGeneralOptions) = @_; 108 110 109 # Checking if the wget has been well setup 110 # &WgetDownload::checkWgetSetup($self,$hashGeneralOptions->{'gli_call'}); 111 112 my $strOutputDir = $hashGeneralOptions->{"cache_dir"}; 111 print STDERR "here2"; 112 113 $strWgetOptions = $self->getWgetOptions(); 114 my $cmdWget = $strWgetOptions; 115 116 my $strOutputDir =""; 117 $strOutputDir = $hashGeneralOptions->{"cache_dir"}; 113 118 my $strBasURL = $self->{'url'}; 114 119 my $intMaxRecords = $self->{'max_records'}; … … 116 121 117 122 print STDERR "<<Defined Maximum>>\n"; 118 my $strIDs = &getOAIIDs($self,$strBasURL); 119 if($strIDs eq "") 123 124 my $strIDs = $self->getOAIIDs($strBasURL); 125 126 if($strIDs eq "") 120 127 { 121 128 print STDERR "Error: No ID being found\n"; 122 129 return 0; 123 130 } 124 my $aryIDs = &parseOAIIDs($strIDs);131 my $aryIDs = $self->parseOAIIDs($strIDs); 125 132 my $intIDs = 0; 126 133 if($self->{'max_records'} < scalar(@$aryIDs)) … … 134 141 print STDERR "<<Total number of record(s):$intIDs>>\n"; 135 142 136 &getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc); 143 $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc); 144 145 my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp"; 146 &util::rm($tmp_file); 137 147 138 148 return 1; … … 143 153 my ($self,$strBasURL) = @_; 144 154 my ($cmdWget); 155 156 my $wgetOptions = $self->getWgetOptions(); 157 158 $cmdWget = $wgetOptions; 159 145 160 print STDERR "Gathering OAI identifiers.....\n"; 161 146 162 if($self->{'set'} ne "") 147 163 { 148 $cmdWget = "-q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc&set=$self->{'set'}\" ";164 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc&set=$self->{'set'}\" "; 149 165 } 150 166 else 151 167 { 152 $cmdWget = "-q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc\" "; 153 } 154 my $strIDs = &WgetDownload::useWget($cmdWget); 168 $cmdWget .= " -q -O - \"$strBasURL?verb=ListIdentifiers&metadataPrefix=oai_dc\" "; 169 } 170 171 172 my $strIDs = $self->useWget($cmdWget); 173 174 if (!defined $strIDs or $strIDs eq "" ){ 175 print STDERR "Server information is unavailable.\n"; 176 print STDERR "<<Finished>>\n"; 177 return; 178 } 179 180 print STDERR "<<Download Information>>\n"; 181 182 $self->parse_xml($strIDs); 183 155 184 return $strIDs; 156 185 } … … 158 187 sub parseOAIIDs 159 188 { 160 my ($s trIDs) = @_;189 my ($self,$strIDs) = @_; 161 190 162 191 print STDERR "Parsing OAI identifiers.....\n"; … … 177 206 sub dirFileSplit 178 207 { 179 my ($strFile) = @_; 180 181 my @aryDirs = split("/",$strFile); 208 my ($self,$strFile) = @_; 209 210 my @aryDirs = split("[/\]",$strFile); 211 182 212 my $strLocalFile = pop(@aryDirs); 183 213 my $strSubDirs = join("/",@aryDirs); … … 188 218 sub getOAIDoc 189 219 { 190 my ($s trRecord, $strSubDirPath) = @_;191 220 my ($self,$strRecord, $strSubDirPath) = @_; 221 192 222 print STDERR "Gathering source documents.....\n"; 193 223 # look out for identifier tag in metadata section 224 194 225 if ($strRecord =~ m/<metadata>(.*)<\/metadata>/s) 195 226 { … … 200 231 my $strDocURL = $2; 201 232 202 my ($unused,$strDocFile) = dirFileSplit($strDocURL); 203 204 my $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs"); 233 my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL); 234 235 my $strSoureDirPath =""; 236 237 $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs"); 238 205 239 &util::mk_dir($strSoureDirPath) if (!-e "$strSoureDirPath"); 206 240 207 241 my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile); 208 242 209 my $wget_cmd = "-q -O $strFullDocFilePath \"$strDocURL\"";210 211 my $strResponse = &WgetDownload::useWget($wget_cmd,1);243 my $wget_cmd = $strWgetOptions." -q -O $strFullDocFilePath \"$strDocURL\""; 244 245 my $strResponse = $self->useWget($wget_cmd,1); 212 246 213 247 if($strResponse ne "") … … 233 267 sub getOAIRecords 234 268 { 235 my ($ aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_;269 my ($self,$aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc) = @_; 236 270 237 271 my $intDocCounter = 0; … … 240 274 { 241 275 print STDERR "Gathering OAI record with ID:$strID.....\n"; 242 # wget it; 243 my $cmdWget= "-q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=oai_dc&identifier=$strID\""; 244 my $strRecord = &WgetDownload::useWget($cmdWget); 276 277 my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=oai_dc&identifier=$strID\""; 278 279 my $strRecord = $self->useWget($cmdWget); 280 281 282 my @fileDirs = split(":",$strID); 245 283 246 284 # setup directories 247 my $strFileURL = "$strOutputDir/$strID.oai"; 248 $strFileURL =~ s/:/\//g; 249 285 286 $strOutputDir =~ s/"//g; 287 288 my $strFileURL = "$strOutputDir/$fileDirs[0]/$fileDirs[1].oai"; 289 250 290 # prepare subdirectory for record (if needed) 251 my ($strSubDirPath,$unused) = dirFileSplit($strFileURL); 291 my ($strSubDirPath,$unused) = ("", ""); 292 293 ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL); 294 252 295 &util::mk_all_dir($strSubDirPath); 253 296 254 297 my $ds = &util::get_dirsep(); 255 my $strOutputFile = &util::filename_cat($strOutputDir,"$strID.oai"); 256 $strOutputFile =~ s/:/$ds/g; 257 298 258 299 if($blnDownloadDoc) 259 300 { 260 &getOAIDoc($strRecord,$strSubDirPath);301 $self->getOAIDoc($strRecord,$strSubDirPath); 261 302 } 262 303 263 304 # save record 264 open (OAIOUT,">$str OutputFile")305 open (OAIOUT,">$strFileURL") 265 306 || die "Unable to save oai metadata record: $!\n"; 266 307 print OAIOUT $strRecord; 267 308 close(OAIOUT); 268 309 269 $intDocCounter ++; 270 print STDERR "<<Done>>\n"; 310 print STDERR "Saving records to $strFileURL\n"; 311 print STDERR "<<Done>>\n"; 312 $intDocCounter ++; 271 313 last if ($intDocCounter >= $intMaxRecords); 272 314 } 315 273 316 ($intDocCounter >= $intMaxRecords) ? 274 317 print STDERR "Reach maximum download records, use -max_records to set the maximum.\n": 275 318 print STDERR "Complete download meta record from $strBasURL\n"; 276 319 320 print STDERR "<<Finished>>\n"; 277 321 } 278 322 … … 282 326 if(!defined $self){ die "System Error: No \$self defined for url_information in OAIDownload\n";} 283 327 284 my $strBaseCMD = "-q -O - \"$self->{'url'}?_OPTS_\""; 328 my $wgetOptions = $self->getWgetOptions(); 329 my $strBaseCMD = $wgetOptions." -q -O - \"$self->{'url'}?_OPTS_\""; 285 330 286 331 my $strIdentify = "verb=Identify"; … … 290 335 $strIdentifyCMD =~ s/_OPTS_/$strIdentify/; 291 336 292 my $strIdentifyText = &WgetDownload::useWget($strIdentifyCMD); 337 my $strIdentifyText = $self->useWget($strIdentifyCMD); 338 339 if (!defined $strIdentifyText or $strIdentifyText eq "" ){ 340 print STDERR "Server information is unavailable.\n"; 341 print STDERR "<<Finished>>\n"; 342 return; 343 } 293 344 294 345 print STDERR "General information:\n"; … … 297 348 my $strListSetCMD = $strBaseCMD; 298 349 $strListSetCMD =~ s/_OPTS_/$strListSets/; 299 my $strListSetsText = &WgetDownload::useWget($strListSetCMD); 350 my $strListSetsText = $self->useWget($strListSetCMD); 351 352 300 353 print STDERR "List Information:\n"; 301 354 $self->parse_xml($strListSetsText); … … 306 359 my ($self) = shift (@_); 307 360 my ($strOutputText) = @_; 308 my ($name,$fh); 309 361 310 362 #Open a temporary file to store OAI information, and store the information to the temp file 311 do {$name = tmpnam()} 312 until $fh = IO::File->new($name, O_RDWR|O_CREAT|O_EXCL); 313 print $fh $strOutputText; 314 close($fh); 363 my $name = "$ENV{GSDLHOME}/tmp/oai.tmp"; 364 365 open(*OAIOUT,"> $name"); 366 367 print OAIOUT $strOutputText; 368 close(OAIOUT); 315 369 316 370 $self->{'temp_file_name'} = $name; … … 340 394 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) { 341 395 $self->{'text'} .= $_[1]; 342 $self->{'text'} =~ s/[\n]| [" "]//g;396 $self->{'text'} =~ s/[\n]|([ ]{2,})//g; 343 397 if($self->{'text'} ne "") 344 398 { … … 352 406 { 353 407 my ($expat, $element, %attr) = @_; 408 354 409 $self->{'subfield'} = $element; 410 355 411 } 356 412 … … 364 420 sub error 365 421 { 366 my ($s trFunctionName,$strError) = @_;422 my ($self,$strFunctionName,$strError) = @_; 367 423 { 368 424 print "Error occoured in OAIDownload.pm\n". -
trunk/gsdl/perllib/downloaders/SRWDownload.pm
r11783 r12465 34 34 use strict; 35 35 36 use BasDownload;36 use Z3950Download; 37 37 use IPC::Open2; 38 38 39 39 sub BEGIN { 40 @SRWDownload::ISA = ('BasDownload'); 41 } 42 43 local (*YAZOUT, *YAZIN); 44 45 # args same as Z3950Download at the moment - should it be based on that?? 46 my $arguments = 47 [ { 'name' => "host", 48 'disp' => "{Z3950Download.host_disp}", 49 'desc' => "{Z3950Download.host}", 50 'type' => "string", 51 'reqd' => "yes"}, 52 { 'name' => "port", 53 'disp' => "{Z3950Download.port_disp}", 54 'desc' => "{Z3950Download.port}", 55 'type' => "string", 56 'reqd' => "yes"}, 57 { 'name' => "database", 58 'disp' => "{Z3950Download.database_disp}", 59 'desc' => "{Z3950Download.database}", 60 'type' => "string", 61 'reqd' => "yes"}, 62 { 'name' => "find", 63 'disp' => "{Z3950Download.find_disp}", 64 'desc' => "{Z3950Download.find}", 65 'type' => "string", 66 'deft' => "", 67 'reqd' => "yes"}, 68 { 'name' => "max_records", 69 'disp' => "{Z3950Download.max_records_disp}", 70 'desc' => "{Z3950Download.max_records}", 71 'type' => "int", 72 'deft' => "500", 73 'reqd' => "no"}]; 40 @SRWDownload::ISA = ('Z3950Download'); 41 } 42 43 my $arguments; 74 44 75 45 my $options = { 'name' => "SRWDownload", 76 46 'desc' => "{SRWDownload.desc}", 77 47 'abstract' => "no", 78 'inherits' => "yes" ,79 'args' => $arguments};48 'inherits' => "yes" 49 }; 80 50 81 51 … … 89 59 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 90 60 91 my $self = (defined $hashArgOptLists)? new BasDownload($getlist,$inputargs,$hashArgOptLists): new BasDownload($getlist,$inputargs);61 my $self = (defined $hashArgOptLists)? new Z3950Download($getlist,$inputargs,$hashArgOptLists): new Z3950Download($getlist,$inputargs); 92 62 93 63 if ($self->{'info_only'}) { … … 106 76 my ($hashGeneralOptions) = @_; 107 77 my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords); 78 79 my $url = $self->{'url'}; 80 108 81 print STDERR "<<Defined Maximum>>\n"; 109 82 110 my $url = $self->{'url'}; 111 112 open2(*YAZOUT, *YAZIN, "yaz-client $url") 113 or die "can't open pipe to yaz-client: $!"; 83 my $yaz = $self->{'yaz'}; 84 85 my $childpid = open2(*YAZOUT, *YAZIN, $yaz) 86 or (print STDERR "<<Finished>>\n" and die "can't open pipe to yaz-client: $!"); 87 88 $self->{'YAZOUT'} = *YAZOUT; 89 $self->{'YAZIN'} = *YAZIN; 90 91 $strOpen = $self->open_connection("open $url"); 92 93 if (!$strOpen) { 94 print STDERR "Cannot connect to $url\n"; 95 print STDERR "<<Finished>>\n"; 96 return 0; 97 } 114 98 115 99 print STDERR "Opening connection to \"$self->{'url'}\"\n"; 116 #$strOpen = &run_command_with_output("open $self->{'url'}");117 100 print STDERR "Access database: \"$self->{'database'}\"\n"; 118 &run_command_without_output("base $self->{'database'}");119 &run_command_without_output("querytype prefix");101 $self->run_command_without_output("base $self->{'database'}"); 102 $self->run_command_without_output("querytype prefix"); 120 103 print STDERR "Searching for keyword: \"$self->{'find'}\"\n"; 121 $intAmount = &findAmount($self->{'find'});104 $intAmount =$self->findAmount($self->{'find'}); 122 105 123 106 if($intAmount <= 0) 124 107 { 125 108 ($intAmount == -1)? 126 print STDERR " Unexpected format, Parsing operationcan not be performed\n" :109 print STDERR "Something wrong with the arguments,downloading can not be performed\n" : 127 110 print STDERR "No Record is found\n"; 111 print STDERR "<<Finished>>\n"; 128 112 return 0; 129 113 } 130 114 $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'}; 131 115 print STDERR "<<Total number of record(s):$intMaxRecords>>\n"; 132 $strRecords = &getRecords($intMaxRecords); 133 print STDERR $strRecords; 134 &saveRecords($self,$strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords); 135 print STDERR "Closing connection\n"; 116 117 $strRecords = $self->getRecords($intMaxRecords); 118 119 $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords); 120 print STDERR "Closing connection...\n"; 121 print STDERR "<<Finished>>\n"; 136 122 close(YAZOUT); 137 123 close(YAZIN); 138 return 1; 139 } 140 141 sub findAmount 142 { 143 my($strFindTarget) = @_; 144 my $strResponse = &run_command_with_output("find $strFindTarget"); 145 146 return ($strResponse =~ m/^Number of hits: (\d+)/m)? $1:-1; 147 } 148 149 sub getRecords 150 { 151 my ($intMaxRecords) = @_; 152 my ($strShow,$intStartNumber,$strResponse,$strRecords,$intRecordsLeft); 153 154 $intStartNumber = 1; 155 $intRecordsLeft = $intMaxRecords; 156 while ($intRecordsLeft > 0) 157 { 158 if($intRecordsLeft > 50) 159 { 160 print STDERR "<<Done:50>>\n"; 161 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+49)."\n"; 162 163 $strShow = "show $intStartNumber+50"; 164 $intStartNumber = $intStartNumber + 50; 165 $intRecordsLeft = $intRecordsLeft - 50; 166 } 167 else 168 { 169 print STDERR "<<Done:".($intRecordsLeft).">>\n"; 170 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+$intRecordsLeft-1)."\n"; 171 $strShow = "show $intStartNumber+$intRecordsLeft"; 172 $intRecordsLeft = 0; 173 } 174 175 $strResponse = &run_command_with_output($strShow); 176 177 ## need to change this 178 179 print STDERR $strResponse; 180 181 if($strResponse =~ m/pos=[\d]*(.*)>\n/s) 182 { 183 $strRecords .= "$1>\n"; 184 185 $strRecords =~ s/pos=[\d]*(.*)?\n//g; 186 } 187 } 188 return $strRecords; 189 } 124 return 1; 125 } 126 190 127 191 128 sub saveRecords … … 195 132 # setup directories 196 133 # Currently only gather the MARC format 197 my $strFileName = &generateFileName($self,$intMaxRecords); 198 my $strFileURL = "$strOutputDir/$self->{'host'}/$strFileName.marc"; 199 $strFileURL =~ s/:/\//g; 200 134 $strRecords ="<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<collection>$strRecords</collection>"; 135 my $strFileName = $self->generateFileName($intMaxRecords); 136 my $host = $self->{'host'}; 137 $host =~ s/http:\/\//srw\//; 138 $strOutputDir =~ s/"//g; 139 my $strFileURL = "$strOutputDir/$host/$strFileName.xml"; 140 201 141 # prepare subdirectory for record (if needed) 202 my ($strSubDirPath,$unused) = dirFileSplit($strFileURL); 142 143 my ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL); 203 144 &util::mk_all_dir($strSubDirPath); 204 145 205 146 my $ds = &util::get_dirsep(); 206 my $strOutputFile = &util::filename_cat($strOutputDir,$self->{'host'},"$strFileName.marc"); 207 $strOutputFile =~ s/:/$ds/g; 208 209 print STDERR "Saving records to \"$strOutputFile\"\n"; 147 148 print STDERR "Saving records to \"$strFileURL\"\n"; 210 149 211 150 # save record 212 open (ZOUT,">$str OutputFile")151 open (ZOUT,">$strFileURL") 213 152 || die "Unable to save oai metadata record: $!\n"; 214 153 print ZOUT $strRecords; … … 216 155 } 217 156 218 sub run_command_with_output 219 { 220 my ($strCMD) = @_; 221 return &run_command($strCMD,"^Elapsed:.*\$"); 222 } 223 224 sub run_command_without_output 225 { 226 my ($strCMD) = @_; 227 &run_command($strCMD); 228 } 229 230 sub run_command 231 { 232 my ($strCMD,$strStopRE) = @_; 157 sub get{ 158 my ($self,$strShow,$numRecord) = @_; 159 160 $self->run_command($strShow); 161 162 my $strFullOutput=""; 163 my $count=0; 164 my $readRecord = 0; 165 166 while (my $strLine = <YAZOUT>) 167 { 233 168 234 print YAZIN "$strCMD\n"; 235 if (!defined $strStopRE){return "";} 236 else 237 { 238 my $strFullOutput; 239 while (my $strLine = <YAZOUT>) 240 { 241 $strFullOutput .= $strLine; 242 if($strLine =~ m/$strStopRE/){return $strFullOutput;} 243 } 244 } 245 } 246 247 sub url_information 248 { 249 my ($self) = shift (@_); 169 return $strFullOutput if ($count >= $numRecord); 170 171 return $strFullOutput if($strLine =~ m/^HTTP ERROR/i); 172 173 if ($strLine =~ m/pos=[\d]*/i ){ 174 $count++; 175 $readRecord = 1; 176 next; 177 } 178 179 next if(!$readRecord); 180 181 $strFullOutput .= $strLine; 182 } 183 184 } 185 186 sub url_information{ 187 my ($self) = @_; 250 188 251 189 my $url = $self->{'url'}; 252 190 253 open2(*YAZOUT, *YAZIN, "yaz-client $url") or die "can't open pipe to yaz-client: $!"; 254 255 my $strFullOutput=""; 256 257 while (my $strLine = <YAZOUT>) 258 { 259 $strFullOutput .= $strLine; 260 } 261 262 return $strFullOutput; 263 } 264 265 sub generateFileName 266 { 267 my ($self,$intMaxRecords) = @_; 268 my $strFileName = ($self->{'database'})."_".($self->{'find'})."_".($intMaxRecords); 269 } 270 271 sub dirFileSplit 272 { 273 my ($strFile) = @_; 274 275 my @aryDirs = split("/",$strFile); 276 my $strLocalFile = pop(@aryDirs); 277 my $strSubDirs = join("/",@aryDirs); 278 279 return ($strSubDirs,$strLocalFile); 191 $url =~ s#http://##; 192 193 return $self->SUPER::url_information($url); 194 280 195 } 281 196 282 197 sub error 283 198 { 284 my ($s trFunctionName,$strError) = @_;199 my ($self, $strFunctionName,$strError) = @_; 285 200 { 286 201 print STDERR "Error occoured in SRWDownload.pm\n". -
trunk/gsdl/perllib/downloaders/WebDownload.pm
r11783 r12465 99 99 my ($hashGeneralOptions) = @_; 100 100 101 # TODO: the checking for Wget is still not complete, we need to 102 # check if the proxy has been set or not, and whether the 103 # connection has been established. 104 # Checking if the wget has been well setup 105 # &WgetDownload::checkWgetSetup($self,$hashGeneralOptions->{'gli_call'}); 106 101 107 102 # Download options 108 my $strOptions = &generateOptionsString($self); 109 my $strWgetOptions = &WgetDownload::getWgetOptions($self); 110 111 103 my $strOptions = $self->generateOptionsString(); 104 my $strWgetOptions = $self->getWgetOptions(); 105 112 106 # Setup the command for using wget 113 107 my $cmdWget = "-N -k -x -t 2 -P ".$hashGeneralOptions->{"cache_dir"}." $strWgetOptions $strOptions ".$self->{'url'}; 114 108 115 print "**************".$cmdWget."\n";116 117 109 # Download the web pages 118 110 # print "Strat download from $self->{'url'}...\n"; 119 120 111 print STDERR "<<Undefined Maximum>>\n"; 121 my $strResponse = &WgetDownload::useWget($cmdWget,1); 122 #if ($strResponse ne ""){print "$strResponse\n";} 123 124 # print "Finish download from $self->{'url'}...\n"; 125 112 113 my $strResponse = $self->useWget($cmdWget,1); 114 115 # if ($strResponse ne ""){print "$strResponse\n";} 116 117 print STDERR "Finish download from $self->{'url'}\n"; 118 119 print STDERR "<<Finished>>\n"; 120 126 121 return 1; 127 122 } … … 146 141 else 147 142 { 148 &error("setupOptions","Incorrect Depth is defined!!\n");143 $self->error("setupOptions","Incorrect Depth is defined!!\n"); 149 144 } 150 145 … … 174 169 { 175 170 my ($self) = shift (@_); 176 if(!defined $self){ die "System Error: No \$self defined for url_information in WebDownload\n";} 177 178 my $strBaseCMD = "-q -O - \"$self->{'url'}\""; 179 180 my $strIdentifyText = &WgetDownload::useWget($strBaseCMD); 181 171 172 my $strOptions = $self->getWgetOptions(); 173 174 my $strBaseCMD = $strOptions." -q -O - \"$self->{'url'}\""; 175 176 177 my $strIdentifyText = $self->useWget($strBaseCMD); 178 179 if (!defined $strIdentifyText or $strIdentifyText eq "" ){ 180 print STDERR "Server information is unavailable.\n"; 181 print STDERR "<<Finished>>\n"; 182 return; 183 } 184 182 185 while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/s) 183 186 { … … 185 188 print STDERR "Page Title: $2\n"; 186 189 } 187 190 188 191 while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/s) 189 192 { 190 193 $strIdentifyText = $1.$3; 191 194 my $strTempString = $2; 192 print STDERR "Meta Information:\n"; 195 print STDERR "\n"; 196 193 197 while($strTempString =~ m/(.*?)=[\"|\'](.*?)[\"|\'](.*?)$/s) 194 198 { … … 202 206 $strMetaName =~ s/^([" "])+//m; 203 207 $strMetaContain =~ s/^([" "])+//m; 204 print STDERR "\t$strMetaName: $strMetaContain\n"; 205 208 209 print STDERR "$strMetaName: $strMetaContain\n\n"; 210 206 211 } 207 print STDERR "\n"; 208 } 212 213 } 214 215 print STDERR "<<Finished>>\n"; 216 209 217 } 210 218 … … 222 230 223 231 1; 232 -
trunk/gsdl/perllib/downloaders/WgetDownload.pm
r11783 r12465 34 34 use BasDownload; 35 35 use strict; 36 use IPC::Open2; 36 37 37 38 sub BEGIN { … … 40 41 41 42 my $arguments = 42 # [ { 'name' => "url",43 # 'desc' => "{WgetDownload.url}",44 # 'type' => "string",45 # 'deft' => "",46 # 'reqd' => "yes"},47 43 [ { 'name' => "proxy_on", 48 44 'desc' => "{WgetDownload.proxy_on}", … … 125 121 # Setup .wgetrc by using $self->{'proxy_host'} and $self->{'proxy_port'} 126 122 # Test if the connection is succeful. If the connection wasn't succeful then ask user to supply username and password. 127 128 123 129 # TODO: How to test run if the proxy setup is working correctly??130 # Use -spider to test whether the connection is working correctly.131 # TODO: Ask user to supply username and password.132 # Try to use the .wgetrc to setup the user name and password133 134 124 } 135 125 136 126 sub useWget 137 127 { 138 my ($ cmdWget,$blnShow) = @_;128 my ($self, $cmdWget,$blnShow) = @_; 139 129 140 my $strReadIn = ""; 141 my $strLine; 130 my ($os,$strReadIn,$strLine,$command); 131 132 $os = $ENV{'GSDLOS'}; 133 134 135 if ($os =~ /windows/i){ 136 $command = "\"$ENV{'GSDLHOME'}\\bin\\windows\\wget\" $cmdWget |"; 137 } 138 else{ 139 $command = "$ENV{'GSDLHOME'}/packages/wget/wget-1.9/src/wget $cmdWget |"; 140 } 142 141 143 open (WIN,"$ENV{'GSDLHOME'}/packages/wget/wget-1.9/src/wget $cmdWget|") || die "wget request failed: $!\n"; 142 143 144 open(*WIN,$command) || die "wget request failed: $!\n"; 145 146 147 144 148 while (defined($strLine=<WIN>)) 145 149 { 150 151 146 152 if($blnShow) 147 153 { 148 print "$strReadIn\n";154 print STDERR "$strReadIn\n"; 149 155 } 156 150 157 $strReadIn .= $strLine; 151 158 } 159 152 160 close(WIN); 153 161 -
trunk/gsdl/perllib/downloaders/Z3950Download.pm
r11783 r12465 40 40 @Z3950Download::ISA = ('BasDownload'); 41 41 } 42 43 local (*YAZOUT, *YAZIN);44 42 45 43 my $arguments = … … 97 95 # Must set $self->{'url'}, since GLI use $self->{'url'} to calculate the log file name! 98 96 $self->{'url'} = $self->{'host'}.":".$self->{'port'}; 97 98 my $os = $ENV{'GSDLOS'}; 99 100 if ($os !~ /windows/) { 101 $self->{'yaz'} = "$ENV{'GSDLHOME'}/packages/yaz/yaz-2.1.4/client/yaz-client"; 102 } 103 else{ 104 $self->{'yaz'} = "$ENV{'GSDLHOME'}/bin/windows/yaz-client"; 105 } 106 99 107 return bless $self, $class; 100 108 … … 106 114 my ($hashGeneralOptions) = @_; 107 115 my ($strOpen,$strBase,$strFind,$strResponse,$intAmount,$intMaxRecords,$strRecords); 116 117 my $url = $self->{'url'}; 118 108 119 print STDERR "<<Defined Maximum>>\n"; 109 120 110 my $url = $self->{'url'};111 121 print STDERR "Opening connection to $url\n"; 112 122 113 my $childpid = open2(*YAZOUT, *YAZIN, "yaz-client") 114 or die "can't open pipe to yaz-client: $!"; 115 116 $strOpen = &run_command_with_output("open $url"); 123 my $yaz = $self->{'yaz'}; 124 125 my $childpid = open2(*YAZOUT, *YAZIN, $yaz) 126 or (print STDERR "<<Finished>>\n" and die "can't open pipe to yaz-client: $!"); 127 $self->{'YAZOUT'} = *YAZOUT; 128 $self->{'YAZIN'} = *YAZIN; 129 130 $strOpen = $self->open_connection("open $url"); 131 132 if (!$strOpen) { 133 print STDERR "Cannot connect to $url\n"; 134 print STDERR "<<Finished>>\n"; 135 return 0; 136 } 137 117 138 print STDERR "Access database: \"$self->{'database'}\"\n"; 118 &run_command_without_output("base $self->{'database'}");139 $self->run_command_without_output("base $self->{'database'}"); 119 140 print STDERR "Searching for keyword: \"$self->{'find'}\"\n"; 120 $intAmount = &findAmount($self->{'find'});141 $intAmount = $self->findAmount($self->{'find'}); 121 142 122 143 if($intAmount <= 0) 123 144 { 124 145 ($intAmount == -1)? 125 print STDERR "Unexpected format, Parsing operation can not be performed\n" : 126 print STDERR "No Record is found\n"; 146 print STDERR "Something wrong with the arguments,downloading can not be performed\n": 147 print STDERR "No Record is found\n"; 148 print STDERR "<<Finished>>\n"; 127 149 return 0; 128 150 } 129 151 $intMaxRecords = ($self->{'max_records'} > $intAmount)? $intAmount : $self->{'max_records'}; 130 152 print STDERR "<<Total number of record(s):$intMaxRecords>>\n"; 131 $strRecords = &getRecords($intMaxRecords);132 print STDERR $strRecords;133 &saveRecords($self,$strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords);153 $strRecords = "Records: $intMaxRecords\n".$self->getRecords($intMaxRecords); 154 155 $self->saveRecords($strRecords,$hashGeneralOptions->{'cache_dir'},$intMaxRecords); 134 156 print STDERR "Closing connection...\n"; 157 print STDERR "<<Finished>>\n"; 158 135 159 close(YAZOUT); 136 160 close(YAZIN); 137 waitpid($childpid, 0);138 161 return 1; 139 162 } 140 163 164 sub open_connection{ 165 my ($self,$strCommand) = (@_); 166 167 $self->run_command($strCommand); 168 169 my $out = $self->{'YAZOUT'}; 170 171 $_ = <$out>; 172 173 return (/Connecting...OK/i)? 1: 0; 174 175 } 176 141 177 sub findAmount 142 178 { 179 my ($self) = shift (@_); 143 180 my($strFindTarget) = @_; 144 my $strResponse = &run_command_with_output("find $strFindTarget");145 181 my $strResponse = $self->run_command_with_output("find $strFindTarget","^Number of hits:"); 182 return ($strResponse =~ m/^Number of hits: (\d+)/m)? $1:-1; 146 183 } 147 184 148 185 sub getRecords 149 186 { 187 my ($self) = shift (@_); 150 188 my ($intMaxRecords) = @_; 151 my ($strShow,$intStartNumber,$ strResponse,$strRecords,$intRecordsLeft);189 my ($strShow,$intStartNumber,$numRecords,$strResponse,$strRecords,$intRecordsLeft); 152 190 153 191 $intStartNumber = 1; 154 192 $intRecordsLeft = $intMaxRecords; 193 $numRecords = 0; 194 $strResponse =""; 195 155 196 while ($intRecordsLeft > 0) 156 197 { 157 198 if($intRecordsLeft > 50) 158 199 { 159 print STDERR "<<Done:50>>\n";200 160 201 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+49)."\n"; 161 202 $numRecords = 50; 162 203 $strShow = "show $intStartNumber+50"; 163 204 $intStartNumber = $intStartNumber + 50; 164 205 $intRecordsLeft = $intRecordsLeft - 50; 206 165 207 } 166 208 else 167 209 { 168 print STDERR "<<Done:".($intRecordsLeft).">>\n";210 $numRecords = $intRecordsLeft; 169 211 print STDERR "Yaz is Gathering records: $intStartNumber - ".($intStartNumber+$intRecordsLeft-1)."\n"; 170 212 $strShow = "show $intStartNumber+$intRecordsLeft"; 171 213 $intRecordsLeft = 0; 214 215 } 216 217 $strResponse .= $self->get($strShow,$numRecords); 218 219 if ($strResponse eq ""){ 220 print STDERR "<<ERROR: failed to get $numRecords records>>\n"; 172 221 } 222 else{ 223 print STDERR "<<Done:$numRecords>>\n"; 224 } 225 } 226 227 return "$strResponse\n"; 173 228 174 $strResponse = &run_command_with_output($strShow);175 176 if($strResponse =~ m/Records: (\d*?)\n(.*?)nextResultSetPosition = (\d*?)\n/s)177 {178 $strRecords .= $2;179 }180 }181 return $strRecords;182 229 } 183 230 … … 188 235 # setup directories 189 236 # Currently only gather the MARC format 190 my $strFileName = &generateFileName($self,$intMaxRecords); 237 my $strFileName = $self->generateFileName($intMaxRecords); 238 239 $strOutputDir =~ s/"//g; 240 191 241 my $strFileURL = "$strOutputDir/$self->{'host'}/$strFileName.marc"; 192 $strFileURL =~ s/:/\//g; 193 242 194 243 # prepare subdirectory for record (if needed) 195 my ($strSubDirPath,$unused) = dirFileSplit($strFileURL); 244 my ($strSubDirPath,$unused) = $self->dirFileSplit($strFileURL); 245 196 246 &util::mk_all_dir($strSubDirPath); 197 247 198 248 my $ds = &util::get_dirsep(); 199 249 my $strOutputFile = &util::filename_cat($strOutputDir,$self->{'host'},"$strFileName.marc"); 200 $strOutputFile =~ s/:/$ds/g; 201 250 202 251 print STDERR "Saving records to \"$strOutputFile\"\n"; 203 252 204 253 # save record 205 254 open (ZOUT,">$strOutputFile") 206 || die "Unable to save oai metadatarecord: $!\n";255 || die "Unable to save Z3950 record: $!\n"; 207 256 print ZOUT $strRecords; 208 257 close(ZOUT); 209 258 } 210 259 260 211 261 sub run_command_with_output 212 262 { 263 my ($self,$strCMD,$strStopRE) =@_; 264 265 $self->run_command($strCMD); 266 267 return $self->get_output($strStopRE); 268 269 } 270 271 sub get{ 272 my ($self,$strShow,$numRecord) = @_; 273 274 $self->run_command($strShow); 275 276 my $strFullOutput=""; 277 my $count=0; 278 my $readRecord = 0; 279 280 while (my $strLine = <YAZOUT>) 281 { 282 283 if ($strLine =~ m/Records: ([\d]*)/i ){ 284 $readRecord = 1; 285 next; 286 } 287 288 return $strFullOutput if ($strLine =~ m/nextResultSetPosition|Not connected/i); 289 290 next if(!$readRecord); 291 292 $strFullOutput .= $strLine; 293 } 294 295 } 296 297 sub run_command_without_output 298 { 299 my ($self) = shift (@_); 213 300 my ($strCMD) = @_; 214 215 return &run_command($strCMD,"^Elapsed:.*\$"); 216 } 217 218 sub run_command_without_output 219 { 220 my ($strCMD) = @_; 221 222 &run_command($strCMD); 301 302 $self->run_command($strCMD); 223 303 } 224 304 225 305 sub run_command 226 306 { 227 my ($strCMD,$strStopRE) = @_; 228 229 230 print YAZIN "$strCMD\n"; 307 my ($self,$strCMD) = @_; 308 309 my $input = $self->{'YAZIN'}; 310 311 print $input "$strCMD\n"; 312 } 313 314 sub get_output{ 315 my ($self,$strStopRE) = @_; 316 231 317 if (!defined $strStopRE){return "";} 232 318 else 233 319 { 234 320 my $strFullOutput; 235 while (my $strLine = <YAZOUT>) 321 my $output = $self->{'YAZOUT'}; 322 while (my $strLine = <$output>) 236 323 { 237 $strFullOutput .= $strLine; 238 if($strLine =~ m/$strStopRE/){return $strFullOutput;}324 $strFullOutput .= $strLine; 325 if($strLine =~ m/^$strStopRE|Not connected/i){return $strFullOutput;} 239 326 } 240 327 } … … 245 332 my ($self,$intMaxRecords) = @_; 246 333 my $strFileName = ($self->{'database'})."_".($self->{'find'})."_".($intMaxRecords); 334 247 335 } 248 336 249 337 sub dirFileSplit 250 338 { 251 my ($strFile) = @_; 252 253 my @aryDirs = split("/",$strFile); 339 my ($self,$strFile) = @_; 340 341 my @aryDirs = split("[/\]",$strFile); 342 254 343 my $strLocalFile = pop(@aryDirs); 255 344 my $strSubDirs = join("/",@aryDirs); … … 258 347 } 259 348 349 sub url_information 350 { 351 my ($self,$url) = @_; 352 353 $url = $self->{'url'} unless defined $url; 354 355 my $yaz = $self->{'yaz'}; 356 357 my $childpid = open2(*YAZOUT, *YAZIN, $yaz) 358 or die "can't open pipe to yaz-client: $!"; 359 360 $self->{'YAZOUT'} = *YAZOUT; 361 $self->{'YAZIN'} = *YAZIN; 362 363 my $strOpen = $self->open_connection("open $url"); 364 365 if (!$strOpen) { 366 print STDERR "Cannot connect to $url\n"; 367 print STDERR "<<Finished>>\n"; 368 return 0; 369 } 370 371 372 $strOpen = $self->run_command_with_output("open $url","^Options"); 373 374 375 $strOpen =~ s/Z> //g; 376 $strOpen =~ s/Elapsed:.*//g; 377 378 print STDERR $strOpen; 379 380 print STDERR "<<Finished>>\n"; 381 382 close(YAZOUT); 383 close(YAZIN); 384 385 return 0; 386 387 } 388 260 389 sub error 261 390 { 262 my ($s trFunctionName,$strError) = @_;391 my ($self,$strFunctionName,$strError) = @_; 263 392 { 264 393 print STDERR "Error occoured in Z3950Download.pm\n".
Note:
See TracChangeset
for help on using the changeset viewer.