Changeset 16791 for gsdl/trunk/perllib/downloaders/OAIDownload.pm
- Timestamp:
- 2008-08-14T16:42:19+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/downloaders/OAIDownload.pm
r16725 r16791 66 66 'type' => "flag", 67 67 'reqd' => "no"}, 68 { 'name' => "get_doc_exts", 69 'disp' => "{OAIDownload.get_doc_exts_disp}", 70 'desc' => "{OAIDownload.get_doc_exts}", 71 'type' => "string", 72 'deft' => "doc,pdf,ppt", 73 'reqd' => "no"}, 68 74 { 'name' => "max_records", 69 75 'disp' => "{OAIDownload.max_records_disp}", … … 80 86 'args' => $arguments }; 81 87 82 my $self;83 84 my $strWgetOptions="";88 ##my $self; 89 90 #### my $strWgetOptions=""; 85 91 86 92 sub new … … 93 99 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)}; 94 100 95 $self = (defined $hashArgOptLists)? new WgetDownload($getlist,$inputargs,$hashArgOptLists): new WgetDownload($getlist,$inputargs);101 my $self = (defined $hashArgOptLists)? new WgetDownload($getlist,$inputargs,$hashArgOptLists): new WgetDownload($getlist,$inputargs); 96 102 97 103 if ($self->{'info_only'}) { … … 101 107 102 108 my $parser = new XML::Parser('Style' => 'Stream', 109 'PluginObj' => $self, 103 110 'Handlers' => {'Char' => \&Char, 104 111 'Start' => \&OAI_StartTag, … … 113 120 } 114 121 122 123 # set up hashmap for individual items in get_doc_exts 124 # to make testing for matches easier 125 126 $self->{'lookup_exts'} = {}; 127 my $get_doc_exts = $self->{'get_doc_exts'}; 128 129 if ((defined $get_doc_exts) && ($get_doc_exts ne "")) { 130 my @exts = split(/,\s*/,$get_doc_exts); 131 foreach my $e (@exts) { 132 $self->{'lookup_exts'}->{lc($e)} = 1; 133 } 134 } 135 136 115 137 return bless $self, $class; 116 138 } … … 121 143 my ($hashGeneralOptions) = @_; 122 144 123 ## print STDERR "here2"; 124 125 $strWgetOptions = $self->getWgetOptions(); 126 my $cmdWget = $strWgetOptions; 145 ## my $cmdWget = $strWgetOptions; 127 146 128 147 my $strOutputDir =""; … … 136 155 my $strIDs = $self->getOAIIDs($strBasURL); 137 156 138 if($strIDs eq "")139 { 140 print STDERR "Error: No ID beingfound\n";157 if($strIDs eq "") 158 { 159 print STDERR "Error: No IDs found\n"; 141 160 return 0; 142 161 } 162 143 163 my $aryIDs = $self->parseOAIIDs($strIDs); 144 164 my $intIDs = 0; … … 155 175 $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc); 156 176 157 my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";158 &util::rm($tmp_file);177 # my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp"); 178 # &util::rm($tmp_file); 159 179 160 180 return 1; … … 164 184 { 165 185 my ($self,$strBasURL) = @_; 166 my ($cmdWget);186 ## my ($cmdWget); 167 187 168 188 my $wgetOptions = $self->getWgetOptions(); 169 189 170 $cmdWget = $wgetOptions;190 my $cmdWget = $wgetOptions; 171 191 172 192 print STDERR "Gathering OAI identifiers.....\n"; … … 256 276 sub getOAIDoc 257 277 { 258 my ($self,$strRecord, $ strSubDirPath) = @_;278 my ($self,$strRecord, $oai_rec_filename) = @_; 259 279 260 280 print STDERR "Gathering source documents.....\n"; … … 264 284 { 265 285 my $strMetaTag = $1; 266 267 if ($strMetaTag =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s) 286 my $had_valid_url = 0; 287 288 while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is) 268 289 { 269 my $strDocURL = $2; 270 271 my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL); 272 273 my $strSoureDirPath =""; 274 275 $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs"); 276 277 &util::mk_dir($strSoureDirPath) if (!-e "$strSoureDirPath"); 278 279 my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile); 280 281 my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strDocURL\""; 282 283 my $strResponse = $self->useWget($wget_cmd,1); 284 285 if($strResponse ne "") 290 my $doc_id_url = $2; 291 292 next if ($doc_id_url !~ m/^(https?|ftp):\/\//); 293 294 my $orig_doc_id_url = $doc_id_url; 295 $had_valid_url = 1; 296 297 my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/); 298 my $faked_ext = 0; 299 my $primary_doc_match = 0; 300 301 my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/); 302 303 if (defined $id_file_ext) { 304 # cross-check this filename extension with get_doc_exts option 305 # if provided 306 my $lookup_exts = $self->{'lookup_exts'}; 307 308 if (defined $lookup_exts->{lc($id_file_ext)}) { 309 # this initial URL matches requirement 310 $primary_doc_match = 1; 311 } 312 } 313 else { 314 $faked_ext = 1; 315 $id_file_ext = "html"; 316 } 317 318 319 if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) { 320 # Download this doc if HTML, scan through it looking for a link 321 # that does match get_doc_exts 322 323 324 # 1. Generate a tmp name 325 my $tmp_filename = &util::get_tmp_filename(); 326 327 # 2. Download it 328 my $wget_opts2 = $self->getWgetOptions(); 329 my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\""; 330 331 my ($stdout_and_err2,$error2,$follow2) = $self->useWgetMonitored($wget_cmd2); 332 333 if($error2 ne "") 334 { 335 print STDERR "Error occured while retrieving OAI source documents: $error2\n"; 336 exit(-1); 337 } 338 339 if (defined $follow2) { 340 # src url was "redirected" to another place 341 # => pick up on this and make it the new doc_id_url 342 $doc_id_url = $follow2; 343 } 344 345 my $primary_doc_html = ""; 346 if (open(HIN,"<$tmp_filename")) { 347 my $line; 348 while (defined ($line = <HIN>)) { 349 $primary_doc_html .= $line; 350 } 351 close(HIN); 352 353 # 3. Scan through it looking for match 354 # 355 # if got match, change $doc_id_url to this new URL and 356 # $id_file_ext to 'match' 357 358 my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi); 359 360 my $lookup_exts = $self->{'lookup_exts'}; 361 362 foreach my $href (@href_links) { 363 my ($ext) = ($href =~ m/\.([^\.]+)$/); 364 365 if ((defined $ext) && (defined $lookup_exts->{$ext})) { 366 367 if ($href !~ m/^(https?|ftp):\/\//) { 368 # link is within current site 369 my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//); 370 371 $href = "$site_domain$href"; 372 } 373 374 $doc_id_url = $href; 375 $id_file_ext = $ext; 376 last; 377 } 378 } 379 } 380 else { 381 print STDERR "Error occurred while retrieving OAI source documents:\n"; 382 print STDERR "$!\n"; 383 } 384 385 if (-e $tmp_filename) { 386 &util::rm($tmp_filename); 387 } 388 } 389 390 my $download_doc_filename = $oai_rec_filename; 391 $download_doc_filename =~ s/\.oai$/\.$id_file_ext/; 392 393 my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename); 394 395 my $wget_opts = $self->getWgetOptions(); 396 my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$doc_id_url\""; 397 398 my ($stdout_and_err,$errors,$follow) = $self->useWgetMonitored($wget_cmd); 399 400 if($errors ne "") 286 401 { 287 print STDERR "Error occured while retriving OAI souce documents: $strResponse\n"; 402 print STDERR "Error occured while retriving OAI souce documents:\n"; 403 print STDERR "$errors\n"; 288 404 exit(-1); 289 405 } 290 406 291 $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$strDocURL<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$strDocURL<\/OrigURL>\n <identifier>srcdocs\/$strDocFile<\/identifier>$4<\/metadata>/s; 407 408 $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$orig_doc_id_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<${2}identifier>$orig_doc_id_url<\/${2}identifier>\n <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s; 292 409 } 293 else 410 411 if (!$had_valid_url) 294 412 { 295 413 print STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n"; … … 300 418 print STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n"; 301 419 } 302 420 421 return $strRecord; 303 422 } 304 423 … … 313 432 foreach my $strID ( @$aryIDs) 314 433 { 315 print STDERR "Gathering OAI record with ID :$strID.....\n";434 print STDERR "Gathering OAI record with ID $strID.....\n"; 316 435 317 my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\""; 436 my $wget_opts = $self->getWgetOptions(); 437 my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\""; 318 438 319 439 my $strRecord = $self->useWget($cmdWget); 320 440 321 322 my @fileDirs = split(":",$strID); 441 my @fileDirs = split(":",$strID); 442 my $local_id = pop @fileDirs; 323 443 324 444 # setup directories … … 328 448 my $host =$self->{'url'}; 329 449 330 $host =~ s/http :\/\///g;450 $host =~ s/https?:\/\///g; 331 451 332 452 $host =~ s/:.*//g; 333 453 334 my $ midDir = join ("/",@fileDirs);335 my $strFileURL = "$strOutputDir/$host/".$midDir.".oai"; 454 my $strFileURL = "$strOutputDir/$host/$local_id.oai"; 455 336 456 337 457 # prepare subdirectory for record (if needed) … … 346 466 if($blnDownloadDoc) 347 467 { 348 $s elf->getOAIDoc($strRecord,$strSubDirPath);468 $strRecord = $self->getOAIDoc($strRecord,$strFileURL); 349 469 } 350 470 … … 378 498 my $strIdentify = "verb=Identify"; 379 499 my $strListSets = "verb=ListSets"; 500 my $strListMdFormats = "verb=ListMetadataFormats"; 380 501 381 502 my $strIdentifyCMD = $strBaseCMD; … … 392 513 print STDERR "General information:\n"; 393 514 $self->parse_xml($strIdentifyText); 515 print STDERR "\n"; 516 517 print STDERR "=" x 10, "\n"; 518 print STDERR "Metadata Format Information (metadataPrefix):\n"; 519 print STDERR "=" x 10, "\n"; 520 521 my $strListMdFormatsCMD = $strBaseCMD; 522 $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/; 523 my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD); 524 525 $self->parse_xml($strListMdFormatsText); 526 print STDERR "\n"; 527 528 print STDERR "=" x 10, "\n"; 529 print STDERR "List Information:\n"; 530 print STDERR "=" x 10, "\n"; 394 531 395 532 my $strListSetCMD = $strBaseCMD; … … 397 534 my $strListSetsText = $self->useWget($strListSetCMD); 398 535 399 400 print STDERR "List Information:\n";401 536 $self->parse_xml($strListSetsText); 402 537 } … … 405 540 { 406 541 my ($self) = shift (@_); 407 my ($ strOutputText) = @_;542 my ($xml_text) = @_; 408 543 544 #### change this to work directly from $xml_text 545 409 546 #Open a temporary file to store OAI information, and store the information to the temp file 410 my $name = "$ENV{GSDLHOME}/tmp/oai.tmp";547 my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp"); 411 548 412 549 open(*OAIOUT,"> $name"); 413 550 414 print OAIOUT $ strOutputText;551 print OAIOUT $xml_text; 415 552 close(OAIOUT); 416 553 417 554 $self->{'temp_file_name'} = $name; 555 556 ## print STDERR "**** xml text = $xml_text\n"; 418 557 419 558 eval { 420 559 $self->{'parser'}->parsefile("$name"); 560 ## $self->{'parser'}->parse($xml_text); 421 561 }; 422 562 423 563 if ($@) { 424 die "OAI: $name is not a well formed XML file ($@)\n"; 425 } 426 } 427 428 END{ 429 if($self->{'info'}) 430 { 431 unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!"; 432 } 433 } 564 die "OAI: Parsed file $name is not a well formed XML file ($@)\n"; 565 ## die "OAI: Parsed text is not a well formed XML file ($@)\n"; 566 } 567 568 unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!"; 569 } 570 571 ####END 572 #{ 573 # if($self->{'info'}) 574 # { 575 # unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!"; 576 # } 577 #} 434 578 435 579 # This Char function overrides the one in XML::Parser::Stream to overcome a … … 439 583 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+ 440 584 $_[0]->{'Text'} .= $_[1]; 585 586 my $self = $_[0]->{'PluginObj'}; 441 587 if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) { 442 588 $self->{'text'} .= $_[1]; … … 454 600 my ($expat, $element, %attr) = @_; 455 601 602 my $self = $expat->{'PluginObj'}; 456 603 $self->{'subfield'} = $element; 457 604 … … 461 608 { 462 609 my ($expat, $element) = @_; 610 611 my $self = $expat->{'PluginObj'}; 463 612 $self->{'text'} = ""; 464 613 $self->{'subfield'} = "";
Note:
See TracChangeset
for help on using the changeset viewer.