Changeset 20926

Show
Ignore:
Timestamp:
11.11.2009 11:28:33 (10 years ago)
Author:
kjdon
Message:

if can't download a source doc for some reason, move on to the next one. Don't quit, otherwise won't get any more of the records downloaded. This way, at least one gets the records even if can't get the source docs. doc urls may have &apos; in them, convert to ' before trying to download

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/downloaders/OAIDownload.pm

    r17668 r20926  
    295295    { 
    296296        my $doc_id_url = $2; 
    297  
     297        print STDERR "Found doc url: $doc_id_url\n"; 
    298298        next if ($doc_id_url !~ m/^(https?|ftp):\/\//); 
    299299 
     
    334334        my $wget_opts2 = $self->getWgetOptions(); 
    335335        my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\""; 
    336  
    337336        my ($stdout_and_err2,$error2,$follow2) =  $self->useWgetMonitored($wget_cmd2); 
    338337        return $strRecord if $self->{'forced_quit'}; 
     
    340339        if($error2 ne "") 
    341340        { 
    342             print STDERR "Error occured while retrieving OAI source documents: $error2\n"; 
    343             exit(-1); 
     341            print STDERR "Error occured while retrieving OAI source documents (1): $error2\n"; 
     342            #exit(-1); 
     343            next;  
    344344        } 
    345345 
     
    386386        } 
    387387        else { 
    388             print STDERR "Error occurred while retrieving OAI source documents:\n"; 
     388            print STDERR "Error occurred while retrieving OAI source documents (2):\n"; 
    389389            print STDERR "$!\n"; 
    390390        } 
     
    402402        my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename); 
    403403 
     404        # may have &apos; in url - others?? 
     405        my $safe_doc_id_url = $doc_id_url; 
     406        $safe_doc_id_url =~ s/&apos;/\'/g; 
     407 
    404408        my $wget_opts = $self->getWgetOptions(); 
    405         my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$doc_id_url\""; 
    406  
     409        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\""; 
     410         
    407411        my ($stdout_and_err,$errors,$follow) =  $self->useWgetMonitored($wget_cmd); 
    408412        return $strRecord if $self->{'forced_quit'}; 
     
    410414        if($errors ne "") 
    411415        { 
    412         print STDERR "Error occured while retriving OAI souce documents:\n"; 
     416        print STDERR "Error occured while retriving OAI souce documents (3):\n"; 
    413417        print STDERR "$errors\n"; 
    414         exit(-1); 
     418        #exit(-1); 
     419        next; 
    415420        } 
    416421 
    417422         
    418         $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$orig_doc_id_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<${2}identifier>$orig_doc_id_url<\/${2}identifier>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s; 
     423        $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s; 
    419424    } 
    420425 
     
    446451    my $wget_opts = $self->getWgetOptions(); 
    447452    my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\""; 
    448  
     453     
    449454    my $strRecord =  $self->useWget($cmdWget); 
    450455