Changeset 20926 for gsdl


Ignore:
Timestamp:
2009-11-11T11:28:33+13:00 (14 years ago)
Author:
kjdon
Message:

If a source doc can't be downloaded for some reason, move on to the next one. Don't quit, otherwise no more of the records will be downloaded. This way, one at least gets the records even if the source docs can't be retrieved. Doc URLs may have the XML entity &apos; in them; convert it to a literal ' before trying to download.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/downloaders/OAIDownload.pm

    r17668 r20926  
    295295    {
    296296        my $doc_id_url = $2;
    297 
     297        print STDERR "Found doc url: $doc_id_url\n";
    298298        next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
    299299
     
    334334        my $wget_opts2 = $self->getWgetOptions();
    335335        my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
    336 
    337336        my ($stdout_and_err2,$error2,$follow2) =  $self->useWgetMonitored($wget_cmd2);
    338337        return $strRecord if $self->{'forced_quit'};
     
    340339        if($error2 ne "")
    341340        {
    342             print STDERR "Error occured while retrieving OAI source documents: $error2\n";
    343             exit(-1);
     341            print STDERR "Error occured while retrieving OAI source documents (1): $error2\n";
     342            #exit(-1);
     343            next;
    344344        }
    345345
     
    386386        }
    387387        else {
    388             print STDERR "Error occurred while retrieving OAI source documents:\n";
     388            print STDERR "Error occurred while retrieving OAI source documents (2):\n";
    389389            print STDERR "$!\n";
    390390        }
     
    402402        my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
    403403
     404        # may have &apos; in url - others??
     405        my $safe_doc_id_url = $doc_id_url;
     406        $safe_doc_id_url =~ s/'/\'/g;
     407
    404408        my $wget_opts = $self->getWgetOptions();
    405         my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$doc_id_url\"";
    406 
     409        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$safe_doc_id_url\"";
     410       
    407411        my ($stdout_and_err,$errors,$follow) =  $self->useWgetMonitored($wget_cmd);
    408412        return $strRecord if $self->{'forced_quit'};
     
    410414        if($errors ne "")
    411415        {
    412         print STDERR "Error occured while retriving OAI souce documents:\n";
     416        print STDERR "Error occured while retriving OAI souce documents (3):\n";
    413417        print STDERR "$errors\n";
    414         exit(-1);
     418        #exit(-1);
     419        next;
    415420        }
    416421
    417422       
    418         $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$orig_doc_id_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<${2}identifier>$orig_doc_id_url<\/${2}identifier>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
     423        $strRecord =~ s/<metadata>(.*?)<((?:dc:)?identifier)>$orig_doc_id_url<\/((?:dc:)?identifier)>(.*?)<\/metadata>/<metadata>$1<${2}>$orig_doc_id_url<\/${2}>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
    419424    }
    420425
     
    446451    my $wget_opts = $self->getWgetOptions();
    447452    my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
    448 
     453   
    449454    my $strRecord =  $self->useWget($cmdWget);
    450455
Note: See TracChangeset for help on using the changeset viewer.