Changeset 16791


Ignore:
Timestamp:
2008-08-14T16:42:19+12:00 (16 years ago)
Author:
davidb
Message:

Improvement to downloading capabilities for WGet and OAI

Location:
gsdl/trunk/perllib/downloaders
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/downloaders/OAIDownload.pm

    r16725 r16791  
    6666    'type' => "flag",
    6767    'reqd' => "no"},
     68      { 'name' => "get_doc_exts",
     69    'disp' => "{OAIDownload.get_doc_exts_disp}",
     70    'desc' => "{OAIDownload.get_doc_exts}",
     71    'type' => "string",
     72    'deft' => "doc,pdf,ppt",
     73    'reqd' => "no"},
    6874      { 'name' => "max_records",
    6975    'disp' => "{OAIDownload.max_records_disp}",
     
    8086        'args'     => $arguments };
    8187
    82 my $self;
    83 
    84 my $strWgetOptions="";
     88##my $self;
     89
     90#### my $strWgetOptions="";
    8591
    8692sub new
     
    9399    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
    94100
    95     $self = (defined $hashArgOptLists)? new WgetDownload($getlist,$inputargs,$hashArgOptLists): new WgetDownload($getlist,$inputargs);
     101    my $self = (defined $hashArgOptLists)? new WgetDownload($getlist,$inputargs,$hashArgOptLists): new WgetDownload($getlist,$inputargs);
    96102
    97103    if ($self->{'info_only'}) {
     
    101107
    102108    my $parser = new XML::Parser('Style' => 'Stream',
     109                                 'PluginObj' => $self,
    103110                 'Handlers' => {'Char' => \&Char,
    104111                        'Start' => \&OAI_StartTag,
     
    113120    }
    114121   
     122
     123    # set up hashmap for individual items in get_doc_exts
     124    # to make testing for matches easier
     125
     126    $self->{'lookup_exts'} = {};
     127    my $get_doc_exts = $self->{'get_doc_exts'};
     128
     129    if ((defined $get_doc_exts) && ($get_doc_exts ne "")) {
     130    my @exts = split(/,\s*/,$get_doc_exts);
     131    foreach my $e (@exts) {
     132        $self->{'lookup_exts'}->{lc($e)} = 1;
     133    }
     134    }
     135
     136
    115137    return bless $self, $class;
    116138}
     
    121143    my ($hashGeneralOptions) = @_;
    122144
    123 ##    print STDERR "here2";
    124    
    125     $strWgetOptions = $self->getWgetOptions();
    126     my $cmdWget = $strWgetOptions;
     145##    my $cmdWget = $strWgetOptions;
    127146 
    128147    my $strOutputDir ="";
     
    136155    my $strIDs = $self->getOAIIDs($strBasURL);
    137156 
    138    if($strIDs eq "")
    139     {
    140     print STDERR "Error: No ID being found\n";
     157    if($strIDs eq "")
     158    {
     159    print STDERR "Error: No IDs found\n";
    141160    return 0;
    142161    }
     162
    143163    my $aryIDs = $self->parseOAIIDs($strIDs);
    144164    my $intIDs = 0;
     
    155175    $self->getOAIRecords($aryIDs, $strOutputDir, $strBasURL, $intMaxRecords, $blnDownloadDoc);
    156176
    157     my $tmp_file = "$ENV{GSDLHOME}/tmp/oai.tmp";
    158     &util::rm($tmp_file);
     177#    my $tmp_file = &util::filename_cat($ENV{'GSDLHOME'},"tmp","oai.tmp");
     178#    &util::rm($tmp_file);
    159179
    160180    return 1;
     
    164184{
    165185    my ($self,$strBasURL) = @_;
    166     my ($cmdWget);
     186##    my ($cmdWget);
    167187     
    168188    my $wgetOptions = $self->getWgetOptions();
    169189
    170     $cmdWget = $wgetOptions;
     190    my $cmdWget = $wgetOptions;
    171191 
    172192    print STDERR  "Gathering OAI identifiers.....\n";
     
    256276sub getOAIDoc
    257277{
    258     my ($self,$strRecord, $strSubDirPath) = @_;
     278    my ($self,$strRecord, $oai_rec_filename) = @_;
    259279 
    260280    print  STDERR "Gathering source documents.....\n";
     
    264284    {
    265285    my $strMetaTag = $1;
    266    
    267     if ($strMetaTag =~ m/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>/s)
     286    my $had_valid_url = 0;
     287
     288    while ($strMetaTag =~ s/<(dc:)?identifier>(.*?)<\/(dc:)?identifier>//is)
    268289    {
    269         my $strDocURL = $2;
    270 
    271         my ($unused,$strDocFile) = $self->dirFileSplit($strDocURL);
    272 
    273             my $strSoureDirPath ="";
    274 
    275         $strSoureDirPath = &util::filename_cat($strSubDirPath,"srcdocs");
    276 
    277         &util::mk_dir($strSoureDirPath)  if (!-e "$strSoureDirPath");
    278        
    279         my $strFullDocFilePath = &util::filename_cat($strSoureDirPath,$strDocFile);
    280        
    281         my $wget_cmd = $strWgetOptions." -q -O \"$strFullDocFilePath\" \"$strDocURL\"";
    282 
    283         my $strResponse =  $self->useWget($wget_cmd,1);
    284 
    285         if($strResponse ne "")
     290        my $doc_id_url = $2;
     291
     292        next if ($doc_id_url !~ m/^(https?|ftp):\/\//);
     293
     294        my $orig_doc_id_url = $doc_id_url;
     295        $had_valid_url = 1;
     296
     297        my ($doc_dir_url_prefix,$doc_id_tail) = ($doc_id_url =~ m/^(.*)\/(.*?)$/);
     298        my $faked_ext = 0;
     299        my $primary_doc_match = 0;
     300
     301        my ($id_file_ext) = ($doc_id_tail =~ m/\.([^\.]+)$/);
     302
     303        if (defined $id_file_ext) {
     304        # cross-check this filename extension with get_doc_exts option
     305        # if provided
     306        my $lookup_exts = $self->{'lookup_exts'};
     307
     308        if (defined $lookup_exts->{lc($id_file_ext)}) {
     309            # this initial URL matches requirement
     310            $primary_doc_match = 1;
     311        }
     312        }
     313        else {
     314        $faked_ext = 1;
     315        $id_file_ext = "html";
     316        }
     317
     318       
     319        if ((!$primary_doc_match) && ($id_file_ext =~ m/^html?$/i)) {
     320        # Download this doc if HTML, scan through it looking for a link
     321        # that does match get_doc_exts
     322       
     323
     324        # 1. Generate a tmp name
     325        my $tmp_filename = &util::get_tmp_filename();
     326
     327        # 2. Download it
     328        my $wget_opts2 = $self->getWgetOptions();
     329        my $wget_cmd2 = "$wget_opts2 --convert-links -O \"$tmp_filename\" \"$doc_id_url\"";
     330
     331        my ($stdout_and_err2,$error2,$follow2) =  $self->useWgetMonitored($wget_cmd2);
     332
     333        if($error2 ne "")
     334        {
     335            print STDERR "Error occured while retrieving OAI source documents: $error2\n";
     336            exit(-1);
     337        }
     338
     339        if (defined $follow2) {
     340            # src url was "redirected" to another place
     341            # => pick up on this and make it the new doc_id_url
     342            $doc_id_url = $follow2;
     343        }
     344
     345        my $primary_doc_html = "";
     346        if (open(HIN,"<$tmp_filename")) {
     347            my $line;
     348            while (defined ($line = <HIN>)) {
     349            $primary_doc_html .= $line;
     350            }
     351            close(HIN);
     352
     353            # 3. Scan through it looking for match
     354            #
     355            # if got match, change $doc_id_url to this new URL and
     356            # $id_file_ext to 'match'
     357           
     358            my @href_links = ($primary_doc_html =~ m/href="(.*?)"/gsi);
     359
     360            my $lookup_exts = $self->{'lookup_exts'};
     361
     362            foreach my $href (@href_links) {
     363            my ($ext) = ($href =~ m/\.([^\.]+)$/);
     364
     365            if ((defined $ext) && (defined $lookup_exts->{$ext})) {
     366
     367                if ($href !~ m/^(https?|ftp):\/\//) {
     368                # link is within current site
     369                my ($site_domain) = ($doc_id_url =~ m/^((?:https?|ftp):\/\/.*?)\//);
     370
     371                $href = "$site_domain$href";
     372                }
     373
     374                $doc_id_url = $href;
     375                $id_file_ext = $ext;
     376                last;
     377            }
     378            }
     379        }
     380        else {
     381            print STDERR "Error occurred while retrieving OAI source documents:\n";
     382            print STDERR "$!\n";
     383        }
     384
     385        if (-e $tmp_filename) {
     386            &util::rm($tmp_filename);
     387        }
     388        }
     389
     390        my $download_doc_filename = $oai_rec_filename;
     391        $download_doc_filename =~ s/\.oai$/\.$id_file_ext/;
     392
     393        my ($unused,$download_doc_file) = $self->dirFileSplit($download_doc_filename);
     394
     395        my $wget_opts = $self->getWgetOptions();
     396        my $wget_cmd = "$wget_opts --convert-links -O \"$download_doc_filename\" \"$doc_id_url\"";
     397
     398        my ($stdout_and_err,$errors,$follow) =  $self->useWgetMonitored($wget_cmd);
     399
     400        if($errors ne "")
    286401        {
    287         print STDERR "Error occured while retriving OAI souce documents: $strResponse\n";
     402        print STDERR "Error occured while retriving OAI souce documents:\n";
     403        print STDERR "$errors\n";
    288404        exit(-1);
    289405        }
    290406
    291         $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$strDocURL<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<OrigURL>$strDocURL<\/OrigURL>\n   <identifier>srcdocs\/$strDocFile<\/identifier>$4<\/metadata>/s;
     407       
     408        $strRecord =~ s/<metadata>(.*?)<(dc:)?identifier>$orig_doc_id_url<\/(dc:)?identifier>(.*?)<\/metadata>/<metadata>$1<${2}identifier>$orig_doc_id_url<\/${2}identifier>\n   <gi.Sourcedoc>$download_doc_file<\/gi.Sourcedoc>$4<\/metadata>/s;
    292409    }
    293     else
     410
     411    if (!$had_valid_url)
    294412    {
    295413        print  STDERR "\tNo souce document URL is specified in the OAI record (No (dc:)?identifier is provided)\n";
     
    300418    print  STDERR "\tNo souce document URL is specified in the OAI record (No metadata field is provided)\n";
    301419    }
    302    
     420
     421    return $strRecord;
    303422}
    304423
     
    313432    foreach my $strID ( @$aryIDs)
    314433    {
    315     print  STDERR "Gathering OAI record with ID:$strID.....\n";
     434    print  STDERR "Gathering OAI record with ID $strID.....\n";
    316435       
    317     my $cmdWget= $strWgetOptions." -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
     436    my $wget_opts = $self->getWgetOptions();
     437    my $cmdWget= "$wget_opts -q -O - \"$strBasURL?verb=GetRecord&metadataPrefix=$metadata_prefix&identifier=$strID\"";
    318438
    319439    my $strRecord =  $self->useWget($cmdWget);
    320440
    321        
    322         my @fileDirs = split(":",$strID); 
     441    my @fileDirs = split(":",$strID); 
     442    my $local_id = pop @fileDirs;
    323443
    324444    # setup directories
     
    328448        my $host =$self->{'url'};
    329449 
    330         $host =~ s/http:\/\///g;
     450        $host =~ s/https?:\/\///g;
    331451
    332452        $host =~ s/:.*//g;
    333453
    334     my $midDir = join ("/",@fileDirs);
    335     my $strFileURL = "$strOutputDir/$host/".$midDir.".oai";
     454    my $strFileURL = "$strOutputDir/$host/$local_id.oai";
     455
    336456
    337457    # prepare subdirectory for record (if needed)
     
    346466    if($blnDownloadDoc)
    347467    {
    348         $self->getOAIDoc($strRecord,$strSubDirPath);
     468        $strRecord = $self->getOAIDoc($strRecord,$strFileURL);
    349469    }
    350470
     
    378498    my $strIdentify = "verb=Identify";
    379499    my $strListSets = "verb=ListSets";
     500    my $strListMdFormats = "verb=ListMetadataFormats";
    380501
    381502    my $strIdentifyCMD = $strBaseCMD;
     
    392513    print STDERR "General information:\n";
    393514    $self->parse_xml($strIdentifyText);
     515    print STDERR "\n";
     516
     517    print STDERR "=" x 10, "\n";
     518    print STDERR "Metadata Format Information (metadataPrefix):\n";
     519    print STDERR "=" x 10, "\n";
     520
     521    my $strListMdFormatsCMD = $strBaseCMD;
     522    $strListMdFormatsCMD =~ s/_OPTS_/$strListMdFormats/;   
     523    my $strListMdFormatsText = $self->useWget($strListMdFormatsCMD);
     524
     525    $self->parse_xml($strListMdFormatsText);
     526    print STDERR "\n";
     527
     528    print STDERR "=" x 10, "\n";
     529    print STDERR "List Information:\n";
     530    print STDERR "=" x 10, "\n";
    394531
    395532    my $strListSetCMD = $strBaseCMD;
     
    397534    my $strListSetsText = $self->useWget($strListSetCMD);
    398535
    399 
    400     print STDERR "List Information:\n";
    401536    $self->parse_xml($strListSetsText);
    402537}
     
    405540{   
    406541    my ($self) = shift (@_);
    407     my ($strOutputText) = @_;
     542    my ($xml_text) = @_;
    408543   
     544    #### change this to work directly from $xml_text
     545
    409546    #Open a temporary file to store OAI information, and store the information to the temp file
    410     my $name = "$ENV{GSDLHOME}/tmp/oai.tmp";
     547    my $name = &util::filename_cat($ENV{GSDLHOME},"tmp","oai.tmp");
    411548
    412549    open(*OAIOUT,"> $name");
    413550   
    414     print OAIOUT $strOutputText;
     551    print OAIOUT $xml_text;
    415552    close(OAIOUT);
    416553
    417554    $self->{'temp_file_name'} = $name;
     555
     556##    print STDERR "**** xml text = $xml_text\n";
    418557
    419558    eval {
    420559    $self->{'parser'}->parsefile("$name");
     560##  $self->{'parser'}->parse($xml_text);
    421561    };
    422562   
    423563    if ($@) {
    424     die "OAI: $name is not a well formed XML file ($@)\n";
    425     }
    426 }
    427 
    428 END{
    429     if($self->{'info'})
    430     {
    431     unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
    432     }
    433 }
     564    die "OAI: Parsed file $name is not a well formed XML file ($@)\n";
     565##  die "OAI: Parsed text is not a well formed XML file ($@)\n";
     566    }
     567
     568    unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
     569}
     570
     571####END
     572#{
     573#    if($self->{'info'})
     574#    {
     575#   unlink($self->{'temp_file_name'}) or die "Could not unlink $self->{'temp_file_name'}: $!";
     576#    }
     577#}
    434578
    435579# This Char function overrides the one in XML::Parser::Stream to overcome a
     
    439583    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
    440584    $_[0]->{'Text'} .= $_[1];
     585
     586    my $self = $_[0]->{'PluginObj'};
    441587    if ((defined $self->{'subfield'} && ($self->{'subfield'} ne ""))) {
    442588    $self->{'text'} .= $_[1];
     
    454600    my ($expat, $element, %attr) = @_;
    455601
     602    my $self = $expat->{'PluginObj'};
    456603    $self->{'subfield'} = $element;
    457604   
     
    461608{
    462609    my ($expat, $element) = @_;
     610
     611    my $self = $expat->{'PluginObj'};
    463612    $self->{'text'} = "";
    464613    $self->{'subfield'} = "";
  • gsdl/trunk/perllib/downloaders/WgetDownload.pm

    r14918 r16791  
    105105    my $strOptions = "";
    106106   
    107     if($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
     107    if ($self->{'proxy_on'} && $self->{'proxy_host'} && $self->{'proxy_port'})
    108108    {
    109109
     
    116116    }
    117117
    118     $strOptions .= " -Y on ";
     118    if ($self->{'proxy_on'}) {
     119    $strOptions .= " --proxy ";
     120    }
    119121
    120122    return $strOptions;
     
    167169}
    168170
     171
     172sub useWgetMonitored
     173{
     174    my ($self, $cmdWget,$blnShow, $working_dir) = @_;
     175
     176
     177    my $current_dir = cwd();
     178    my $changed_dir = 0;
     179    if (defined $working_dir && -e $working_dir) {
     180    chdir "$working_dir";
     181    $changed_dir = 1;
     182    }
     183    my $wget_file_path = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wget");
     184    my $command = "\"$wget_file_path\" $cmdWget 2>&1 |";
     185
     186###    print STDERR "**** wget cmd = $command\n";
     187
     188    open(*WIN,$command) || die "wget request failed: $!\n";
     189
     190    my $full_text = "";
     191    my $error_text = "";
     192    my @follow_list = ();
     193    my $line;
     194
     195    while (defined($line=<WIN>))
     196    {
     197    if((defined $blnShow) && $blnShow)
     198    {
     199        print STDERR "$line";
     200    }
     201
     202    if ($line =~ m/^Location:\s*(.*?)\s*\[following\]\s*$/i) {
     203        my $follow_url = $1;
     204        push(@follow_list,$follow_url);
     205    }
     206
     207    if ($line =~ m/ERROR\s+\d+/) {
     208        $error_text .= $line;
     209    }
     210
     211    $full_text .= $line;
     212    }
     213
     214    close(WIN);
     215
     216    my $command_status = $?;
     217    if ($command_status != 0) {
     218    $error_text .= "Exit error: $command_status";
     219    }
     220
     221    if ($changed_dir) {
     222    chdir $current_dir;
     223    }
     224   
     225    my $final_follow = pop(@follow_list); # might be undefined, but that's OK
     226   
     227    return ($full_text,$error_text,$final_follow);
     228}
     229
     230
    169231# TODO: Check if the URL is valid?? Not sure what should be in this function yet!!
    170232sub checkURL
Note: See TracChangeset for help on using the changeset viewer.