Changeset 26932

Show
Ignore:
Timestamp:
26.02.2013 09:52:47 (7 years ago)
Author:
jmt12
Message:

Altered all calls to built-in Perl file tests to use the equivalent util library functions instead. This allows better awareness of HDFS and other unusual file paths. Added support for the newer version of manifests (where files are followed verbatim). Only write OIDcount for collections that use numerical OIDs.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm

    r25401 r26932  
    234234    # fill in the default import and archives directories if none 
    235235    # were supplied, turn all \ into / and remove trailing / 
    236     $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq ""; 
    237     $importdir =~ s/[\\\/]+/\//g; 
    238     $importdir =~ s/\/$//; 
    239     if (!-e $importdir) { 
     236    if ($importdir eq "") 
     237    { 
     238      $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import"); 
     239    } 
     240    else 
     241    { 
     242      # hijack filename_cat to sanitize the user provided importdir [jmt12] 
     243      $importdir = &util::filename_cat ($importdir); 
     244    } 
     245    if (!&util::dir_exists($importdir)) { 
    240246    &gsprintf($out, "{import.no_import_dir}\n\n", $importdir); 
    241247    die "\n"; 
     
    256262    } 
    257263    } 
    258  
    259     $archivedir =~ s/[\\\/]+/\//g; 
    260     $archivedir =~ s/\/$//; 
     264    else 
     265    { 
     266      # use filename_cat() to sanitize the user provided archive directory as 
     267      # it is more aware of protocols etc 
     268      $archivedir = &util::filename_cat($archivedir); 
     269    } 
    261270    $self->{'archivedir'} = $archivedir; 
    262271 
     
    273282    $self->{'manifest'} = $collectcfg->{'manifest'}; 
    274283    } 
     284    # Default value 
     285    $self->{'manifest_version'} = 0; 
    275286 
    276287    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) { 
     
    427438 
    428439    $manifest_lookup->parse($manifest_filename); 
     440 
     441        # Manifests may now include a version number 
     442        $self->{'manifest_version'} = $manifest_lookup->get_version(); 
    429443    } 
    430444 
     
    455469 
    456470    if ($removeold) { 
    457     if (-e $archivedir) { 
     471    if (&util::dir_exists($archivedir)) { 
    458472        &gsprintf($out, "{import.removing_archives}\n"); 
    459473        &util::rm_r ($archivedir); 
     
    462476    $tmpdir =~ s/[\\\/]+/\//g; 
    463477    $tmpdir =~ s/\/$//; 
    464     if (-e $tmpdir) { 
    465         #&gsprintf($out, "{import.removing_tmpdir}\n"); 
    466         #&util::rm_r ($tmpdir); 
     478    if (&util::dir_exists($tmpdir)) { 
     479        &gsprintf($out, "{import.removing_tmpdir}\n"); 
     480        &util::rm_r ($tmpdir); 
    467481    } 
    468482    } 
     
    481495    # and attach themselves as a listener (even though they don't do anything) 
    482496    # This is done so that, in parallel importing, the server will persist 
    483     # until the top level import.pl (which will be the first this that calls 
    484     # this function) completes. [jmt12] 
     497    # until the top level import.pl (which will be the first that calls this 
     498    # function) completes. [jmt12] 
    485499    my $create_server = 0; 
    486500    # - infodb's of type *server need to be started on the same machine that 
     
    562576    } 
    563577 
    564     my $processor = &plugout::load_plugout($plugout);                         
     578    my $processor = &plugout::load_plugout($plugout); 
    565579    $processor->setoutputdir ($archivedir); 
    566580    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta; 
     
    714728 
    715729      # If we are not using complex inherited metadata (and thus have skipped 
    716       # the global file scan) we need to at least scan the directory of the 
    717       # files being indexed/reindexed. [jmt12] 
    718       if (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true') 
     730      # the global file scan) we need to at least check for a matching 
     731      # metadata.xml for the files being indexed/reindexed. [jmt12] 
     732      # - unless we are using the newer version of Manifests, which are treated 
     733      #   verbatim, and should have a metadata element for metadata files (so 
     734      #   we can explicitly process metadata files other than metadata.xml) 
     735      if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true')) 
    719736      { 
    720737        my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}); 
    721738        foreach my $file_to_import (@all_files_to_import) 
    722739        { 
    723           my $dir_to_import = $file_to_import; 
    724           $dir_to_import =~ s/[^\\\/]*$//; 
    725           # - one day we may need to manually scan this directory for child 
    726           #   directories and somehow explicitly block them from being 
    727           #   recursed. 
    728           if (-d $dir_to_import) 
     740          my $metadata_xml_path = $file_to_import; 
     741          $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/; 
     742          if (&util::file_exists($metadata_xml_path)) 
    729743          { 
    730             &plugin::file_block_read($pluginfo, $dir_to_import, '', $block_hash, $metadata, $gli); 
     744            &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli); 
    731745          } 
     746        } 
     747      } 
     748 
     749      if ($self->{'manifest_version'} > 0) 
     750      { 
     751        # Process metadata files (?) 
     752        # Process files 
     753        foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}) 
     754        { 
     755          &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
    732756        } 
    733757      } 
     
    823847                          $self->{'collection'}, $self->{'site'});  
    824848    } 
    825     else 
     849    # only do this if we aren't using the newer paradigm for manifest files 
     850    elsif ($self->{'manifest_version'} < 1) 
    826851    { 
    827852    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli); 
     
    852877    # Store the value of OIDCount (used in doc.pm) so it can be 
    853878    # restored correctly to this value on an incremental build 
    854     store_doc_oid_count($archivedir); 
     879    # - this OIDcount file should only be generated for numerical oids [jmt12] 
     880    if ($self->{'OIDtype'} eq 'incremental') 
     881    { 
     882      store_doc_oid_count($archivedir); 
     883    } 
    855884 
    856885    # write out the archive information file 
     
    9921021 
    9931022 
    994     if (open(OIDOUT,">$oid_count_filename")) { 
     1023    if (open(OIDOUT,&util::file_openfdcommand($oid_count_filename, '>'))) { 
    9951024    print OIDOUT $doc::OIDcount, "\n"; 
    9961025