Changeset 26932


Ignore:
Timestamp:
2013-02-26T09:52:47+13:00 (11 years ago)
Author:
jmt12
Message:

Altered all calls to built-in Perl file tests to use the util library equivalents instead. This allows better awareness of HDFS and other unusual file paths. Added support for the newer version of the manifest (where files are followed verbatim). Only write OIDcount for collections with numerical OIDs.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm

    r25401 r26932  
    234234    # fill in the default import and archives directories if none
    235235    # were supplied, turn all \ into / and remove trailing /
    236     $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import") if $importdir eq "";
    237     $importdir =~ s/[\\\/]+/\//g;
    238     $importdir =~ s/\/$//;
    239     if (!-e $importdir) {
     236    if ($importdir eq "")
     237    {
     238      $importdir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "import");
     239    }
     240    else
     241    {
     242      # hijack filename_cat to sanitize the user provided importdir [jmt12]
     243      $importdir = &util::filename_cat ($importdir);
     244    }
     245    if (!&util::dir_exists($importdir)) {
    240246    &gsprintf($out, "{import.no_import_dir}\n\n", $importdir);
    241247    die "\n";
     
    256262    }
    257263    }
    258 
    259     $archivedir =~ s/[\\\/]+/\//g;
    260     $archivedir =~ s/\/$//;
     264    else
     265    {
     266      # use filename_cat() to sanitize the user provided archive directory as
     267      # it is more aware of protocols etc
     268      $archivedir = &util::filename_cat($archivedir);
     269    }
    261270    $self->{'archivedir'} = $archivedir;
    262271
     
    273282    $self->{'manifest'} = $collectcfg->{'manifest'};
    274283    }
     284    # Default value
     285    $self->{'manifest_version'} = 0;
    275286
    276287    if (defined $collectcfg->{'gzip'} && !$self->{'gzip'}) {
     
    427438
    428439    $manifest_lookup->parse($manifest_filename);
     440
     441        # Manifests may now include a version number
     442        $self->{'manifest_version'} = $manifest_lookup->get_version();
    429443    }
    430444
     
    455469
    456470    if ($removeold) {
    457     if (-e $archivedir) {
     471    if (&util::dir_exists($archivedir)) {
    458472        &gsprintf($out, "{import.removing_archives}\n");
    459473        &util::rm_r ($archivedir);
     
    462476    $tmpdir =~ s/[\\\/]+/\//g;
    463477    $tmpdir =~ s/\/$//;
    464     if (-e $tmpdir) {
    465         #&gsprintf($out, "{import.removing_tmpdir}\n");
    466         #&util::rm_r ($tmpdir);
     478    if (&util::dir_exists($tmpdir)) {
     479        &gsprintf($out, "{import.removing_tmpdir}\n");
     480        &util::rm_r ($tmpdir);
    467481    }
    468482    }
     
    481495    # and attach themselves as a listener (even though they don't do anything)
    482496    # This is done so that, in parallel importing, the server will persist
    483     # until the top level import.pl (which will be the first this that calls
    484     # this function) completes. [jmt12]
     497    # until the top level import.pl (which will be the first that calls this
     498    # function) completes. [jmt12]
    485499    my $create_server = 0;
    486500    # - infodb's of type *server need to be started on the same machine that
     
    562576    }
    563577
    564     my $processor = &plugout::load_plugout($plugout);                       
     578    my $processor = &plugout::load_plugout($plugout);
    565579    $processor->setoutputdir ($archivedir);
    566580    $processor->set_sortmeta ($sortmeta, $removeprefix, $removesuffix) if defined $sortmeta;
     
    714728
    715729      # If we are not using complex inherited metadata (and thus have skipped
    716       # the global file scan) we need to at least scan the directory of the
    717       # files being indexed/reindexed. [jmt12]
    718       if (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true')
     730      # the global file scan) we need to at least check for a matching
     731      # metadata.xml for the files being indexed/reindexed. [jmt12]
     732      # - unless we are using the newer version of Manifests, which are treated
     733      #   verbatim, and should have a metadata element for metadata files (so
     734      #   we can explicitly process metadata files other than metadata.xml)
     735      if ($self->{'manifest_version'} < 1 && (!defined $collectcfg->{'complexmeta'} || $collectcfg->{'complexmeta'} ne 'true'))
    719736      {
    720737        my @all_files_to_import = (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}});
    721738        foreach my $file_to_import (@all_files_to_import)
    722739        {
    723           my $dir_to_import = $file_to_import;
    724           $dir_to_import =~ s/[^\\\/]*$//;
    725           # - one day we may need to manually scan this directory for child
    726           #   directories and somehow explicitly block them from being
    727           #   recursed.
    728           if (-d $dir_to_import)
     740          my $metadata_xml_path = $file_to_import;
     741          $metadata_xml_path =~ s/[^\\\/]*$/metadata.xml/;
     742          if (&util::file_exists($metadata_xml_path))
    729743          {
    730             &plugin::file_block_read($pluginfo, $dir_to_import, '', $block_hash, $metadata, $gli);
     744            &plugin::file_block_read($pluginfo, '', $metadata_xml_path, $block_hash, $metadata, $gli);
    731745          }
     746        }
     747      }
     748
     749      if ($self->{'manifest_version'} > 0)
     750      {
     751        # Process metadata files (?)
     752        # Process files
     753        foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
     754        {
     755          &plugin::read ($pluginfo, '', $file_to_import, $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
    732756        }
    733757      }
     
    823847                          $self->{'collection'}, $self->{'site'});
    824848    }
    825     else
     849    # only do this if we aren't using the newer paradigm for manifest files
     850    elsif ($self->{'manifest_version'} < 1)
    826851    {
    827852    &plugin::read ($pluginfo, $importdir, "", $block_hash, $metadata, $processor, $maxdocs, 0, $gli);
     
    852877    # Store the value of OIDCount (used in doc.pm) so it can be
    853878    # restored correctly to this value on an incremental build
    854     store_doc_oid_count($archivedir);
     879    # - this OIDcount file should only be generated for numerical oids [jmt12]
     880    if ($self->{'OIDtype'} eq 'incremental')
     881    {
     882      store_doc_oid_count($archivedir);
     883    }
    855884
    856885    # write out the archive information file
     
    9921021
    9931022
    994     if (open(OIDOUT,">$oid_count_filename")) {
     1023    if (open(OIDOUT,&util::file_openfdcommand($oid_count_filename, '>'))) {
    9951024    print OIDOUT $doc::OIDcount, "\n";
    9961025       
Note: See TracChangeset for help on using the changeset viewer.