Changeset 30354


Ignore:
Timestamp:
2015-12-16T16:15:39+13:00 (8 years ago)
Author:
jmt12
Message:

Extending manifest v2 support to allow for directories to be listed in manifest. Matched with changes in Directory plugin to allow paths into systems like HDFS to be listed in manifest.

Location:
gs2-extensions/parallel-building/trunk/src
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/generate_gantt.pl

    r29663 r30354  
    44use strict;
    55use warnings;
     6
     7BEGIN
     8{
     9    if ( !defined $ENV{'GEXTPARALLELBUILDING_INSTALLED'}) {
     10    die "GEXTPARALLELBUILDING_INSTALLED not set\n";
     11    }
     12    # Installed CPAN packages for GEXT*INSTALL
     13    my $perl_version = `perl-version.pl`;
     14    my $perl_path = sprintf("%s/lib/perl/%s", $ENV{'GEXTPARALLELBUILDING_INSTALLED'}, $perl_version);
     15    unshift (@INC, $perl_path);
     16}
    617
    718# Modules
  • gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl

    r28015 r30354  
    3333my $hdfs_fs_prefix = 'HDThriftFS://';
    3434my $refresh_import = 0;
     35my $remove_old = 0;
    3536my $username = `whoami`;
    3637chomp($username);
     
    4950else
    5051{
    51   print STDERR "usage: hadoop_import.pl <collection> [-debug] [-disable_thrift] [-dry_run] [-start_thrift] [-refresh_import] [-flush_diskcache] [-use_nfs] [-stagger]\n";
     52  print STDERR "usage: hadoop_import.pl <collection> [-debug] [-enable_thrift] [-dry_run] [-start_thrift] [-refresh_import] [-flush_diskcache] [-use_nfs] [-stagger] [-removeold]\n";
    5253  print STDERR "where: [debug] print more debug messages to STDERR\n";
    5354  print STDERR "       [dry_run] don't actually perform an file actions\n";
     
    6162    $debug = 1;
    6263  }
    63   if ($ARGV[$offset] eq '-disable_thrift')
    64   {
    65     $use_thrift = 0;
     64  if ($ARGV[$offset] eq '-enable_thrift')
     65  {
     66    $use_thrift = 1;
    6667  }
    6768  if ($ARGV[$offset] eq '-dry_run')
     
    8889  {
    8990    $use_nfs = 1;
     91  }
     92  if ($ARGV[$offset] eq '-removeold')
     93  {
     94    $remove_old = 1;
    9095  }
    9196  if ($ARGV[$offset] eq '-logdir')
  • gs2-extensions/parallel-building/trunk/src/perllib/FileUtils/HDFSShell.pm

    r27525 r30354  
    249249}
    250250## isHDFS()
     251
     252
     253## @function isSpecialDirectory
     254#
     255sub isSpecialDirectory
     256{
     257    my ($path) = @_;
     258    return ($path =~ /^HDFSShell:\/\/[a-zA-Z]+:\d+$/);
     259}
     260## isSpecialDirectory()
    251261
    252262
     
    318328  my @files;
    319329  my $result = &_executeHDFSCommand(1, 'ls', $path);
     330  if ($result =~ /No such file or directory/)
     331  {
     332      print STDERR "BOOM! BOOM! BOOM!\n";
     333      return undef;
     334  }
    320335  my @lines = split(/\r?\n/, $result);
    321336  foreach my $line (@lines)
  • gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm

    r30292 r30354  
    190190    my $self = { 'xml' => 0, 'mode' => $mode };
    191191
    192     print "INFO: This inexport.pm supports version 2 manifest files\n";
    193 
    194192    # general options available to all plugins
    195193    my $arguments = $options->{'args'};
     
    201199    print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
    202200    die "\n";
     201    }
     202
     203    if ($self->{'verbosity'} > 2) {
     204    print "[INFO] This inexport.pm supports version 2 manifest files\n";
     205    }
     206    if ($self->{'verbosity'} > 3) {
     207    print '[DEBUG] Perl @INC: ' . join(", ", @INC) . "\n";
    203208    }
    204209
     
    735740    else
    736741    {
    737       print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n";
     742      print STDERR "Skipping import directory-level global file scan due to manifest and complexmeta configuration\n";
    738743    }
    739744
     
    757762        : &FileUtils::filenameConcatenate($importdir,$df);
    758763
    759         if (-d $full_df) {
     764        if (-d $full_df && $self->{'manifest_version'} != 2) {
    760765        &add_dir_contents_to_list($full_df, \@full_deleted_files);
    761766        } else {
     
    783788        : &FileUtils::filenameConcatenate($importdir,$rf);
    784789
    785         if (-d $full_rf) {
     790        if (-d $full_rf && $self->{'manifest_version'} != 2) {
    786791        &add_dir_contents_to_list($full_rf, \@full_reindex_files);
    787792        } else {
     
    814819        : &FileUtils::filenameConcatenate($importdir,$nf);
    815820
    816         if (-d $full_nf) {
     821        if (-d $full_nf && $self->{'manifest_version'} != 2) {
    817822        &add_dir_contents_to_list($full_nf, \@full_new_files);
    818823        } else {
     
    885890        foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}})
    886891        {
    887         $self->{'directoryplugin'}->read_for_manifest_v2($pluginfo, $file_to_import, $block_hash, $processor, $gli);
     892        if (&FileUtils::directoryExists($file_to_import)) {
     893#       print "DEBUG: Directory to import: \"" . $file_to_import . "\"\n";
     894        &plugin::file_block_read($pluginfo, '', $file_to_import, $block_hash, $metadata, $gli);
     895#       print "\n===== BLOCK HASH =====\n";
     896#       Dump($block_hash);
     897#       print "\n=====            =====\n\n";
     898        $self->perform_process_files($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs);
     899        }
     900        else
     901        {
     902#       print "DEBUG: File to import: \"" . $file_to_import . "\"\n";
     903        $self->{'directoryplugin'}->read_for_manifest_v2($pluginfo, $file_to_import, $block_hash, $processor, $gli);
     904        }
    888905        }
    889906      }
  • gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java

    r27654 r30354  
    262262
    263263      // - call Greenstone passing in the path to the manifest
    264       ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
     264      //ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
     265      String environment_script_filename = "setup.bash";
     266      StringBuffer cmd_buffer = new StringBuffer();
     267      cmd_buffer.append("source ./");
     268      cmd_buffer.append(environment_script_filename);
     269      cmd_buffer.append(" && time -p import.pl -keepold -manifest \"");
     270      cmd_buffer.append(manifest_path.toString());
     271      cmd_buffer.append("\" -archivedir \"");
     272      cmd_buffer.append(conf.get("archivesdir"));
     273      cmd_buffer.append("\" ");
     274      cmd_buffer.append(collection);
     275      ProcessBuilder import_process_builder = new ProcessBuilder("bash", "-c", cmd_buffer.toString());
    265276      fw1.write("[Command:" + import_process_builder.command() + "]\n");
     277      /*
    266278      // - alter environment
    267279      Map<String, String> import_process_env = import_process_builder.environment();
    268       //   - path
     280      // - build up the path
    269281      String path = import_process_env.get("PATH");
    270282      path = gsdlhome + "/ext/parallel-building/bin/script:" + path;
     
    294306      import_process_env.put("HADOOP_PREFIX", hadoop_home);
    295307      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");
     308      */
    296309
    297310      // - change working directory
  • gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest2.java

    r28312 r30354  
    324324      manifest_writer.close();
    325325
     326      /* Original process calling - sets up environment in Java
    326327      // - call Greenstone passing in the path to the manifest
    327328      ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection);
     
    357358      import_process_env.put("HADOOP_PREFIX", hadoop_home);
    358359      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n");
     360      */
     361
     362      /* New process call - adds call to setup.bash first to prepare
     363       * environment... hopefully */
     364      // - call Greenstone passing in the path to the manifest
     365      String environment_script_filename = "setup.bash";
     366      StringBuffer cmd_buffer = new StringBuffer();
     367      cmd_buffer.append("source ./");
     368      cmd_buffer.append(environment_script_filename);
     369      cmd_buffer.append(" && time -p import.pl -keepold -manifest \"");
     370      cmd_buffer.append(manifest_path.toString());
     371      cmd_buffer.append("\" -archivedir \"");
     372      cmd_buffer.append(conf.get("archivesdir"));
     373      cmd_buffer.append("\" ");
     374      cmd_buffer.append(collection);
     375      ProcessBuilder import_process_builder = new ProcessBuilder("bash", "-c", cmd_buffer.toString());
     376      fw1.write("[Command:" + import_process_builder.command() + "]\n");
    359377
    360378      // - change working directory
     
    738756    job.setReducerClass(GSReducer.class);
    739757
    740     // Sets the input and output handlers - may need to adjust input to provide me
    741     // a series of filenames (TextInputFormat will instead read in a text file and
    742     // return each line...)
     758    // Sets the input and output handlers - may need to adjust input to provide
     759    // a series of filenames (TextInputFormat will instead read in a text file
     760    // and return each line...)
    743761    job.setInputFormatClass(GSFileInputFormat.class);
    744762    //job.setOutputFormatClass(NullOutputFormat.class);
Note: See TracChangeset for help on using the changeset viewer.