Show
Ignore:
Timestamp:
16.12.2015 16:15:39 (4 years ago)
Author:
jmt12
Message:

Extending manifest v2 support to allow directories to be listed in the manifest. Matched with changes in the Directory plugin to allow paths into systems like HDFS to be listed in the manifest.

Location:
gs2-extensions/parallel-building/trunk/src
Files:
6 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/generate_gantt.pl

    r29663 r30354  
    44use strict; 
    55use warnings; 
     6 
     7BEGIN 
     8{ 
     9    if ( !defined $ENV{'GEXTPARALLELBUILDING_INSTALLED'}) { 
     10    die "GEXTPARALLELBUILDING_INSTALLED not set\n"; 
     11    } 
     12    # Installed CPAN packages for GEXT*INSTALL 
     13    my $perl_version = `perl-version.pl`; 
     14    my $perl_path = sprintf("%s/lib/perl/%s", $ENV{'GEXTPARALLELBUILDING_INSTALLED'}, $perl_version); 
     15    unshift (@INC, $perl_path); 
     16} 
    617 
    718# Modules 
  • gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl

    r28015 r30354  
    3333my $hdfs_fs_prefix = 'HDThriftFS://'; 
    3434my $refresh_import = 0; 
     35my $remove_old = 0; 
    3536my $username = `whoami`; 
    3637chomp($username); 
     
    4950else 
    5051{ 
    51   print STDERR "usage: hadoop_import.pl <collection> [-debug] [-disable_thrift] [-dry_run] [-start_thrift] [-refresh_import] [-flush_diskcache] [-use_nfs] [-stagger]\n"; 
     52  print STDERR "usage: hadoop_import.pl <collection> [-debug] [-enable_thrift] [-dry_run] [-start_thrift] [-refresh_import] [-flush_diskcache] [-use_nfs] [-stagger] [-removeold]\n"; 
    5253  print STDERR "where: [debug] print more debug messages to STDERR\n"; 
    5354  print STDERR "       [dry_run] don't actually perform an file actions\n"; 
     
    6162    $debug = 1; 
    6263  } 
    63   if ($ARGV[$offset] eq '-disable_thrift') 
    64   { 
    65     $use_thrift = 0; 
     64  if ($ARGV[$offset] eq '-enable_thrift') 
     65  { 
     66    $use_thrift = 1; 
    6667  } 
    6768  if ($ARGV[$offset] eq '-dry_run') 
     
    8889  { 
    8990    $use_nfs = 1; 
     91  } 
     92  if ($ARGV[$offset] eq '-removeold') 
     93  { 
     94    $remove_old = 1; 
    9095  } 
    9196  if ($ARGV[$offset] eq '-logdir') 
  • gs2-extensions/parallel-building/trunk/src/perllib/FileUtils/HDFSShell.pm

    r27525 r30354  
    249249} 
    250250## isHDFS() 
     251 
     252 
     253## @function isSpecialDirectory 
     254# 
     255sub isSpecialDirectory 
     256{ 
     257    my ($path) = @_; 
     258    return ($path =~ /^HDFSShell:\/\/[a-zA-Z]+:\d+$/); 
     259} 
     260## isSpecialDirectory() 
    251261 
    252262 
     
    318328  my @files; 
    319329  my $result = &_executeHDFSCommand(1, 'ls', $path); 
     330  if ($result =~ /No such file or directory/) 
     331  { 
     332      print STDERR "BOOM! BOOM! BOOM!\n"; 
     333      return undef; 
     334  } 
    320335  my @lines = split(/\r?\n/, $result); 
    321336  foreach my $line (@lines) 
  • gs2-extensions/parallel-building/trunk/src/perllib/inexport.pm

    r30292 r30354  
    190190    my $self = { 'xml' => 0, 'mode' => $mode }; 
    191191 
    192     print "INFO: This inexport.pm supports version 2 manifest files\n"; 
    193  
    194192    # general options available to all plugins 
    195193    my $arguments = $options->{'args'}; 
     
    201199    print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n"; 
    202200    die "\n"; 
     201    } 
     202 
     203    if ($self->{'verbosity'} > 2) { 
     204    print "[INFO] This inexport.pm supports version 2 manifest files\n"; 
     205    } 
     206    if ($self->{'verbosity'} > 3) { 
     207    print '[DEBUG] Perl @INC: ' . join(", ", @INC) . "\n"; 
    203208    } 
    204209 
     
    735740    else 
    736741    { 
    737       print STDERR "Skipping global file scan due to manifest and complexmeta configuration\n"; 
     742      print STDERR "Skipping import directory-level global file scan due to manifest and complexmeta configuration\n"; 
    738743    } 
    739744 
     
    757762        : &FileUtils::filenameConcatenate($importdir,$df); 
    758763 
    759         if (-d $full_df) { 
     764        if (-d $full_df && $self->{'manifest_version'} != 2) { 
    760765        &add_dir_contents_to_list($full_df, \@full_deleted_files); 
    761766        } else { 
     
    783788        : &FileUtils::filenameConcatenate($importdir,$rf); 
    784789 
    785         if (-d $full_rf) { 
     790        if (-d $full_rf && $self->{'manifest_version'} != 2) { 
    786791        &add_dir_contents_to_list($full_rf, \@full_reindex_files); 
    787792        } else { 
     
    814819        : &FileUtils::filenameConcatenate($importdir,$nf); 
    815820 
    816         if (-d $full_nf) { 
     821        if (-d $full_nf && $self->{'manifest_version'} != 2) { 
    817822        &add_dir_contents_to_list($full_nf, \@full_new_files); 
    818823        } else { 
     
    885890        foreach my $file_to_import (keys %{$block_hash->{'reindex_files'}}, keys %{$block_hash->{'new_files'}}) 
    886891        { 
    887         $self->{'directoryplugin'}->read_for_manifest_v2($pluginfo, $file_to_import, $block_hash, $processor, $gli); 
     892        if (&FileUtils::directoryExists($file_to_import)) { 
     893#       print "DEBUG: Directory to import: \"" . $file_to_import . "\"\n"; 
     894        &plugin::file_block_read($pluginfo, '', $file_to_import, $block_hash, $metadata, $gli); 
     895#       print "\n===== BLOCK HASH =====\n"; 
     896#       Dump($block_hash); 
     897#       print "\n=====            =====\n\n"; 
     898        $self->perform_process_files($manifest, $pluginfo, $importdir, $file_to_import, $block_hash, $metadata, $processor, $maxdocs); 
     899        } 
     900        else 
     901        { 
     902#       print "DEBUG: File to import: \"" . $file_to_import . "\"\n"; 
     903        $self->{'directoryplugin'}->read_for_manifest_v2($pluginfo, $file_to_import, $block_hash, $processor, $gli); 
     904        } 
    888905        } 
    889906      } 
  • gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest.java

    r27654 r30354  
    262262 
    263263      // - call Greenstone passing in the path to the manifest 
    264       ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection); 
     264      //ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection); 
     265      String environment_script_filename = "setup.bash"; 
     266      StringBuffer cmd_buffer = new StringBuffer(); 
     267      cmd_buffer.append("source ./"); 
     268      cmd_buffer.append(environment_script_filename); 
     269      cmd_buffer.append(" && time -p import.pl -keepold -manifest \""); 
     270      cmd_buffer.append(manifest_path.toString()); 
     271      cmd_buffer.append("\" -archivedir \""); 
     272      cmd_buffer.append(conf.get("archivesdir")); 
     273      cmd_buffer.append("\" "); 
     274      cmd_buffer.append(collection); 
     275      ProcessBuilder import_process_builder = new ProcessBuilder("bash", "-c", cmd_buffer.toString()); 
    265276      fw1.write("[Command:" + import_process_builder.command() + "]\n"); 
     277      /* 
    266278      // - alter environment 
    267279      Map<String, String> import_process_env = import_process_builder.environment(); 
    268       //   - path 
     280      // - build up the path 
    269281      String path = import_process_env.get("PATH"); 
    270282      path = gsdlhome + "/ext/parallel-building/bin/script:" + path; 
     
    294306      import_process_env.put("HADOOP_PREFIX", hadoop_home); 
    295307      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n"); 
     308      */ 
    296309 
    297310      // - change working directory 
  • gs2-extensions/parallel-building/trunk/src/src/java/org/nzdl/gsdl/HadoopGreenstoneIngest2.java

    r28312 r30354  
    324324      manifest_writer.close(); 
    325325 
     326      /* Original process calling - sets up environment in Java 
    326327      // - call Greenstone passing in the path to the manifest 
    327328      ProcessBuilder import_process_builder = new ProcessBuilder("time", "-p", "import.pl", "-manifest", manifest_path.toString(), "-keepold", "-archivedir", conf.get("archivesdir"), collection); 
     
    357358      import_process_env.put("HADOOP_PREFIX", hadoop_home); 
    358359      fw1.write("[HADOOP_PREFIX: " + hadoop_home + "]\n"); 
     360      */ 
     361 
     362      /* New process call - adds call to setup.bash first to prepare 
     363       * environment... hopefully */ 
     364      // - call Greenstone passing in the path to the manifest 
     365      String environment_script_filename = "setup.bash"; 
     366      StringBuffer cmd_buffer = new StringBuffer(); 
     367      cmd_buffer.append("source ./"); 
     368      cmd_buffer.append(environment_script_filename); 
     369      cmd_buffer.append(" && time -p import.pl -keepold -manifest \""); 
     370      cmd_buffer.append(manifest_path.toString()); 
     371      cmd_buffer.append("\" -archivedir \""); 
     372      cmd_buffer.append(conf.get("archivesdir")); 
     373      cmd_buffer.append("\" "); 
     374      cmd_buffer.append(collection); 
     375      ProcessBuilder import_process_builder = new ProcessBuilder("bash", "-c", cmd_buffer.toString()); 
     376      fw1.write("[Command:" + import_process_builder.command() + "]\n"); 
    359377 
    360378      // - change working directory 
     
    738756    job.setReducerClass(GSReducer.class); 
    739757 
    740     // Sets the input and output handlers - may need to adjust input to provide me 
    741     // a series of filenames (TextInputFormat will instead read in a text file and 
    742     // return each line...) 
     758    // Sets the input and output handlers - may need to adjust input to provide 
     759    // a series of filenames (TextInputFormat will instead read in a text file 
     760    // and return each line...) 
    743761    job.setInputFormatClass(GSFileInputFormat.class); 
    744762    //job.setOutputFormatClass(NullOutputFormat.class);