Timestamp: 2013-05-24T09:24:16+12:00
Author: jmt12
Message: Allowing more processing arguments to be configured at the call, and passing information such as the desired HDFS driver through to Hadoop processing
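Judging from the usage string and argument loop added below, the script now takes any mix of four optional flags after the collection name. A hypothetical invocation (the collection name 'demo' is invented for illustration) might be:

    hadoop_import.pl demo -dry_run -disable_thrift

where -disable_thrift switches the HDFS driver prefix from 'HDThriftFS://' to 'HDFSShell://' before it is handed through to the Hadoop job.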

File: 1 edited

  • gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl

--- gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl (r27126)
+++ gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl (r27414)
@@ -16,4 +16,5 @@
 # 0. Init
 my $collection = 'test';
+my $use_thrift = 1;
 my $debug = 0;
 my $dry_run = 0;
@@ -22,5 +23,5 @@
 my $hadoop_exe = 'hadoop'; # you may add path
 my $cluster_head = $ENV{'HDFSHOST'}; # may not be true on advanced configs
-my $hdfs_fs_prefix = 'hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'};
+my $hdfs_fs_prefix = 'HDThriftFS://';
 my $refresh_import = 0;
 my $username = `whoami`;
@@ -35,4 +36,35 @@
   $collection = $ARGV[0];
 }
+else
+{
+  print STDERR "usage: hadoop_import.pl <collection> [-debug] [-dry_run] [-disable_thrift] [-refresh_import] \n\n";
+}
+my $offset = 1;
+while (defined $ARGV[$offset])
+{
+  if ($ARGV[$offset] eq '-debug')
+  {
+    $debug = 1;
+  }
+  if ($ARGV[$offset] eq '-disable_thrift')
+  {
+    $use_thrift = 0;
+  }
+  if ($ARGV[$offset] eq '-dry_run')
+  {
+    $dry_run = 1;
+  }
+  if ($ARGV[$offset] eq '-refresh_import')
+  {
+    $refresh_import = 1;
+  }
+  $offset++;
+}
+
+if (!$use_thrift)
+{
+  $hdfs_fs_prefix = 'HDFSShell://';
+}
+
 my $gs_collection_dir = $gsdl_home . '/collect/' . $collection;
 my $gs_import_dir = $gs_collection_dir . '/import';
@@ -53,6 +85,6 @@
 my $gs_archives_dir = $gs_collection_dir . '/archives';
 # - directories within HDFS
-my $hdfs_input_dir = &urlCat($hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'import');
-my $hdfs_output_dir = &urlCat($hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'archives');
+my $hdfs_input_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'import');
+my $hdfs_output_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'archives');
 
 # 2. Copy the import directory into HDFS
@@ -103,5 +135,8 @@
 # - flush DNS cache too, so we are playing on a level field
 &shellCommand('flush_caches.pl');
-&shellCommand('rocks run host "flush_caches.pl"');
+if ($is_rocks_cluster)
+{
+  &shellCommand('rocks run host "flush_caches.pl"');
+}
 
 # 3. Special case for *Server type infodbs (namely TDBServer and GDBMServer)
@@ -151,5 +186,5 @@
 print " * Running import using Hadoop...";
 my $hadoop_log = $gs_results_dir . '/hadoop.log';
-my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $ENV{'HDFSHOST'} . '" ' . $ENV{'HDFSPORT'} . ' "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " .  $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";
+my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $hdfs_fs_prefix . '" "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " .  $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";
 &shellCommand($hadoop_command);
 print "Done!\n";
@@ -223,4 +258,8 @@
   my $paths = '"' . join('" "', @_) . '"';
   my $hdfs_command = $hadoop_exe . ' fs -' . $command . ' ' . $paths . ' 2>&1';
+  if ($debug)
+  {
+    print STDERR "[DEBUG] $hdfs_command\n";
+  }
   &shellCommand($hdfs_command);
   return $?;
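The driver hand-off added in this changeset can be summarised in a short sketch (not part of the changeset; the host 'example-head', port 54310 and the printed comparison are invented for illustration). The prefix defaults to the Thrift driver, flips to the shell driver when -disable_thrift clears $use_thrift, and then replaces the old host/port pair in the arguments to org.nzdl.gsdl.HadoopGreenstoneIngest:

    #!/usr/bin/perl
    use strict;
    use warnings;

    # Made-up values for illustration; the real script reads these from
    # the environment
    $ENV{'HDFSHOST'} = 'example-head';
    $ENV{'HDFSPORT'} = '54310';

    my $use_thrift = 1;                    # cleared by the new -disable_thrift flag
    my $hdfs_fs_prefix = 'HDThriftFS://';  # default: Thrift-based HDFS driver
    if (!$use_thrift)
    {
      $hdfs_fs_prefix = 'HDFSShell://';    # fall back to the shell-based driver
    }

    # r27126 passed host and port as two separate ingest arguments;
    # r27414 passes the driver prefix instead
    my $old_args = '"' . $ENV{'HDFSHOST'} . '" ' . $ENV{'HDFSPORT'};
    my $new_args = '"' . $hdfs_fs_prefix . '"';
    print "old: $old_args\n";
    print "new: $new_args\n";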
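The import and archives locations, by contrast, are now built from a plain hdfs:// URL regardless of the chosen driver, so the prefix changes how the ingest job talks to HDFS without moving the data. Assuming &urlCat simply joins its arguments with '/' (its implementation is not part of this changeset) and reusing the made-up host and port above with user jmt12, the input directory would resolve to something like:

    hdfs://example-head:54310/user/jmt12/gsdl/collect/test/import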