Changeset 27414

Show
Ignore:
Timestamp:
24.05.2013 09:24:16 (6 years ago)
Author:
jmt12
Message:

Allow more processing arguments to be configured at the call site, and pass information such as the desired HDFS driver through to Hadoop processing

Files:
1 file modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl

    r27126 r27414  
    1616# 0. Init 
    1717my $collection = 'test'; 
     18my $use_thrift = 1; 
    1819my $debug = 0; 
    1920my $dry_run = 0; 
     
    2223my $hadoop_exe = 'hadoop'; # you may add path 
    2324my $cluster_head = $ENV{'HDFSHOST'}; # may not be true on advanced configs 
    24 my $hdfs_fs_prefix = 'hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}; 
     25my $hdfs_fs_prefix = 'HDThriftFS://'; 
    2526my $refresh_import = 0; 
    2627my $username = `whoami`; 
     
    3536  $collection = $ARGV[0]; 
    3637} 
     38else 
     39{ 
     40  print STDERR "usage: hadoop_import.pl <collection> [-debug] [-dry_run] [-disable_thrift] [-refresh_import] \n\n"; 
     41} 
     42my $offset = 1; 
     43while (defined $ARGV[$offset]) 
     44{ 
     45  if ($ARGV[$offset] eq '-debug') 
     46  { 
     47    $debug = 1; 
     48  } 
     49  if ($ARGV[$offset] eq '-disable_thrift') 
     50  { 
     51    $use_thrift = 0; 
     52  } 
     53  if ($ARGV[$offset] eq '-dry_run') 
     54  { 
     55    $dry_run = 1; 
     56  } 
     57  if ($ARGV[$offset] eq '-refresh_import') 
     58  { 
     59    $refresh_import = 1; 
     60  } 
     61  $offset++; 
     62} 
     63 
     64if (!$use_thrift) 
     65{ 
     66  $hdfs_fs_prefix = 'HDFSShell://'; 
     67} 
     68 
    3769my $gs_collection_dir = $gsdl_home . '/collect/' . $collection; 
    3870my $gs_import_dir = $gs_collection_dir . '/import'; 
     
    5385my $gs_archives_dir = $gs_collection_dir . '/archives'; 
    5486# - directories within HDFS 
    55 my $hdfs_input_dir = &urlCat($hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'import'); 
    56 my $hdfs_output_dir = &urlCat($hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'archives'); 
     87my $hdfs_input_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'import'); 
     88my $hdfs_output_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'archives'); 
    5789 
    5890# 2. Copy the import directory into HDFS 
     
    103135# - flush DNS cache too, so we are playing on a level field 
    104136&shellCommand('flush_caches.pl'); 
    105 &shellCommand('rocks run host "flush_caches.pl"'); 
     137if ($is_rocks_cluster) 
     138{ 
     139  &shellCommand('rocks run host "flush_caches.pl"'); 
     140} 
    106141 
    107142# 3. Special case for *Server type infodbs (namely TDBServer and GDBMServer) 
     
    151186print " * Running import using Hadoop..."; 
    152187my $hadoop_log = $gs_results_dir . '/hadoop.log'; 
    153 my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $ENV{'HDFSHOST'} . '" ' . $ENV{'HDFSPORT'} . ' "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " .  $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1"; 
     188my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $hdfs_fs_prefix . '" "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " .  $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1"; 
    154189&shellCommand($hadoop_command); 
    155190print "Done!\n"; 
     
    223258  my $paths = '"' . join('" "', @_) . '"'; 
    224259  my $hdfs_command = $hadoop_exe . ' fs -' . $command . ' ' . $paths . ' 2>&1'; 
     260  if ($debug) 
     261  { 
     262    print STDERR "[DEBUG] $hdfs_command\n"; 
     263  } 
    225264  &shellCommand($hdfs_command); 
    226265  return $?;