Changeset 27001

Timestamp: 05.03.2013 11:47:57
Author: jmt12
Message:

Pass more environment variables (HADOOP_PREFIX, HDFSHOST, HDFSPORT) through to Hadoop, and thus on to the compute nodes. Added debug comments. Added a directory-not-empty test before attempting to copy log files, preventing non-fatal errors. Whether we are on a cluster is now determined by the presence of the 'rocks' executable rather than a hardcoded hostname (one less thing for me to forget to change between computers).

Files: 1 modified

  • gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl

--- gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl (r26949)
+++ gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl (r27001)
@@ -8,4 +8,6 @@
 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
+die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
+die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
 }
 
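The two new guards extend the existing die-unless pattern to the new HDFS settings. For reference, a minimal sketch of the same validation written as a loop (the loop form is illustrative, not part of the commit):

    # abort early if any required environment variable is missing
    foreach my $var ('GSDLHOME', 'GSDLOS', 'HDFSHOST', 'HDFSPORT')
    {
      die $var . " not set\n" unless defined $ENV{$var};
    }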
     
@@ -16,14 +18,16 @@
 my $debug = 0;
 my $dry_run = 0;
-my $cluster_head = ''; # i.e. 'medusa.local';
-
 my $gsdl_home = $ENV{'GSDLHOME'};
-my $gsdl_hadoop_ext = $ENV{'GEXTHADOOP_INSTALLED'};
+my $gsdl_hadoop_ext = $ENV{'GEXTPARALLELBUILDING_INSTALLED'};
 my $hadoop_exe = 'hadoop'; # you may add path
-my $hdfs_fs_prefix = 'hdfs://localhost:54310';
+my $cluster_head = $ENV{'HDFSHOST'}; # may not be true on advanced configs
+my $hdfs_fs_prefix = 'hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'};
 my $refresh_import = 0;
 my $username = `whoami`;
 chomp($username);
 
+`rocks > /dev/null 2>&1`;
+my $is_rocks_cluster = ($? == 0);
+
 # 1. Read and validate parameters
 if (defined $ARGV[0])
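The new cluster test leans on Perl's backtick operator: the command is run purely for its side effect on $?, which holds the child's exit status (non-zero when 'rocks' cannot be found or fails). A minimal sketch of the idiom, using the host and port values this commit removes from the hardcoded prefix:

    # run the candidate command, discarding all output; only $? matters
    `rocks > /dev/null 2>&1`;
    my $is_rocks_cluster = ($? == 0);

    # with HDFSHOST=medusa.local and HDFSPORT=54310 (the old hardcoded
    # default), $hdfs_fs_prefix evaluates to 'hdfs://medusa.local:54310'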
     
@@ -47,4 +51,5 @@
   mkdir($gs_results_dir, 0755);
 }
+my $gs_archives_dir = $gs_collection_dir . '/archives';
 # - directories within HDFS
 my $hdfs_input_dir = &urlCat($hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'import');
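urlCat() is defined further down in this script; assuming it simply joins its arguments with '/', the HDFS input directory for user jmt12 and a hypothetical collection 'demo' would resolve as follows:

    # illustrative only; 'demo' is a made-up collection name
    my $example = &urlCat('hdfs://medusa.local:54310', 'user', 'jmt12',
                          'gsdl', 'collect', 'demo', 'import');
    # => 'hdfs://medusa.local:54310/user/jmt12/gsdl/collect/demo/import'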
     
@@ -73,4 +78,9 @@
 }
 # - clear out the archives regardless
+if (-e $gs_archives_dir)
+{
+  &shellCommand('rm -rf "' . $gs_archives_dir . '"');
+}
+mkdir($gs_archives_dir, 0755);
 if (&hdfsTest('d', 0, $hdfs_output_dir))
 {
     
@@ -79,9 +89,18 @@
   print "Done!\n";
 }
+# - clear out any old logs
+if (!&dirIsEmpty('/tmp/greenstone'))
+{
+  &shellCommand('rm /tmp/greenstone/*.*');
+}
+if ($is_rocks_cluster)
+{
+  &shellCommand('rocks run host "rm /tmp/greenstone/*.*"');
+}
 
 # 3. Special case for *Server type infodbs (namely TDBServer and GDBMServer)
 #    where we start the server now to ensure it lives on the head node
-my $server_host = 'localhost';
-my $server_port = '8191';
+my $server_host = '';
+my $server_port = '';
 my $configuration_path = $gs_collection_dir . '/etc/collect.cfg';
 my $infodbtype = `grep -P "^infodbtype" $configuration_path`;
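Note that the backtick grep leaves $infodbtype holding the entire matching line from collect.cfg, newline included, not just the value. A hypothetical follow-up step (not shown in this changeset) that extracts the value itself might look like:

    # strip the 'infodbtype' key and whitespace from the grep output
    if ($infodbtype =~ /^infodbtype\s+(\S+)/)
    {
      $infodbtype = $1; # e.g. 'tdbserver' or 'gdbmserver'
    }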
     
@@ -106,5 +125,5 @@
     exit;
   }
-  # - use the client tool to add ourselve as a listener
+  # - use the client tool to add ourselves as a listener
   print " * Registering as listener... ";
   my $client_command = $server_prefix . "Client.pl " . $server_host . " " . $server_port . " \"#a:" . $$ . "\"";
     
@@ -126,5 +145,5 @@
 print " * Running import using Hadoop...";
 my $hadoop_log = $gs_results_dir . '/hadoop.log';
-my $hadoop_command = $hadoop_exe . " jar " . $gsdl_hadoop_ext . "/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest " . $gsdl_home . " " . $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";
+my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $ENV{'HDFSHOST'} . '" ' . $ENV{'HDFSPORT'} . ' "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";
 &shellCommand($hadoop_command);
 print "Done!\n";
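Spelled out, the rewritten command line now has this shape (angle-bracket placeholders stand for the corresponding variables; the added quoting protects paths that may contain spaces):

    # hadoop jar <gsdl_hadoop_ext>/lib/hadoop-greenstone.jar \
    #   org.nzdl.gsdl.HadoopGreenstoneIngest \
    #   "<GSDLHOME>" "<HDFSHOST>" <HDFSPORT> "<HADOOP_PREFIX>" \
    #   <collection> <hdfs_input_dir> <hdfs_output_dir> \
    #   > <gs_results_dir>/hadoop.log 2>&1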
     
@@ -147,5 +166,8 @@
 print " * Gathering logs from compute nodes... ";
 # - local files
-&shellCommand('cp /tmp/greenstone/*.* ' . $gs_results_dir);
+if (!&dirIsEmpty('/tmp/greenstone'))
+{
+  &shellCommand('cp /tmp/greenstone/*.* ' . $gs_results_dir);
+}
 if (-d $gs_collection_dir . '/logs')
 {
     
@@ -153,7 +175,7 @@
 }
 # - remote files
-if ($cluster_head ne '')
-{
-  &shellCommand('rocks run host "scp /tmp/greenstone/*.* ' . $cluster_head . ':' . $gs_results_dir . '"');
+if ($is_rocks_cluster)
+{
+  &shellCommand('rocks run host "scp /tmp/greenstone/*.* ' . $cluster_head . ':' . $gs_results_dir . '"');
 &shellCommand('rocks run host "scp /tmp/gsimport-*/logs/*.log ' . $cluster_head . ':' . $gs_results_dir . '"');
 }
     
@@ -163,5 +185,5 @@
 print " * Cleaning up temporary files... ";
 &shellCommand('rm -rf /tmp/greenstone');
-if ($cluster)
+if ($is_rocks_cluster)
 {
   &shellCommand('rocks run host "rm -rf /tmp/greenstone"');
     
@@ -276,2 +298,18 @@
 }
 # /** urlCat() **/
+
+# /** Test whether a directory is empty; returns 1 if the directory is
+#     missing or contains no entries besides '.' and '..', 0 otherwise. */
+sub dirIsEmpty
+{
+  my $dir = shift(@_);
+  my @files;
+  if (-e $dir)
+  {
+    opendir(DIR, $dir) or die $!;
+    @files = grep { !m/\A\.{1,2}\Z/} readdir(DIR);
+    closedir(DIR);
+  }
+  @files ? 0 : 1;
+}
+# /** dirIsEmpty() **/
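dirIsEmpty() treats a missing directory the same as an empty one: @files is only populated when the directory exists, and the final expression returns 1 when nothing beyond '.' and '..' was found. A minimal usage sketch, mirroring the guards added above:

    # only copy logs when there is actually something to copy,
    # avoiding the non-fatal glob errors this commit works around
    if (!&dirIsEmpty('/tmp/greenstone'))
    {
      &shellCommand('cp /tmp/greenstone/*.* ' . $gs_results_dir);
    }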