Changeset 27001


Timestamp:
2013-03-05T11:47:57+13:00
Author:
jmt12
Message:

Pass more environment variables (HADOOP_PREFIX, HDFSHOST, HDFSPORT) through to Hadoop (and thus on to the compute nodes). Add debug comments. Test that the log directory is not empty before attempting to copy log files, preventing non-fatal errors. Determine whether we are on a cluster by the presence of the 'rocks' executable rather than a hardcoded hostname (one less thing for me to forget to change between computers).
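
Two of the changes described above are easy to demonstrate in isolation. Below is a minimal, self-contained Perl sketch of the detection and passthrough idioms; the 'localhost'/'54310' fallbacks simply echo the hardcoded values this changeset removes and are illustrative assumptions, not part of the change itself:

  #!/usr/bin/perl
  use strict;
  use warnings;

  # Probe for the Rocks tooling: backticks run the command, and $? holds
  # its exit status afterwards. A missing executable leaves $? non-zero,
  # so this is true only on a machine with 'rocks' on the PATH.
  `rocks > /dev/null 2>&1`;
  my $is_rocks_cluster = ($? == 0);
  print 'on rocks cluster: ', ($is_rocks_cluster ? 'yes' : 'no'), "\n";

  # Build the HDFS prefix from the environment, as the changeset does;
  # the fallbacks are placeholders for machines without setup.bash sourced.
  my $hdfs_host = $ENV{'HDFSHOST'} || 'localhost';
  my $hdfs_port = $ENV{'HDFSPORT'} || '54310';
  print 'HDFS prefix: hdfs://', $hdfs_host, ':', $hdfs_port, "\n";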

File:
1 edited

Legend:

  ' ' Unmodified
  '-' Removed
  '+' Added
  • gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl

    --- r26949
    +++ r27001

     die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
     die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    +die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'};
    +die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'};
     }

    …

     my $debug = 0;
     my $dry_run = 0;
    -my $cluster_head = ''; # i.e. 'medusa.local';
    -
     my $gsdl_home = $ENV{'GSDLHOME'};
    -my $gsdl_hadoop_ext = $ENV{'GEXTHADOOP_INSTALLED'};
    +my $gsdl_hadoop_ext = $ENV{'GEXTPARALLELBUILDING_INSTALLED'};
     my $hadoop_exe = 'hadoop'; # you may add path
    -my $hdfs_fs_prefix = 'hdfs://localhost:54310';
    +my $cluster_head = $ENV{'HDFSHOST'}; # may not be true on advanced configs
    +my $hdfs_fs_prefix = 'hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'};
     my $refresh_import = 0;
     my $username = `whoami`;
     chomp($username);

    +`rocks > /dev/null 2>&1`;
    +my $is_rocks_cluster = ($? == 0);
    +
     # 1. Read and validate parameters
     if (defined $ARGV[0])

    …

       mkdir($gs_results_dir, 0755);
     }
    +my $gs_archives_dir = $gs_collection_dir . '/archives';
     # - directories within HDFS
     my $hdfs_input_dir = &urlCat($hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'import');

    …

     }
     # - clear out the archives regardless
    +if (-e $gs_archives_dir)
    +{
    +  &shellCommand('rm -rf "' . $gs_archives_dir . '"');
    +}
    +mkdir($gs_archives_dir, 0755);
     if (&hdfsTest('d', 0, $hdfs_output_dir))
     {

    …

       print "Done!\n";
     }
    +# - clear out any old logs
    +if (!&dirIsEmpty('/tmp/greenstone'))
    +{
    +  &shellCommand('rm /tmp/greenstone/*.*');
    +}
    +if ($is_rocks_cluster)
    +{
    +  &shellCommand('rocks run host "rm /tmp/greenstone/*.*"');
    +}

     # 3. Special case for *Server type infodbs (namely TDBServer and GDBMServer)
     #    where we start the server now to ensure it lives on the head node
    -my $server_host = 'localhost';
    -my $server_port = '8191';
    +my $server_host = '';
    +my $server_port = '';
     my $configuration_path = $gs_collection_dir . '/etc/collect.cfg';
     my $infodbtype = `grep -P "^infodbtype" $configuration_path`;

    …

         exit;
       }
    -  # - use the client tool to add ourselve as a listener
    +  # - use the client tool to add ourselves as a listener
       print " * Registering as listener... ";
       my $client_command = $server_prefix . "Client.pl " . $server_host . " " . $server_port . " \"#a:" . $$ . "\"";

    …

     print " * Running import using Hadoop...";
     my $hadoop_log = $gs_results_dir . '/hadoop.log';
    -my $hadoop_command = $hadoop_exe . " jar " . $gsdl_hadoop_ext . "/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest " . $gsdl_home . " " . $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";
    +my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $ENV{'HDFSHOST'} . '" ' . $ENV{'HDFSPORT'} . ' "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";
     &shellCommand($hadoop_command);
     print "Done!\n";

    …

     print " * Gathering logs from compute nodes... ";
     # - local files
    -&shellCommand('cp /tmp/greenstone/*.* ' . $gs_results_dir);
    +if (!&dirIsEmpty('/tmp/greenstone'))
    +{
    +  &shellCommand('cp /tmp/greenstone/*.* ' . $gs_results_dir);
    +}
     if (-d $gs_collection_dir . '/logs')
     {

    …

     }
     # - remote files
    -if ($cluster_head ne '')
    -{
    -  &shellCommand('rocks run host "scp /tmp/greenstone/*.* ' . $cluster_head . ':' . $gs_results_dir . '"');
    +if ($is_rocks_cluster)
    +{
    +  &shellCommand('rocks run host "scp /tmp/greenstone/*.* ' . $cluster_head . ':' . $gs_results_dir . '"');
       &shellCommand('rocks run host "scp /tmp/gsimport-*/logs/*.log ' . $cluster_head . ':' . $gs_results_dir . '"');
     }

    …

     print " * Cleaning up temporary files... ";
     &shellCommand('rm -rf /tmp/greenstone');
    -if ($cluster)
    +if ($is_rocks_cluster)
     {
       &shellCommand('rocks run host "rm -rf /tmp/greenstone"');

    …

     }
     # /** urlCat() **/
    +
    +# /**
    +#  */
    +sub dirIsEmpty
    +{
    +  my $dir = shift(@_);
    +  my @files;
    +  if (-e $dir)
    +  {
    +    opendir(DIR, $dir) or die $!;
    +    @files = grep { !m/\A\.{1,2}\Z/} readdir(DIR);
    +    closedir(DIR);
    +  }
    +  @files ? 0 : 1;
    +}
    +# /** dirIsEmpty() **/
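
The new dirIsEmpty() helper is what guards the log deletion and copy steps above. For reference, a small self-contained harness that exercises it, assuming only core Perl plus File::Temp (the temporary directory and file name are illustrative):

  #!/usr/bin/perl
  use strict;
  use warnings;
  use File::Temp qw(tempdir);

  # dirIsEmpty() as added in this changeset: returns 1 for a missing or
  # empty directory, 0 once it holds any entry besides '.' and '..'.
  sub dirIsEmpty
  {
    my $dir = shift(@_);
    my @files;
    if (-e $dir)
    {
      opendir(DIR, $dir) or die $!;
      @files = grep { !m/\A\.{1,2}\Z/} readdir(DIR);
      closedir(DIR);
    }
    @files ? 0 : 1;
  }

  my $dir = tempdir(CLEANUP => 1);
  print 'fresh tempdir empty?  ', &dirIsEmpty($dir), "\n"; # prints 1
  open(my $fh, '>', $dir . '/sample.log') or die $!;
  close($fh);
  print 'after writing a file? ', &dirIsEmpty($dir), "\n"; # prints 0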