Changeset 27001
- Timestamp: 2013-03-05T11:47:57+13:00 (11 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl
r26949 r27001 8 8 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'}; 9 9 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'}; 10 die "HDFS HOST not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSHOST'}; 11 die "HDFS PORT not set (set in <gsdl>/ext/parallel_processing/setup.bash)\n" unless defined $ENV{'HDFSPORT'}; 10 12 } 11 13 … … 16 18 my $debug = 0; 17 19 my $dry_run = 0; 18 my $cluster_head = ''; # i.e. 'medusa.local';19 20 20 my $gsdl_home = $ENV{'GSDLHOME'}; 21 my $gsdl_hadoop_ext = $ENV{'GEXT HADOOP_INSTALLED'};21 my $gsdl_hadoop_ext = $ENV{'GEXTPARALLELBUILDING_INSTALLED'}; 22 22 my $hadoop_exe = 'hadoop'; # you may add path 23 my $hdfs_fs_prefix = 'hdfs://localhost:54310'; 23 my $cluster_head = $ENV{'HDFSHOST'}; # may not be true on advanced configs 24 my $hdfs_fs_prefix = 'hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}; 24 25 my $refresh_import = 0; 25 26 my $username = `whoami`; 26 27 chomp($username); 27 28 29 `rocks > /dev/null 2>&1`; 30 my $is_rocks_cluster = ($? == 0); 31 28 32 # 1. Read and validate parameters 29 33 if (defined $ARGV[0]) … … 47 51 mkdir($gs_results_dir, 0755); 48 52 } 53 my $gs_archives_dir = $gs_collection_dir . '/archives'; 49 54 # - directories within HDFS 50 55 my $hdfs_input_dir = &urlCat($hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'import'); … … 73 78 } 74 79 # - clear out the archives regardless 80 if (-e $gs_archives_dir) 81 { 82 &shellCommand('rm -rf "' . $gs_archives_dir . '"'); 83 } 84 mkdir($gs_archives_dir, 0755); 75 85 if (&hdfsTest('d', 0, $hdfs_output_dir)) 76 86 { … … 79 89 print "Done!\n"; 80 90 } 91 # - clear out any old logs 92 if (!&dirIsEmpty('/tmp/greenstone')) 93 { 94 &shellCommand('rm /tmp/greenstone/*.*'); 95 } 96 if ($is_rocks_cluster) 97 { 98 &shellCommand('rocks run host "rm /tmp/greenstone/*.*"'); 99 } 81 100 82 101 # 3. 
Special case for *Server type infodbs (namely TDBServer and GDBMServer) 83 102 # where we start the server now to ensure it lives on the head node 84 my $server_host = ' localhost';85 my $server_port = ' 8191';103 my $server_host = ''; 104 my $server_port = ''; 86 105 my $configuration_path = $gs_collection_dir . '/etc/collect.cfg'; 87 106 my $infodbtype = `grep -P "^infodbtype" $configuration_path`; … … 106 125 exit; 107 126 } 108 # - use the client tool to add ourselve as a listener127 # - use the client tool to add ourselves as a listener 109 128 print " * Registering as listener... "; 110 129 my $client_command = $server_prefix . "Client.pl " . $server_host . " " . $server_port . " \"#a:" . $$ . "\""; … … 126 145 print " * Running import using Hadoop..."; 127 146 my $hadoop_log = $gs_results_dir . '/hadoop.log'; 128 my $hadoop_command = $hadoop_exe . " jar " . $gsdl_hadoop_ext . "/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest " . $gsdl_home . " ". $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";147 my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $ENV{'HDFSHOST'} . '" ' . $ENV{'HDFSPORT'} . ' "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1"; 129 148 &shellCommand($hadoop_command); 130 149 print "Done!\n"; … … 147 166 print " * Gathering logs from compute nodes... "; 148 167 # - local files 149 &shellCommand('cp /tmp/greenstone/*.* ' . $gs_results_dir); 168 if (!&dirIsEmpty('/tmp/greenstone')) 169 { 170 &shellCommand('cp /tmp/greenstone/*.* ' . $gs_results_dir); 171 } 150 172 if (-d $gs_collection_dir . '/logs') 151 173 { … … 153 175 } 154 176 # - remote files 155 if ($ cluster_head ne '')156 { 157 &shellCommand('rocks run host "scp /tmp/greenstone/*.* ' $cluster_head . ':' . $gs_results_dir . 
'"');177 if ($is_rocks_cluster) 178 { 179 &shellCommand('rocks run host "scp /tmp/greenstone/*.* ' . $cluster_head . ':' . $gs_results_dir . '"'); 158 180 &shellCommand('rocks run host "scp /tmp/gsimport-*/logs/*.log ' . $cluster_head . ':' . $gs_results_dir . '"'); 159 181 } … … 163 185 print " * Cleaning up temporary files... "; 164 186 &shellCommand('rm -rf /tmp/greenstone'); 165 if ($ cluster)187 if ($is_rocks_cluster) 166 188 { 167 189 &shellCommand('rocks run host "rm -rf /tmp/greenstone"'); … … 276 298 } 277 299 # /** urlCat() **/ 300 301 # /** 302 # */ 303 sub dirIsEmpty 304 { 305 my $dir = shift(@_); 306 my @files; 307 if (-e $dir) 308 { 309 opendir(DIR, $dir) or die $!; 310 @files = grep { !m/\A\.{1,2}\Z/} readdir(DIR); 311 closedir(DIR); 312 } 313 @files ? 0 : 1; 314 } 315 # /** dirIsEmpty() **/
Note: See TracChangeset for help on using the changeset viewer.