Changeset 27414 for gs2-extensions/parallel-building/trunk
- Timestamp: 2013-05-24T09:24:16+12:00 (11 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl
r27126 r27414 16 16 # 0. Init 17 17 my $collection = 'test'; 18 my $use_thrift = 1; 18 19 my $debug = 0; 19 20 my $dry_run = 0; … … 22 23 my $hadoop_exe = 'hadoop'; # you may add path 23 24 my $cluster_head = $ENV{'HDFSHOST'}; # may not be true on advanced configs 24 my $hdfs_fs_prefix = ' hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'};25 my $hdfs_fs_prefix = 'HDThriftFS://'; 25 26 my $refresh_import = 0; 26 27 my $username = `whoami`; … … 35 36 $collection = $ARGV[0]; 36 37 } 38 else 39 { 40 print STDERR "usage: hadoop_import.pl <collection> [-debug] [-dry_run] [-disable_thrift] [-refresh_import] \n\n"; 41 } 42 my $offset = 1; 43 while (defined $ARGV[$offset]) 44 { 45 if ($ARGV[$offset] eq '-debug') 46 { 47 $debug = 1; 48 } 49 if ($ARGV[$offset] eq '-disable_thrift') 50 { 51 $use_thrift = 0; 52 } 53 if ($ARGV[$offset] eq '-dry_run') 54 { 55 $dry_run = 1; 56 } 57 if ($ARGV[$offset] eq '-refresh_import') 58 { 59 $refresh_import = 1; 60 } 61 $offset++; 62 } 63 64 if (!$use_thrift) 65 { 66 $hdfs_fs_prefix = 'HDFSShell://'; 67 } 68 37 69 my $gs_collection_dir = $gsdl_home . '/collect/' . $collection; 38 70 my $gs_import_dir = $gs_collection_dir . '/import'; … … 53 85 my $gs_archives_dir = $gs_collection_dir . '/archives'; 54 86 # - directories within HDFS 55 my $hdfs_input_dir = &urlCat( $hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'import');56 my $hdfs_output_dir = &urlCat( $hdfs_fs_prefix, 'user', $username, 'gsdl', 'collect', $collection, 'archives');87 my $hdfs_input_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'import'); 88 my $hdfs_output_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'archives'); 57 89 58 90 # 2. 
Copy the import directory into HDFS … … 103 135 # - flush DNS cache too, so we are playing on a level field 104 136 &shellCommand('flush_caches.pl'); 105 &shellCommand('rocks run host "flush_caches.pl"'); 137 if ($is_rocks_cluster) 138 { 139 &shellCommand('rocks run host "flush_caches.pl"'); 140 } 106 141 107 142 # 3. Special case for *Server type infodbs (namely TDBServer and GDBMServer) … … 151 186 print " * Running import using Hadoop..."; 152 187 my $hadoop_log = $gs_results_dir . '/hadoop.log'; 153 my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $ ENV{'HDFSHOST'} . '" ' . $ENV{'HDFSPORT'} . '"' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1";188 my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $hdfs_fs_prefix . '" "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . " " . $hdfs_input_dir . " " . $hdfs_output_dir . " > " . $hadoop_log . " 2>&1"; 154 189 &shellCommand($hadoop_command); 155 190 print "Done!\n"; … … 223 258 my $paths = '"' . join('" "', @_) . '"'; 224 259 my $hdfs_command = $hadoop_exe . ' fs -' . $command . ' ' . $paths . ' 2>&1'; 260 if ($debug) 261 { 262 print STDERR "[DEBUG] $hdfs_command\n"; 263 } 225 264 &shellCommand($hdfs_command); 226 265 return $?;
Note: See TracChangeset for help on using the changeset viewer.