Changeset 27644 for gs2-extensions/parallel-building
- Timestamp:
- 2013-06-18T10:31:34+12:00 (11 years ago)
- Files:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/bin/script/hadoop_import.pl
r27594 r27644 6 6 BEGIN 7 7 { 8 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};8 die "GSDLHOME not set\n" unless (defined $ENV{'GSDLHOME'} && $ENV{'GSDLHOME'} ne ''); 9 9 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'}; 10 10 die "GEXTPARALLELBUILDING not set\n" unless defined $ENV{'GEXTPARALLELBUILDING'}; … … 19 19 my $collection = 'test'; 20 20 my $use_thrift = 1; 21 my $start_thrift = 1;21 my $start_thrift = 0; 22 22 my $debug = 0; 23 23 my $dry_run = 0; 24 my $flush_diskcache = 0; 25 my $use_nfs = 0; 26 24 27 my $gsdl_home = $ENV{'GSDLHOME'}; 25 28 my $gsdl_hadoop_ext = $ENV{'GEXTPARALLELBUILDING_INSTALLED'}; … … 35 38 36 39 # 1. Read and validate parameters 37 if (defined $ARGV[0] )40 if (defined $ARGV[0] && $ARGV[0] =~ /^[a-z0-9]+$/i) 38 41 { 39 42 $collection = $ARGV[0]; … … 41 44 else 42 45 { 43 print STDERR "usage: hadoop_import.pl <collection> [-debug] [-dry_run] [-start_thrift] [-disable_thrift] [-refresh_import] \n\n"; 46 print STDERR "usage: hadoop_import.pl <collection> [-debug] [-dry_run] [-start_thrift] [-disable_thrift] [-refresh_import] [-flush_diskcache] [-use_nfs]\n\n"; 47 exit; 44 48 } 45 49 my $offset = 1; … … 62 66 $refresh_import = 1; 63 67 } 68 if ($ARGV[$offset] eq '-flush_diskcache') 69 { 70 $flush_diskcache = 1; 71 } 72 if ($ARGV[$offset] eq '-start_thrift') 73 { 74 $start_thrift = 1; 75 } 76 if ($ARGV[$offset] eq '-use_nfs') 77 { 78 $use_nfs = 1; 79 } 64 80 $offset++; 65 81 } … … 68 84 { 69 85 $hdfs_fs_prefix = 'HDFSShell://'; 86 } 87 if ($use_nfs) 88 { 89 $hdfs_fs_prefix = '/hdfs'; 70 90 } 71 91 … … 87 107 } 88 108 # - directories within HDFS 89 #my $hdfs_input_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'import'); 90 my $hdfs_input_dir = &urlCat('hdfs://', 'user', $username, 'gsdl', 'collect', $collection, 'import'); 91 #my $hdfs_output_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . 
$ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'archives'); 92 my $hdfs_output_dir = &urlCat('hdfs://', 'user', $username, 'gsdl', 'collect', $collection, 'archives'); 109 my $hdfs_input_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'import'); 110 print "HDFS Import Directory: " . $hdfs_input_dir . "\n"; 111 my $nfs_input_dir = &urlCat('/hdfs', 'user', $username, 'gsdl', 'collect', $collection, 'import'); 112 if ($use_nfs) 113 { 114 print "=> NFS Import Directory: " . $nfs_input_dir . "\n"; 115 } 116 my $hdfs_output_dir = &urlCat('hdfs://' . $ENV{'HDFSHOST'} . ':' . $ENV{'HDFSPORT'}, 'user', $username, 'gsdl', 'collect', $collection, 'archives'); 117 print "HDFS Archives Directory: " . $hdfs_output_dir . "\n"; 118 my $nfs_output_dir = &urlCat('/hdfs', 'user', $username, 'gsdl', 'collect', $collection, 'archives'); 119 if ($use_nfs) 120 { 121 print "=> NFS Archives Directory: " . $nfs_output_dir . "\n"; 122 } 93 123 94 124 # 2. 
Copy the import directory into HDFS 95 125 print " * Replicating import directory in HDFS..."; 96 126 # - check if import directory already exists 97 my $hdfs_import_exists = &hdfsTest('d', 0, $hdfs_input_dir); 127 my $hdfs_import_exists = 0; 128 if ($use_nfs) 129 { 130 if (-d $nfs_input_dir) 131 { 132 $hdfs_import_exists = 1; 133 } 134 } 135 else 136 { 137 $hdfs_import_exists = &hdfsTest('d', 0, $hdfs_input_dir); 138 } 98 139 if ($refresh_import || !$hdfs_import_exists) 99 140 { … … 101 142 if ($hdfs_import_exists) 102 143 { 103 &hdfsCommand('rmr', $hdfs_input_dir); 144 if ($use_nfs) 145 { 146 &recursiveDelete($nfs_input_dir, '/hdfs'); 147 } 148 else 149 { 150 &hdfsCommand('rmr', $hdfs_input_dir); 151 } 104 152 } 105 153 # - now recursively copy the contents of import directory into HDFS ensuring 106 154 # that relative paths are maintained 107 my $file_count = &recursiveCopy($gs_import_dir, $hdfs_input_dir); 155 my $file_count = 0; 156 if ($use_nfs) 157 { 158 $file_count = &recursiveCopy($gs_import_dir, $nfs_input_dir); 159 } 160 else 161 { 162 $file_count = &recursiveCopy($gs_import_dir, $hdfs_input_dir); 163 } 108 164 &debugPrint($file_count . " files 'putted'"); 109 165 print "Done!\n"; … … 113 169 print "Already exists!\n"; 114 170 } 171 115 172 # - clear out the archives regardless 116 173 my $gs_archives_dir = $gs_collection_dir . '/archives'; … … 119 176 { 120 177 print " * Clearing existing archives directory for this collection... "; 121 & shellCommand('rm -rf "' . $gs_archives_dir . 
'"');178 &recursiveDelete($gs_archives_dir, $gsdl_home); 122 179 $deleted_archives = 1; 123 180 } 124 181 mkdir($gs_archives_dir, 0755); 125 if (&hdfsTest('d', 0, $hdfs_output_dir)) 182 my $hdfs_archives_exists = 0; 183 if ($use_nfs) 184 { 185 if (-d $nfs_output_dir) 186 { 187 $hdfs_archives_exists = 1; 188 } 189 } 190 else 191 { 192 $hdfs_archives_exists = &hdfsTest('d', 0, $hdfs_output_dir) 193 } 194 if ($hdfs_archives_exists) 126 195 { 127 196 if (!$deleted_archives) … … 129 198 print " * Clearing existing archives directory for this collection... "; 130 199 } 131 &hdfsCommand('rmr', $hdfs_output_dir); 200 if ($use_nfs) 201 { 202 &recursiveDelete($nfs_output_dir, '/hdfs'); 203 } 204 else 205 { 206 &hdfsCommand('rmr', $hdfs_output_dir); 207 } 132 208 $deleted_archives = 1; 133 209 } … … 136 212 print "Done!\n"; 137 213 } 214 138 215 # - watch for cached directories for Media based collections 139 216 my $gs_cached_dir = $gs_collection_dir . '/cached'; … … 141 218 { 142 219 print " * Clearing existing cached media directory for this collection... "; 143 & shellCommand('rm -rf "' . $gs_cached_dir . '"');220 &recursiveDelete($gs_cached_dir, $gsdl_home); 144 221 print "Done!\n"; 145 222 } … … 150 227 if (!&dirIsEmpty($gs_logs_dir)) 151 228 { 152 & shellCommand('rm -f ' . $gs_logs_dir . '/*.*');229 &recursiveDelete($gs_logs_dir . '/*.*', $gsdl_home); 153 230 } 154 231 if (!&dirIsEmpty('/tmp/greenstone')) … … 156 233 &shellCommand('rm -f /tmp/greenstone/*.*'); 157 234 &shellCommand('rm -rf /tmp/gsimport*'); 235 &shellCommand('rm -rf /tmp/thrift'); 158 236 } 159 237 if ($is_rocks_cluster) … … 161 239 &shellCommand('rocks run host "rm -f /tmp/greenstone/*.*"'); 162 240 &shellCommand('rocks run host "rm -rf /tmp/gsimport*"'); 241 &shellCommand('rocks run host "rm -rf /tmp/thrift"'); 163 242 } 164 243 print "Done!\n"; 165 244 166 245 # - flush DNS cache too, so we are playing on a level field 167 print " * Flushing disk cache... 
"; 168 &shellCommand('flush_caches.pl'); 169 if ($is_rocks_cluster) 170 { 171 &shellCommand('rocks run host "flush_caches.pl"'); 172 } 173 print "Done!\n"; 246 if ($flush_diskcache) 247 { 248 print " * Flushing disk cache... "; 249 &shellCommand('flush_caches.pl'); 250 if ($is_rocks_cluster) 251 { 252 &shellCommand('rocks run host "flush_caches.pl"'); 253 } 254 print "Done!\n"; 255 } 174 256 175 257 # 3. Special case for *Server type infodbs (namely TDBServer and GDBMServer) … … 218 300 { 219 301 print " * Starting Thrift Servers (on compute nodes)... "; 220 &shellCommand('rocks run host "cd ' . $ENV{'GEXTPARALLELBUILDING'} . '/packages/ThriftFS-0.9.0/bin && ./thriftctl.sh start"');302 print "[DEBUG]\n" . &shellCommand('rocks run host "cd ' . $ENV{'GEXTPARALLELBUILDING'} . '/packages/ThriftFS-0.9.0/bin && ./thriftctl.sh start"') . "\n\n"; 221 303 } 222 304 # single server … … 227 309 } 228 310 print "Done!\n"; 311 } 312 313 my $actual_archives_dir; 314 if ($use_nfs) 315 { 316 $actual_archives_dir = $nfs_output_dir; 317 } 318 else 319 { 320 $actual_archives_dir = $hdfs_output_dir; 321 $actual_archives_dir =~ s/hdfs:\/\//$hdfs_fs_prefix/; 229 322 } 230 323 … … 236 329 print " * Running import using Hadoop..."; 237 330 my $hadoop_log = $gs_results_dir . '/hadoop.log'; 238 &shellCommand('echo "host:$HDFSHOST" > ' . $hadoop_log); 239 my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest "' . $gsdl_home . '" "' . $hdfs_fs_prefix . '" "' . $ENV{'HADOOP_PREFIX'} . '" ' . $collection . ' "' . $hdfs_input_dir . '" "' . $hdfs_output_dir . '" >> ' . $hadoop_log . ' 2>&1'; 331 &shellCommand('echo "host:' . $ENV{'HDFSHOST'} . '" > ' . $hadoop_log); 332 my $hadoop_command = $hadoop_exe . ' jar ' . $gsdl_hadoop_ext . '/lib/hadoop-greenstone.jar org.nzdl.gsdl.HadoopGreenstoneIngest '; 333 $hadoop_command .= '"' . $gsdl_home . '" '; # Greenstone's home dir 334 $hadoop_command .= '"' . $ENV{'HADOOP_PREFIX'} . 
'" '; # Hadoop's home dir 335 $hadoop_command .= $collection . ' '; # The collection name 336 $hadoop_command .= '"' . $actual_archives_dir . '" '; # Collection archive dir 337 $hadoop_command .= '"' . $hdfs_fs_prefix . '" '; # Prefix for talking to HDFS (driver) 338 $hadoop_command .= '"' . $hdfs_input_dir . '" '; # HDFS in 339 $hadoop_command .= '"' . $hdfs_output_dir . '" '; # HDFS out 340 $hadoop_command .= ' >> ' . $hadoop_log . ' 2>&1'; # Redirect to log 240 341 &shellCommand($hadoop_command); 241 342 print "Done!\n"; … … 351 452 # /** printUsage() **/ 352 453 353 # /** 354 # */ 454 455 ## @function recursiveCopy() 456 # 355 457 sub recursiveCopy 356 458 { … … 358 460 my $file_count = 0; 359 461 # - create the directory in HDFS 360 &hdfsCommand('mkdir', $hdfs_dir); 462 if ($use_nfs) 463 { 464 &shellCommand('mkdir "' . $hdfs_dir . '"'); 465 } 466 else 467 { 468 &hdfsCommand('mkdir', $hdfs_dir); 469 } 361 470 # - search $src_dir for files 362 471 opendir(DH, $src_dir) or die("Error! Cannot open directory for reading: " . $src_dir); … … 379 488 { 380 489 my $hdfs_path = $hdfs_dir . '/' . $file; 381 &hdfsCommand('put', $src_path, $hdfs_path); 490 if ($use_nfs) 491 { 492 &shellCommand('cp "' . $src_path . '" "' . $hdfs_path . '"'); 493 } 494 else 495 { 496 &hdfsCommand('put', $src_path, $hdfs_path); 497 } 382 498 $file_count++; 383 499 } … … 386 502 return $file_count; 387 503 } 388 # /** recursiveCopy() **/ 504 ## recursiveCopy() ## 505 389 506 390 507 # /** @function shellCommand … … 427 544 } 428 545 # /** dirIsEmpty() **/ 546 547 548 ## @function recursiveDelete() 549 # 550 sub recursiveDelete 551 { 552 my ($dir, $prefix) = @_; 553 if ($dir =~ /^$prefix/) 554 { 555 &shellCommand('rm -rf "' . $dir . '"'); 556 } 557 } 558 ## recursiveDelete() ##
Note: See TracChangeset for help on using the changeset viewer.