Changeset 26242
- Timestamp:
- 2012-09-25T12:41:51+12:00 (12 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/bin/script/parallel_terrier_fileindex.pl
r26187 r26242 66 66 if ($debug) 67 67 { 68 print STDERR '[ debug] ' . $message;68 print STDERR '[SDEBUG] ' . $message; 69 69 } 70 70 } … … 100 100 sub main 101 101 { 102 print '[' . time() . '] Parallel FileIndexer Started: ' . localtime() . "\n"; 103 104 # 1. Initialization 105 my $class_name = 'org.terrier.applications.FileIndexer'; 106 my $worker_count = 0; 107 my $terrier_home = ''; 108 my $collection_path = ''; 109 my $batch_size = 0; 110 my $debug = 0; 111 # - parse arguments 112 my $argument; 113 for (my $i = 0; $i < scalar(@ARGV); $i++) 114 { 115 $argument = $ARGV[$i]; 116 if ('-workers' eq $argument) 117 { 118 $i++; 119 $worker_count = $ARGV[$i]; 120 } 121 elsif ('-terrier' eq $argument) 122 { 123 $i++; 124 $terrier_home = $ARGV[$i]; 125 } 126 elsif ('-collection' eq $argument) 127 { 128 $i++; 129 $collection_path = $ARGV[$i]; 130 } 131 elsif ('-batchsize' eq $argument) 132 { 133 $i++; 134 $batch_size = $ARGV[$i]; 135 } 136 elsif ('-debug' eq $argument) 137 { 138 $debug = 1; 139 } 140 else 141 { 142 &printUsage('Unrecognized argument: ' . $argument); 143 } 144 } 145 print 'Worker Count: ' . $worker_count . "\n"; 146 print 'Terrier Home: ' . $terrier_home . "\n"; 147 print 'Collection: ' . $collection_path . "\n"; 148 print 'Batch Size: ' . $batch_size . "\n"; 149 print 'Debug: ' . $debug . "\n"; 150 # - check arguments 151 if ($worker_count !~ /^\d+$/) 152 { 153 &printUsage('Worker count must be an integer'); 154 } 155 if ('' eq $terrier_home || !-d $terrier_home) 156 { 157 &printUsage('Terrier home path given doesn\'t exist or isn\'t a directory'); 158 } 159 if ('' eq $collection_path || !-d $collection_path) 160 { 161 &printUsage('Collection path given doesn\'t exist or isn\'t a directory'); 162 } 163 if ($batch_size !~ /^\d+$/) 164 { 165 &printUsage('Batch size count must be an integer'); 166 } 167 if (0 == $worker_count || 0 == $batch_size) 168 { 169 print STDOUT "Warning! 
Zero workers or a batch size of zero causes a serial index.\n"; 170 $batch_size = 0; 171 } 172 # - derived variables 173 my $anyclass_exe = &fileCat($terrier_home, 'bin', 'anyclass.sh'); 174 175 176 # 2. Remove any existing index 177 print STDOUT " * Removing any old index\n"; 178 my $var_path = &fileCat($terrier_home, 'var'); 179 opendir(DH, $var_path) or die('Error! Failed to open var path for reading: ' . $!); 180 my @old_files = readdir(DH); 181 closedir(DH); 182 my $old_file; 183 foreach $old_file (@old_files) 184 { 185 if ($old_file =~ /^manifest-\d+.spec/) 186 { 187 my $old_path = &fileCat($var_path, $old_file); 188 &debugPrint($debug, 'deleting ' . $old_path . "\n"); 189 unlink($old_path); 190 } 191 } 192 my $index_path = &fileCat($var_path, 'index'); 193 my $delete_command = 'rm -rf "' . $index_path . '"'; 194 &debugPrint($debug, 'command: ' . $delete_command . "\n"); 195 `$delete_command`; 196 my $assoc_path = &fileCat($var_path, 'share', 'images', 'assoc'); 197 $delete_command = 'rm -rf "' . $assoc_path . '"'; 198 &debugPrint($debug, 'command: ' . $delete_command . "\n"); 199 `$delete_command`; 200 print STDOUT ' - cleanup complete!' . "\n"; 201 202 # 3. Prepare the collection for parallel indexing 203 print STDOUT " * Prepare collection for indexing\n"; 102 print '[SCRIPT:' . time() . "] Starting Parallel FileIndexer\n"; 103 104 # 1. 
Initialization 105 my $class_name = 'org.terrier.applications.FileIndexer'; 106 my $worker_count = 0; 107 my $terrier_home = ''; 108 my $collection_path = ''; 109 my $batch_size = 0; 110 my $debug = 0; 111 # - parse arguments 112 my $argument; 113 for (my $i = 0; $i < scalar(@ARGV); $i++) 114 { 115 $argument = $ARGV[$i]; 116 if ('-workers' eq $argument) 117 { 118 $i++; 119 $worker_count = $ARGV[$i]; 120 } 121 elsif ('-terrier' eq $argument) 122 { 123 $i++; 124 $terrier_home = $ARGV[$i]; 125 } 126 elsif ('-collection' eq $argument) 127 { 128 $i++; 129 $collection_path = $ARGV[$i]; 130 } 131 elsif ('-batchsize' eq $argument) 132 { 133 $i++; 134 $batch_size = $ARGV[$i]; 135 } 136 elsif ('-debug' eq $argument) 137 { 138 $debug = 1; 139 } 140 else 141 { 142 &printUsage('Unrecognized argument: ' . $argument); 143 } 144 } 145 print '[SCRIPT] Worker Count: ' . $worker_count . "\n"; 146 print '[SCRIPT] Terrier Home: ' . $terrier_home . "\n"; 147 print '[SCRIPT] Collection: ' . $collection_path . "\n"; 148 print '[SCRIPT] Batch Size: ' . $batch_size . "\n"; 149 print '[SCRIPT] Debug: ' . $debug . "\n"; 150 151 # - check arguments 152 if ($worker_count !~ /^\d+$/) 153 { 154 &printUsage('Worker count must be an integer'); 155 } 156 if ('' eq $terrier_home || !-d $terrier_home) 157 { 158 &printUsage('Terrier home path given doesn\'t exist or isn\'t a directory'); 159 } 160 if ('' eq $collection_path || !-d $collection_path) 161 { 162 &printUsage('Collection path given doesn\'t exist or isn\'t a directory'); 163 } 164 if ($batch_size !~ /^\d+$/) 165 { 166 &printUsage('Batch size count must be an integer'); 167 } 168 if (0 == $worker_count || 0 == $batch_size) 169 { 170 print STDOUT "Warning! Zero workers or a batch size of zero causes a serial index.\n"; 171 $batch_size = 0; 172 } 173 # - derived variables 174 my $anyclass_exe = &fileCat($terrier_home, 'bin', 'anyclass.sh'); 175 176 # 2. 
Remove any existing index 177 print STDOUT "[SCRIPT] Removing old index files...\n"; 178 my $var_path = &fileCat($terrier_home, 'var'); 179 opendir(DH, $var_path) or die('Error! Failed to open var path for reading: ' . $!); 180 my @old_files = readdir(DH); 181 closedir(DH); 182 my $old_file; 183 foreach $old_file (@old_files) 184 { 185 if ($old_file =~ /^manifest-\d+.spec/) 186 { 187 my $old_path = &fileCat($var_path, $old_file); 188 &debugPrint($debug, 'deleting ' . $old_path . "\n"); 189 unlink($old_path); 190 } 191 } 192 my $index_path = &fileCat($var_path, 'index'); 193 my $delete_command = 'rm -rf "' . $index_path . '"'; 194 &debugPrint($debug, 'command: ' . $delete_command . "\n"); 195 `$delete_command`; 196 my $assoc_path = &fileCat($terrier_home, 'share', 'images', 'assoc'); 197 $delete_command = 'rm -rf "' . $assoc_path . '"'; 198 &debugPrint($debug, 'command: ' . $delete_command . "\n"); 199 `$delete_command`; 200 201 # 3. Prepare the collection for parallel indexing 202 print STDOUT "[SCRIPT] Prepare collection for indexing...\n"; 204 203 my $prepare_command = $anyclass_exe . ' ' . $class_name . ' -prepare -path "' . $collection_path . '"'; 205 204 if (0 < $batch_size) … … 221 220 } 222 221 } 223 print STDOUT ' - generated ' . $manifest_count . ' manifest files' . "\n"; 224 print STDOUT ' - preparation complete!' . "\n"; 222 print STDOUT '[SCRIPT] => generated ' . $manifest_count . " manifest files\n"; 225 223 if (0 >= $manifest_count) 226 224 { … … 231 229 if (1 == $manifest_count) 232 230 { 233 print STDOUT ' * Index collection with serial processing' . "\n";231 print STDOUT "[SCRIPT] Index collection using serial processing\n"; 234 232 my $manifest_path = &fileCat($var_path, 'manifest-000.spec'); 235 233 my $index_command = $anyclass_exe . ' ' . $class_name . ' -index -path "' . $manifest_path . '" -prefix 000'; … … 240 238 else 241 239 { 242 print STDOUT ' * Index collection with parallel processing (' . $worker_count . ' workers)' . 
"\n";243 my $mpi_flags = '--show-progress -- timestamp-output --verbose --report-bindings --tag-output';240 print STDOUT '[SCRIPT] Index collection using parallel processing (' . $worker_count . " workers)"; 241 my $mpi_flags = '--show-progress --verbose '; 244 242 my $mpi_conf_path = &fileCat($terrier_home, 'mpi.conf'); 245 243 if (-f $mpi_conf_path) 246 244 { 247 print STDOUT ' - parallel processing on cluster' . "\n";245 print STDOUT "(cluster)\n"; 248 246 $mpi_flags .= ' -nolocal -machinefile "' . $mpi_conf_path . '"'; 249 247 } 250 248 else 251 249 { 252 print STDOUT ' - parallel processing on multicore computer' . "\n";250 print STDOUT "(multicore)\n"; 253 251 } 254 252 my $mpi_command = 'mpirun ' . $mpi_flags . ' -np ' . ($worker_count + 1) . ' mpiterrierfileindexer "' . $ENV{'GSDLHOME'} . '" "' . $terrier_home . '" ' . $manifest_count; … … 256 254 `$mpi_command`; 257 255 } 258 print STDOUT ' - indexing complete!' . "\n";259 256 260 257 # 5. Merge the indexes 261 258 # - if we performed a serial process above, then this will just rename the 262 259 # index files 263 print STDOUT ' * Merging Indexes' . "\n";260 print STDOUT "[SCRIPT] Merging Indexes\n"; 264 261 my $merge_command = $anyclass_exe . ' ' . $class_name . ' -merge'; 265 262 &debugPrint($debug, 'command: ' . $merge_command . "\n"); 266 263 `$merge_command`; 267 print STDOUT ' - merging complete!' . "\n";268 264 269 265 # Complete! 270 print '[ ' . time() . '] Parallel FileIndexer Complete: ' . localtime() . "\n";266 print '[SCRIPT:' . time() . "] Complete!\n\n"; 271 267 } 272 268
Note: See TracChangeset for help on using the changeset viewer.