source: gs2-extensions/parallel-building/trunk/src/bin/script/parallel_terrier_fileindex.pl@ 26998

Last change on this file since 26998 was 26998, checked in by jmt12, 11 years ago

Adding maxdocs variable, lots of debug comments, added some tests for directories rather than just deleting them (causing non-fatal errors if the directories weren't there) and added some extra flags to ensure MPI bound to correct interface (as Medusa has some phantom virtual interfaces that occasionally interfere)

  • Property svn:executable set to *
File size: 8.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# Enhance the Terrier FileIndexer component with parallel processing
6# capability.
7#
8# Copyright (C) 2012 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
package parallel_terrier_fileindexer;

# Compile-time bootstrap. Verifies that the Greenstone environment variables
# this script relies on are defined, then pushes the Greenstone library
# directories - and the matching directories of every extension listed in
# GSDLEXTS (colon separated) - onto the front of @INC. Compilation is aborted
# unless the 'parallel-building' extension is among those listed.
BEGIN
{
  # - each required environment variable, paired with the complaint issued
  #   when it is missing (checked in this order)
  my @requirements = (
    ['GSDLHOME', "GSDLHOME not set\n"],
    ['GSDLOS',   "GSDLOS not set\n"],
    ['GSDLEXTS', "GSDL Extensions not enabled\n"],
  );
  foreach my $requirement (@requirements)
  {
    my ($name, $complaint) = @{$requirement};
    defined($ENV{$name}) or die $complaint;
  }

  # - the library subdirectories registered beneath every prefix; as each is
  #   unshifted in turn, the last one listed ends up searched first
  my @library_dirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');

  # - Greenstone itself first, then each extension in the order listed
  my @extension_names = split(/:/, $ENV{'GSDLEXTS'});
  my @prefixes = ($ENV{'GSDLHOME'});
  push(@prefixes, map { "$ENV{'GSDLHOME'}/ext/$_" } @extension_names);
  foreach my $prefix (@prefixes)
  {
    foreach my $library_dir (@library_dirs)
    {
      unshift(@INC, "$prefix/$library_dir");
    }
  }

  # - this script is only meaningful when the parallel building extension
  #   has been enabled
  if (0 == scalar(grep { 'parallel-building' eq $_ } @extension_names))
  {
    die "GSDL Parallel Building Extension not installed\n";
  }
}

use strict;
use warnings;
60
# /** @function debugPrint
#  * Write a diagnostic message to STDERR, prefixed with '[SDEBUG] ', but
#  * only when debugging is switched on. No newline is appended, so callers
#  * supply their own.
#  * @param $debug   true to emit the message, false to stay silent
#  * @param $message the text to emit
#  */
sub debugPrint
{
  my ($enabled, $text) = @_;
  print STDERR "[SDEBUG] $text" if $enabled;
}
# /** debugPrint(boolean, String) **/
72
# /** @function fileCat
#  * Build a path by joining the given parts with '/' and then squeezing
#  * every run of consecutive slashes down to a single slash.
#  * @param  @_ the path components, in order
#  * @return the joined path as a string
#  */
sub fileCat
{
  my @components = @_;
  my $joined = join('/', @components);
  # - squeeze repeated slashes (e.g. from parts that already end in '/')
  $joined =~ tr{/}{/}s;
  return $joined;
}
# /** fileCat(String, String ...) */
82
# /** @function printUsage
#  * Print the command line usage to STDERR - preceded by an error message
#  * when one is supplied - followed by a timestamped completion line on
#  * STDOUT, then terminate the script. This function never returns.
#  *
#  * The exit status is 1 when an error message was supplied, so that calling
#  * scripts can detect a bad invocation, and 0 otherwise.
#  * @param $message optional description of what was wrong with the call
#  */
sub printUsage
{
  my ($message) = @_;
  my $exit_status = 0;
  if (defined $message)
  {
    print STDERR 'Error! ' . $message . "\n";
    # - an error was reported, so do not claim success to the caller
    $exit_status = 1;
  }
  print STDERR 'Usage: parallel_terrier_fileindexer.pl -terrier <path> -collection <path> -workers <num> -batchsize <num> [-maxfiles <num>] [-debug]' . "\n\n";
  # - concatenation puts localtime() in scalar context, giving the human
  #   readable date string
  print '[' . time() . ']Parallel FileIndexer Complete: ' . localtime() . "\n";
  exit($exit_status);
}
# /** printUsage(String) **/
97
# /** @function _runCommand
#  * Private helper: run a shell command, discarding its standard output,
#  * and warn on STDERR when it could not be started or did not exit cleanly.
#  * @param  $debug   true to echo the command through debugPrint first
#  * @param  $command the complete shell command line
#  * @return 1 if the command exited with status zero, 0 otherwise
#  */
sub _runCommand
{
  my ($debug, $command) = @_;
  debugPrint($debug, 'command: ' . $command . "\n");
  # - backticks capture (and here drop) the command's standard output; its
  #   standard error still reaches the terminal
  `$command`;
  if (-1 == $?)
  {
    print STDERR 'Warning! Failed to execute command (' . $! . '): ' . $command . "\n";
    return 0;
  }
  if (0 != $?)
  {
    print STDERR 'Warning! Command failed (exit status ' . ($? >> 8) . ', signal ' . ($? & 127) . '): ' . $command . "\n";
    return 0;
  }
  return 1;
}
# /** _runCommand(boolean, String) **/

# /** @function main
#  * Drive a (possibly parallel) Terrier FileIndexer run:
#  *   1. parse and validate the command line arguments
#  *   2. remove manifests, index and assoc directories left by earlier runs
#  *   3. ask the FileIndexer to prepare the collection, producing manifests
#  *   4. index - directly when there is a single manifest, via mpirun when
#  *      there are several
#  *   5. merge the resulting indexes
#  *
#  * Arguments are read from @ARGV: -terrier <path> -collection <path>
#  * -workers <num> -batchsize <num> [-maxfiles <num>] [-debug]. Invalid
#  * arguments end the script through printUsage. The script dies if the
#  * Terrier 'var' directory cannot be read or if preparation yields no
#  * manifest files. Failures of the external commands are reported as
#  * warnings on STDERR.
#  */
sub main
{
  print '[SCRIPT:' . time() . "] Starting Parallel FileIndexer\n";

  # 1. Initialization
  my $class_name = 'org.terrier.applications.FileIndexer';
  my $worker_count = 0;
  my $terrier_home = '';
  my $collection_path = '';
  my $batch_size = 0;
  my $debug = 0;
  my $max_files = 0;
  # - the name of a manifest file, matched exactly so that similarly named
  #   leftovers (backups, editor temp files) are neither deleted nor counted
  my $manifest_pattern = qr/\Amanifest-\d+\.spec\z/;
  # - parse arguments; every option other than -debug consumes the value
  #   that follows it
  my %value_options = (
    '-workers'    => \$worker_count,
    '-terrier'    => \$terrier_home,
    '-collection' => \$collection_path,
    '-batchsize'  => \$batch_size,
    '-maxfiles'   => \$max_files,
  );
  my @arguments = @ARGV;
  while (@arguments)
  {
    my $argument = shift(@arguments);
    if ('-debug' eq $argument)
    {
      $debug = 1;
    }
    elsif (exists $value_options{$argument})
    {
      # - an option given last with no value would otherwise read past the
      #   end of the argument list
      if (0 == scalar(@arguments))
      {
        printUsage('Missing value for argument: ' . $argument);
      }
      ${$value_options{$argument}} = shift(@arguments);
    }
    else
    {
      printUsage('Unrecognized argument: ' . $argument);
    }
  }
  print '[SCRIPT] Worker Count: ' . $worker_count . "\n";
  print '[SCRIPT] Terrier Home: ' . $terrier_home . "\n";
  print '[SCRIPT] Collection: ' . $collection_path . "\n";
  print '[SCRIPT] Batch Size: ' . $batch_size . "\n";
  print '[SCRIPT] Debug: ' . $debug . "\n";

  # - check arguments
  if ($worker_count !~ /\A\d+\z/)
  {
    printUsage('Worker count must be an integer');
  }
  if ('' eq $terrier_home || !-d $terrier_home)
  {
    printUsage('Terrier home path given doesn\'t exist or isn\'t a directory');
  }
  if ('' eq $collection_path || !-d $collection_path)
  {
    printUsage('Collection path given doesn\'t exist or isn\'t a directory');
  }
  if ($batch_size !~ /\A\d+\z/)
  {
    printUsage('Batch size count must be an integer');
  }
  # - this value is pasted into a shell command below, so insist on digits
  if ($max_files !~ /\A\d+\z/)
  {
    printUsage('Maximum file count must be an integer');
  }
  if (0 == $worker_count || 0 == $batch_size)
  {
    print STDOUT "Warning! Zero workers or a batch size of zero causes a serial index.\n";
    $batch_size = 0;
  }
  # - derived variables
  my $anyclass_exe = fileCat($terrier_home, 'bin', 'anyclass.sh');

  # 2. Remove any existing index
  print STDOUT "[SCRIPT] Removing old index files...\n";
  my $var_path = fileCat($terrier_home, 'var');
  opendir(my $old_dh, $var_path) or die('Error! Failed to open var path for reading: ' . $!);
  my @old_manifests = grep { $_ =~ $manifest_pattern } readdir($old_dh);
  closedir($old_dh);
  foreach my $old_file (@old_manifests)
  {
    my $old_path = fileCat($var_path, $old_file);
    debugPrint($debug, 'deleting ' . $old_path . "\n");
    if (!unlink($old_path))
    {
      print STDERR 'Warning! Failed to delete ' . $old_path . ': ' . $! . "\n";
    }
  }
  # - the old index and assoc directories only need removing once, and only
  #   if they are actually there
  my @stale_dirs = (
    fileCat($var_path, 'index'),
    fileCat($terrier_home, 'share', 'images', 'assoc'),
  );
  foreach my $stale_dir (@stale_dirs)
  {
    if (-d $stale_dir)
    {
      _runCommand($debug, 'rm -rf "' . $stale_dir . '"');
    }
  }

  # 3. Prepare the collection for parallel indexing
  print STDOUT "[SCRIPT] Prepare collection for indexing...\n";
  my $prepare_command = $anyclass_exe . ' ' . $class_name . ' -prepare -path "' . $collection_path . '"';
  if (0 < $batch_size)
  {
    $prepare_command .= ' -batchsize ' . $batch_size;
  }
  if (0 < $max_files)
  {
    $prepare_command .= ' -maxfiles ' . $max_files;
  }
  _runCommand($debug, $prepare_command);
  # - count the number of manifest files generated
  opendir(my $new_dh, $var_path) or die('Error! Failed to open var path for reading: ' . $!);
  my $manifest_count = scalar(grep { $_ =~ $manifest_pattern } readdir($new_dh));
  closedir($new_dh);
  print STDOUT '[SCRIPT] => generated ' . $manifest_count . " manifest files\n";
  if (0 >= $manifest_count)
  {
    die('Error! Failed to generate any manifest files.');
  }

  # 4a. If we only have a single manifest, then we call the indexer directly.
  if (1 == $manifest_count)
  {
    print STDOUT "[SCRIPT] Index collection using serial processing\n";
    my $manifest_path = fileCat($var_path, 'manifest-000.spec');
    my $index_command = $anyclass_exe . ' ' . $class_name . ' -index -path "' . $manifest_path . '" -prefix 000';
    _runCommand($debug, $index_command);
  }
  # 4b. Call OpenMPI enabled executable to perform parallel processing
  else
  {
    print STDOUT "[SCRIPT] Index collection with parallel processing (" . $worker_count . " workers\n";
    my $mpi_flags = '--show-progress --verbose ';
    # Excessive force! Ensure we bind to the correct network interface
    $mpi_flags .= '--mca btl tcp,sm,self --mca btl_tcp_if_include eth0 ';
    #$mpi_flags .= '-nolocal ';
    # - a machine file in the Terrier home switches from multicore to cluster
    my $mpi_conf_path = fileCat($terrier_home, 'mpi.conf');
    if (-f $mpi_conf_path)
    {
      print STDOUT "(cluster)\n";
      $mpi_flags .= '-machinefile "' . $mpi_conf_path . '" ';
    }
    else
    {
      print STDOUT "(multicore)\n";
    }
    # - one process more than the number of workers is requested
    my $mpi_command = 'mpirun ' . $mpi_flags . ' -np ' . ($worker_count + 1) . ' mpiterrierfileindexer "' . $ENV{'GSDLHOME'} . '" "' . $terrier_home . '" ' . $manifest_count;
    _runCommand($debug, $mpi_command);
  }

  # 5. Merge the indexes
  # - if we performed a serial process above, then this will just rename the
  #   index files
  print STDOUT "[SCRIPT] Merging Indexes\n";
  my $merge_command = $anyclass_exe . ' ' . $class_name . ' -merge';
  _runCommand($debug, $merge_command);

  # Complete!
  print '[SCRIPT:' . time() . "] Complete!\n\n";
}
# /** main() **/
287
# Run the indexer as soon as the script has been loaded.
main();

# Finish on a true value.
1;
Note: See TracBrowser for help on using the repository browser.