1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 | ###########################################################################
|
---|
4 | #
|
---|
5 | # Enhance the Terrier FileIndexer component with parallel processing
|
---|
6 | # capability.
|
---|
7 | #
|
---|
8 | # Copyright (C) 2012 New Zealand Digital Library Project
|
---|
9 | #
|
---|
10 | # This program is free software; you can redistribute it and/or modify
|
---|
11 | # it under the terms of the GNU General Public License as published by
|
---|
12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
13 | # (at your option) any later version.
|
---|
14 | #
|
---|
15 | # This program is distributed in the hope that it will be useful,
|
---|
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | # GNU General Public License for more details.
|
---|
19 | #
|
---|
20 | # You should have received a copy of the GNU General Public License
|
---|
21 | # along with this program; if not, write to the Free Software
|
---|
22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | #
|
---|
24 | ###########################################################################
|
---|
25 |
|
---|
26 | package parallel_terrier_fileindexer;
|
---|
27 |
|
---|
BEGIN {
    # Refuse to compile unless the Greenstone environment has been set up.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";
    defined $ENV{'GSDLEXTS'} or die "GSDL Extensions not enabled\n";

    # Sub-directories holding Perl code beneath each library root. They are
    # unshifted one at a time, so the last one listed ends up first in @INC.
    my @lib_dirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');

    # - the core Greenstone libraries
    my $gsdl_home = $ENV{'GSDLHOME'};
    foreach my $lib_dir (@lib_dirs) {
        unshift(@INC, "$gsdl_home/$lib_dir");
    }

    # - the libraries of every enabled extension (GSDLEXTS is colon
    #   separated), noting whether parallel building is one of them
    my $has_parallel_building = 0;
    foreach my $ext_name (split(/:/, $ENV{'GSDLEXTS'})) {
        $has_parallel_building = 1 if $ext_name eq 'parallel-building';
        foreach my $lib_dir (@lib_dirs) {
            unshift(@INC, "$gsdl_home/ext/$ext_name/$lib_dir");
        }
    }

    die "GSDL Parallel Building Extension not installed\n"
        unless $has_parallel_building;
}
|
---|
57 |
|
---|
58 | use strict;
|
---|
59 | use warnings;
|
---|
60 |
|
---|
# /** @function debugPrint
#  Write a diagnostic message to STDERR, prefixed with '[SDEBUG] ', but only
#  when debugging is switched on.
#  @param debug   true to emit the message, false to stay silent
#  @param message the text to emit; no newline is appended
#  */
sub debugPrint {
    my ($enabled, $text) = @_;
    print STDERR "[SDEBUG] $text" if $enabled;
}
# /** debugPrint(boolean, String) **/
|
---|
72 |
|
---|
# /** @function fileCat
#  Build a path by joining every argument with '/' and then squeezing each
#  run of consecutive slashes down to a single slash.
#  @param  components the path components, in order
#  @return the joined path
#  */
sub fileCat {
    my @components = @_;
    # tr///s with identical search and replacement lists squeezes repeats
    (my $joined = join('/', @components)) =~ tr{/}{/}s;
    return $joined;
}
# /** fileCat(String, String ...) */
|
---|
82 |
|
---|
# /** @function printUsage
#  Print an optional error message and the command line synopsis to STDERR,
#  print the timestamped completion line to STDOUT, then terminate the
#  process. This function never returns.
#
#  The exit status is 1 when an error message was supplied, so that calling
#  scripts can tell a failed run from a successful one, and 0 when the usage
#  text was requested without an error.
#  @param message (optional) description of what was wrong with the call
#  */
sub printUsage {
    my ($message) = @_;
    my $exit_status = 0;
    if (defined $message) {
        print STDERR 'Error! ' . $message . "\n";
        $exit_status = 1;
    }
    print STDERR 'Usage: parallel_terrier_fileindexer.pl -terrier <path> -collection <path> -workers <num> -batchsize <num> [-maxfiles <num>] [-debug]' . "\n\n";
    # localtime() is in scalar context here and so yields a readable date
    print '[' . time() . ']Parallel FileIndexer Complete: ' . localtime() . "\n";
    exit($exit_status);
}
# /** printUsage(String) **/
|
---|
97 |
|
---|
# /** @function _runCommand
#  Run an external program directly (no shell), so that paths containing
#  spaces or shell metacharacters are passed through untouched. Standard
#  output of the program is read and thrown away; standard error is left
#  alone. Failures are reported as warnings rather than being fatal.
#  Callers must always pass at least one argument after the program name,
#  otherwise Perl would fall back to running the string through a shell.
#  @param  debug   true to echo the command through debugPrint
#  @param  command the program followed by its arguments
#  @return 1 if the program ran and exited with status zero, 0 otherwise
#  */
sub _runCommand {
    my ($debug, @command) = @_;
    my $command_text = join(' ', @command);
    debugPrint($debug, 'command: ' . $command_text . "\n");
    my $pid = open(my $output, '-|', @command);
    if (!defined $pid) {
        warn('Warning! Failed to run command (' . $! . '): ' . $command_text . "\n");
        return 0;
    }
    # - drain the pipe so the child never blocks on a full buffer
    while (defined(my $discarded = readline($output))) {
    }
    # - closing a pipe waits for the child and leaves its status in $?
    if (!close($output)) {
        warn('Warning! Command failed (wait status ' . $? . '): ' . $command_text . "\n");
        return 0;
    }
    return 1;
}
# /** _runCommand(boolean, String, String ...) **/

# /** @function main
#  Drive a complete Terrier FileIndexer run using the options in @ARGV:
#   1. parse and check the command line (any problem ends in printUsage)
#   2. delete old manifest files, the old index and the old assoc directory
#   3. ask the FileIndexer to prepare the collection, writing manifest files
#   4. index: directly when there is one manifest, through mpirun otherwise
#   5. merge the resulting indexes
#  Dies if the Terrier var directory cannot be read or if the prepare step
#  leaves no manifest files behind.
#  */
sub main {
    print '[SCRIPT:' . time() . "] Starting Parallel FileIndexer\n";

    # 1. Initialization
    my $class_name = 'org.terrier.applications.FileIndexer';
    my $worker_count = 0;
    my $terrier_home = '';
    my $collection_path = '';
    my $batch_size = 0;
    my $debug = 0;
    my $max_files = 0;
    # - file names used for manifests (dot escaped, whole name must match)
    my $manifest_regex = qr/^manifest-\d+\.spec\z/;
    # - options that must be followed by a value, and where that value goes
    my %value_option = (
        '-workers'    => \$worker_count,
        '-terrier'    => \$terrier_home,
        '-collection' => \$collection_path,
        '-batchsize'  => \$batch_size,
        '-maxfiles'   => \$max_files,
    );
    # - parse arguments
    my @arguments = @ARGV;
    while (@arguments) {
        my $argument = shift(@arguments);
        if ('-debug' eq $argument) {
            $debug = 1;
        }
        elsif (exists $value_option{$argument}) {
            if (!@arguments) {
                printUsage('Missing value for argument: ' . $argument);
            }
            ${$value_option{$argument}} = shift(@arguments);
        }
        else {
            printUsage('Unrecognized argument: ' . $argument);
        }
    }
    print '[SCRIPT] Worker Count: ' . $worker_count . "\n";
    print '[SCRIPT] Terrier Home: ' . $terrier_home . "\n";
    print '[SCRIPT] Collection: ' . $collection_path . "\n";
    print '[SCRIPT] Batch Size: ' . $batch_size . "\n";
    print '[SCRIPT] Debug: ' . $debug . "\n";

    # - check arguments
    if ($worker_count !~ /^\d+$/) {
        printUsage('Worker count must be an integer');
    }
    if ('' eq $terrier_home || !-d $terrier_home) {
        printUsage('Terrier home path given doesn\'t exist or isn\'t a directory');
    }
    if ('' eq $collection_path || !-d $collection_path) {
        printUsage('Collection path given doesn\'t exist or isn\'t a directory');
    }
    if ($batch_size !~ /^\d+$/) {
        printUsage('Batch size count must be an integer');
    }
    if ($max_files !~ /^\d+$/) {
        printUsage('Max files must be an integer');
    }
    if (0 == $worker_count || 0 == $batch_size) {
        print STDOUT "Warning! Zero workers or a batch size of zero causes a serial index.\n";
        $batch_size = 0;
    }
    # - derived variables
    my $anyclass_exe = fileCat($terrier_home, 'bin', 'anyclass.sh');

    # 2. Remove any existing index
    print STDOUT "[SCRIPT] Removing old index files...\n";
    my $var_path = fileCat($terrier_home, 'var');
    opendir(my $old_dh, $var_path) or die('Error! Failed to open var path for reading: ' . $!);
    my @old_files = readdir($old_dh);
    closedir($old_dh);
    # - manifests left over from a previous run would be counted again below
    foreach my $old_file (@old_files) {
        next unless $old_file =~ $manifest_regex;
        my $old_path = fileCat($var_path, $old_file);
        debugPrint($debug, 'deleting ' . $old_path . "\n");
        unlink($old_path) or warn('Warning! Failed to delete ' . $old_path . ': ' . $! . "\n");
    }
    # - the old index and assoc directories only need removing once
    my @stale_dirs = (
        fileCat($var_path, 'index'),
        fileCat($terrier_home, 'share', 'images', 'assoc'),
    );
    foreach my $stale_dir (@stale_dirs) {
        if (-d $stale_dir) {
            _runCommand($debug, 'rm', '-rf', $stale_dir);
        }
    }

    # 3. Prepare the collection for parallel indexing
    print STDOUT "[SCRIPT] Prepare collection for indexing...\n";
    my @prepare_command = ($anyclass_exe, $class_name, '-prepare', '-path', $collection_path);
    if (0 < $batch_size) {
        push(@prepare_command, '-batchsize', $batch_size);
    }
    if (0 < $max_files) {
        push(@prepare_command, '-maxfiles', $max_files);
    }
    _runCommand($debug, @prepare_command);
    # - count the number of manifest files generated
    opendir(my $new_dh, $var_path) or die('Error! Failed to open var path for reading: ' . $!);
    my $manifest_count = grep { $_ =~ $manifest_regex } readdir($new_dh);
    closedir($new_dh);
    print STDOUT '[SCRIPT] => generated ' . $manifest_count . " manifest files\n";
    if (0 >= $manifest_count) {
        die('Error! Failed to generate any manifest files.');
    }

    # 4a. If we only have a single manifest, then we call the indexer directly.
    if (1 == $manifest_count) {
        print STDOUT "[SCRIPT] Index collection using serial processing\n";
        my $manifest_path = fileCat($var_path, 'manifest-000.spec');
        _runCommand($debug, $anyclass_exe, $class_name, '-index', '-path', $manifest_path, '-prefix', '000');
    }
    # 4b. Call OpenMPI enabled executable to perform parallel processing
    else {
        print STDOUT "[SCRIPT] Index collection with parallel processing (" . $worker_count . " workers)\n";
        my @mpi_command = ('mpirun', '--show-progress', '--verbose');
        # Excessive force! Ensure we bind to the correct network interface
        push(@mpi_command, '--mca', 'btl', 'tcp,sm,self', '--mca', 'btl_tcp_if_include', 'eth0');
        # - a machine file in the Terrier home selects cluster mode
        my $mpi_conf_path = fileCat($terrier_home, 'mpi.conf');
        if (-f $mpi_conf_path) {
            print STDOUT "(cluster)\n";
            push(@mpi_command, '-machinefile', $mpi_conf_path);
        }
        else {
            print STDOUT "(multicore)\n";
        }
        # - one process more than the number of workers is requested
        push(@mpi_command, '-np', $worker_count + 1, 'mpiterrierfileindexer', $ENV{'GSDLHOME'}, $terrier_home, $manifest_count);
        _runCommand($debug, @mpi_command);
    }

    # 5. Merge the indexes
    # - if we performed a serial process above, then this will just rename the
    #   index files
    print STDOUT "[SCRIPT] Merging Indexes\n";
    _runCommand($debug, $anyclass_exe, $class_name, '-merge');

    # Complete!
    print '[SCRIPT:' . time() . "] Complete!\n\n";
}
# /** main() **/
|
---|
287 |
|
---|
# Run the indexer as soon as this file has been compiled.
main();

# Finish with a true value, as Perl expects of a file that may be required.
1;
|
---|