source: gs2-extensions/parallel-building/trunk/src/bin/script/parallel_terrier_fileindex.pl@ 26242

Last change on this file since 26242 was 26242, checked in by jmt12, 12 years ago

Modifications to progress messages to improve extracting information from the logs in an automated fashion

  • Property svn:executable set to *
File size: 8.1 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# Enhance the Terrier FileIndexer component with parallel processing
6# capability.
7#
8# Copyright (C) 2012 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package parallel_terrier_fileindexer;
27
BEGIN
{
  # Refuse to start unless the Greenstone environment variables are present.
  # The checks run in this order so the first missing variable is reported.
  foreach my $requirement (['GSDLHOME', "GSDLHOME not set\n"],
                           ['GSDLOS', "GSDLOS not set\n"],
                           ['GSDLEXTS', "GSDL Extensions not enabled\n"])
  {
    my ($variable, $complaint) = @{$requirement};
    die $complaint unless defined $ENV{$variable};
  }

  # Each root contributes the same four library directories. They are
  # unshifted one at a time so later entries end up earlier in @INC.
  my @library_dirs = ('perllib', 'perllib/cpan', 'perllib/plugins', 'perllib/plugouts');
  my $register_libraries = sub
  {
    my ($root) = @_;
    foreach my $library_dir (@library_dirs)
    {
      unshift(@INC, "$root/$library_dir");
    }
  };

  # Core Greenstone libraries first...
  $register_libraries->($ENV{'GSDLHOME'});

  # ...then the libraries of every extension named in the colon separated
  # GSDLEXTS list, noting whether parallel-building is among them.
  my $parallel_building_seen = 0;
  foreach my $extension_name (split(/:/, $ENV{'GSDLEXTS'}))
  {
    $parallel_building_seen = 1 if ($extension_name eq 'parallel-building');
    $register_libraries->("$ENV{'GSDLHOME'}/ext/$extension_name");
  }

  die "GSDL Parallel Building Extension not installed\n" unless $parallel_building_seen;
}
57
58use strict;
59use warnings;
60
# /** @function debugPrint
#  *  Write a diagnostic message to STDERR, prefixed with '[SDEBUG] ', but only
#  *  when debugging is switched on.
#  *  @param $debug   true to emit the message, false to stay silent
#  *  @param $message the text to print; no newline is appended
#  */
sub debugPrint
{
  my ($enabled, $text) = @_;
  print STDERR '[SDEBUG] ' . $text if ($enabled);
}
# /** debugPrint(boolean, String) **/
72
# /** @function fileCat
#  *  Build a path by joining the given parts with '/' and then squeezing every
#  *  run of consecutive slashes down to a single slash.
#  *  @param  @_ the path parts, in order
#  *  @return the joined path (an empty string when no parts are given)
#  */
sub fileCat
{
  my @parts = @_;
  my $joined = join('/', @parts);
  # tr with the /s modifier collapses repeated '/' characters into one
  $joined =~ tr{/}{/}s;
  return $joined;
}
# /** fileCat(String, String ...) */
82
# /** @function printUsage
#  *  Print an optional error message and the usage summary to STDERR, print
#  *  the timestamped completion line to STDOUT, then terminate the script.
#  *  The exit status is 1 when an error message was supplied, so that calling
#  *  processes can detect the failure, and 0 when called without a message.
#  *  @param $message (optional) description of what was wrong with the
#  *                  arguments
#  *  @return never returns; always calls exit
#  */
sub printUsage
{
  my ($message) = @_;
  my $exit_status = 0;
  if (defined $message)
  {
    print STDERR 'Error! ' . $message . "\n";
    # A reported error must not look like a successful run
    $exit_status = 1;
  }
  print STDERR 'Usage: parallel_terrier_fileindexer.pl -workers <int> -terrier <path> -collection <path> -batchsize <int> [-debug]' . "\n\n";
  # localtime() is evaluated in scalar context here, giving a readable date
  print '[' . time() . ']Parallel FileIndexer Complete: ' . localtime() . "\n";
  exit($exit_status);
}
# /** printUsage(String) **/
97
# /** @function _listManifests
#  *  List the manifest files (manifest-<digits>.spec) found in a directory.
#  *  @param  $var_path the directory to scan
#  *  @return the matching file names (without directory), in readdir order
#  *  Dies if the directory cannot be opened.
#  */
sub _listManifests
{
  my ($var_path) = @_;
  opendir(my $dh, $var_path) or die('Error! Failed to open var path for reading: ' . $!);
  # The dot is escaped and the pattern anchored at both ends so that only
  # genuine manifest files are matched
  my @manifests = grep { /^manifest-\d+\.spec$/ } readdir($dh);
  closedir($dh);
  return @manifests;
}
# /** _listManifests(String) **/

# /** @function _runCommand
#  *  Run a shell command using backticks (its standard output is captured and
#  *  discarded), echoing the command first when debugging is enabled. A
#  *  non-zero wait status is reported on STDERR but is not fatal.
#  *  @param  $debug   true to echo the command via debugPrint
#  *  @param  $command the complete shell command line
#  *  @return true if the command's wait status was zero, false otherwise
#  */
sub _runCommand
{
  my ($debug, $command) = @_;
  debugPrint($debug, 'command: ' . $command . "\n");
  `$command`;
  my $wait_status = $?;
  if (0 != $wait_status)
  {
    print STDERR 'Warning! Command returned non-zero wait status ' . $wait_status . ': ' . $command . "\n";
  }
  return (0 == $wait_status);
}
# /** _runCommand(boolean, String) **/

# /** @function main
#  *  Drive a (possibly parallel) Terrier FileIndexer build:
#  *    1. parse and validate the command line arguments
#  *    2. remove any existing manifests, index and assoc files
#  *    3. ask the FileIndexer to prepare manifest files for the collection
#  *    4. index, either directly (one manifest) or through mpirun
#  *    5. merge the resulting indexes
#  *  Arguments are read from @ARGV: -workers <int> -terrier <path>
#  *  -collection <path> -batchsize <int> [-debug]. Invalid arguments end the
#  *  script through printUsage; an unreadable var directory or the absence of
#  *  generated manifests ends it through die.
#  *
#  *  NOTE: paths are interpolated into shell command lines inside double
#  *  quotes, so paths containing double quotes, dollar signs or backticks are
#  *  not supported.
#  */
sub main
{
  print '[SCRIPT:' . time() . "] Starting Parallel FileIndexer\n";

  # 1. Initialization
  my $class_name = 'org.terrier.applications.FileIndexer';
  my $worker_count = 0;
  my $terrier_home = '';
  my $collection_path = '';
  my $batch_size = 0;
  my $debug = 0;
  # - parse arguments; options that take a value map to the variable that
  #   receives it
  my %value_options = ('-workers'    => \$worker_count,
                       '-terrier'    => \$terrier_home,
                       '-collection' => \$collection_path,
                       '-batchsize'  => \$batch_size);
  for (my $i = 0; $i < scalar(@ARGV); $i++)
  {
    my $argument = $ARGV[$i];
    if (exists $value_options{$argument})
    {
      # guard against an option given as the very last argument, which would
      # otherwise read past the end of @ARGV
      if ($i + 1 >= scalar(@ARGV))
      {
        printUsage('Missing value for argument: ' . $argument);
      }
      $i++;
      ${$value_options{$argument}} = $ARGV[$i];
    }
    elsif ('-debug' eq $argument)
    {
      $debug = 1;
    }
    else
    {
      printUsage('Unrecognized argument: ' . $argument);
    }
  }
  print '[SCRIPT] Worker Count: ' . $worker_count . "\n";
  print '[SCRIPT] Terrier Home: ' . $terrier_home . "\n";
  print '[SCRIPT] Collection:   ' . $collection_path . "\n";
  print '[SCRIPT] Batch Size:   ' . $batch_size . "\n";
  print '[SCRIPT] Debug:        ' . $debug . "\n";

  # - check arguments
  if ($worker_count !~ /^\d+$/)
  {
    printUsage('Worker count must be an integer');
  }
  if ('' eq $terrier_home || !-d $terrier_home)
  {
    printUsage('Terrier home path given doesn\'t exist or isn\'t a directory');
  }
  if ('' eq $collection_path || !-d $collection_path)
  {
    printUsage('Collection path given doesn\'t exist or isn\'t a directory');
  }
  if ($batch_size !~ /^\d+$/)
  {
    printUsage('Batch size count must be an integer');
  }
  if (0 == $worker_count || 0 == $batch_size)
  {
    print STDOUT "Warning! Zero workers or a batch size of zero causes a serial index.\n";
    # with no batch size the prepare step is run without -batchsize
    $batch_size = 0;
  }
  # - derived variables
  my $anyclass_exe = fileCat($terrier_home, 'bin', 'anyclass.sh');

  # 2. Remove any existing index
  print STDOUT "[SCRIPT] Removing old index files...\n";
  my $var_path = fileCat($terrier_home, 'var');
  foreach my $old_file (_listManifests($var_path))
  {
    my $old_path = fileCat($var_path, $old_file);
    debugPrint($debug, 'deleting ' . $old_path . "\n");
    unlink($old_path) or print STDERR 'Warning! Failed to delete ' . $old_path . ': ' . $! . "\n";
  }
  my $index_path = fileCat($var_path, 'index');
  _runCommand($debug, 'rm -rf "' . $index_path . '"');
  my $assoc_path = fileCat($terrier_home, 'share', 'images', 'assoc');
  _runCommand($debug, 'rm -rf "' . $assoc_path . '"');

  # 3. Prepare the collection for parallel indexing
  print STDOUT "[SCRIPT] Prepare collection for indexing...\n";
  my $prepare_command = $anyclass_exe . ' ' . $class_name . ' -prepare -path "' . $collection_path . '"';
  if (0 < $batch_size)
  {
    $prepare_command .= ' -batchsize ' . $batch_size;
  }
  _runCommand($debug, $prepare_command);
  # - count the number of manifest files generated
  my $manifest_count = scalar(my @manifests = _listManifests($var_path));
  print STDOUT '[SCRIPT] => generated ' . $manifest_count . " manifest files\n";
  if (0 >= $manifest_count)
  {
    die('Error! Failed to generate any manifest files.');
  }

  # 4a. If we only have a single manifest, then we call the indexer directly.
  if (1 == $manifest_count)
  {
    print STDOUT "[SCRIPT] Index collection using serial processing\n";
    my $manifest_path = fileCat($var_path, 'manifest-000.spec');
    my $index_command = $anyclass_exe . ' ' . $class_name . ' -index -path "' . $manifest_path . '" -prefix 000';
    _runCommand($debug, $index_command);
  }
  # 4b. Call OpenMPI enabled executable to perform parallel processing
  else
  {
    # no newline yet: the line is completed by "(cluster)" or "(multicore)"
    print STDOUT '[SCRIPT] Index collection using parallel processing (' . $worker_count . " workers)";
    my $mpi_flags = '--show-progress --verbose ';
    my $mpi_conf_path = fileCat($terrier_home, 'mpi.conf');
    # the presence of an mpi.conf file selects cluster mode
    if (-f $mpi_conf_path)
    {
      print STDOUT "(cluster)\n";
      $mpi_flags .= ' -nolocal -machinefile "' . $mpi_conf_path . '"';
    }
    else
    {
      print STDOUT "(multicore)\n";
    }
    # one process more than the requested number of workers is started
    my $mpi_command = 'mpirun ' . $mpi_flags . ' -np ' . ($worker_count + 1) . ' mpiterrierfileindexer "' . $ENV{'GSDLHOME'} . '" "' . $terrier_home . '" ' . $manifest_count;
    _runCommand($debug, $mpi_command);
  }

  # 5. Merge the indexes
  # - if we performed a serial process above, then this will just rename the
  #   index files
  print STDOUT "[SCRIPT] Merging Indexes\n";
  my $merge_command = $anyclass_exe . ' ' . $class_name . ' -merge';
  _runCommand($debug, $merge_command);

  # Complete!
  print '[SCRIPT:' . time() . "] Complete!\n\n";
}
# /** main() **/
268
# Run the indexer as soon as the script is loaded.
main();

# Conventional true value terminating the file.
1;
Note: See TracBrowser for help on using the repository browser.