source: gs2-extensions/parallel-building/trunk/src/bin/script/buildcol.pl@ 24667

Last change on this file since 24667 was 24667, checked in by jmt12, 13 years ago

Adding another sanity test to prevent parallel building when infodb is set to GDBM

  • Property svn:executable set to *
File size: 27.1 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will build a particular collection.
30
31package buildcol;
32
BEGIN
{
    # Both variables are required before any Greenstone library can be found.
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Core library directories. Each unshift places its directory in front of
    # the previous one, so the last directory listed is searched first.
    foreach my $subdir ("perllib", "perllib/cpan", "perllib/cpan/XML/XPath",
                        "perllib/plugins", "perllib/classify")
    {
        unshift (@INC, "$ENV{'GSDLHOME'}/$subdir");
    }

    # Extension library directories. Each pair names the environment variable
    # holding a colon separated list of extension names, and the environment
    # variable holding the directory whose 'ext' folder contains them.
    my @extension_sources = (['GSDLEXTS',  'GSDLHOME'],
                             ['GSDL3EXTS', 'GSDL3SRCHOME']);
    foreach my $source (@extension_sources)
    {
        my ($exts_var, $home_var) = @{$source};
        next unless defined $ENV{$exts_var};
        # Without the home directory the paths would collapse to "/ext/...",
        # so skip the list rather than polluting @INC with bogus entries.
        next unless defined $ENV{$home_var};

        foreach my $e (split(/:/, $ENV{$exts_var}))
        {
            my $ext_prefix = "$ENV{$home_var}/ext/$e";
            foreach my $subdir ("perllib", "perllib/cpan",
                                "perllib/plugins", "perllib/classify")
            {
                unshift (@INC, "$ext_prefix/$subdir");
            }
        }
    }
}
69
70use colcfg;
71use dbutil;
72use util;
73use scriptutil;
74use FileHandle;
75use gsprintf;
76use printusage;
77use parse2;
78
79use strict;
80no strict 'refs'; # allow filehandles to be variables and vice versa
81no strict 'subs'; # allow barewords (eg STDERR) as function arguments
82
83
# Values accepted by the -mode option; each entry pairs the value with the
# resource bundle key of its description.
my $mode_list = [
    map { +{ 'name' => $_, 'desc' => "{buildcol.mode.$_}" } }
        qw(all compress_text build_index infodb)
];
93
# Values accepted by the -sections_index_document_metadata option; each entry
# pairs the value with the resource bundle key of its description.
my $sec_index_list = [
    map { +{ 'name' => $_,
             'desc' => "{buildcol.sections_index_document_metadata.$_}" } }
        qw(never always unless_section_metadata_exists)
];
102
# The command line options of buildcol.pl. This table is handed to
# parse2::parse() by main() and, wrapped in $options, to the PrintUsage
# routines.
my $arguments = [
    { name => "remove_empty_classifications",
      desc => "{buildcol.remove_empty_classifications}",
      type => "flag", reqd => "no",
      modegli => "2" },

    { name => "archivedir",
      desc => "{buildcol.archivedir}",
      type => "string", reqd => "no",
      hiddengli => "yes" },

    { name => "builddir",
      desc => "{buildcol.builddir}",
      type => "string", reqd => "no",
      hiddengli => "yes" },

    # The 'cachedir' option is disabled:
    # { name => "cachedir",
    #   desc => "{buildcol.cachedir}",
    #   type => "string", reqd => "no" },

    # collectdir: no 'deft' entry, parsearg leaves "" as the default
    # (was: &util::filename_cat ($ENV{'GSDLHOME'}, "collect"))
    { name => "collectdir",
      desc => "{buildcol.collectdir}",
      type => "string", reqd => "no",
      hiddengli => "yes" },

    { name => "site",
      desc => "{buildcol.site}",
      type => "string", reqd => "no",
      deft => "",
      hiddengli => "yes" },

    { name => "debug",
      desc => "{buildcol.debug}",
      type => "flag", reqd => "no",
      hiddengli => "yes" },

    # faillog: no 'deft' entry, parsearg leaves "" as the default
    # (was: &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"))
    { name => "faillog",
      desc => "{buildcol.faillog}",
      type => "string", reqd => "no",
      modegli => "3" },

    # Parallel Build Customization
    # - this option was called 'index' but that doesn't get thru parsing [jmt12]
    { name => "indexname",
      desc => "{buildcol.index}",
      type => "string", reqd => "no",
      modegli => "3" },

    # Parallel Build Customization
    # - a new option to separate building of index levels [jmt12]
    { name => "indexlevel",
      desc => "{buildcol.indexlevel}",
      type => "string", reqd => "no",
      modegli => "3" },

    # note: this entry carries no 'reqd' key
    { name => "incremental",
      desc => "{buildcol.incremental}",
      type => "flag",
      hiddengli => "yes" },

    # keepold / removeold: 'modegli' => "3" is disabled for both
    { name => "keepold",
      desc => "{buildcol.keepold}",
      type => "flag", reqd => "no",
      hiddengli => "yes" },

    { name => "removeold",
      desc => "{buildcol.removeold}",
      type => "flag", reqd => "no",
      hiddengli => "yes" },

    { name => "language",
      desc => "{scripts.language}",
      type => "string", reqd => "no",
      modegli => "3" },

    { name => "maxdocs",
      desc => "{buildcol.maxdocs}",
      type => "int", reqd => "no",
      hiddengli => "yes" },

    { name => "maxnumeric",
      desc => "{buildcol.maxnumeric}",
      type => "int", reqd => "no",
      deft => "4",
      range => "4,512",
      modegli => "3" },

    # mode: no 'deft' entry, parsearg leaves "" as the default (was: "all")
    { name => "mode",
      desc => "{buildcol.mode}",
      type => "enum", reqd => "no",
      list => $mode_list,
      modegli => "3" },

    { name => "no_strip_html",
      desc => "{buildcol.no_strip_html}",
      type => "flag", reqd => "no",
      modegli => "3" },

    { name => "no_text",
      desc => "{buildcol.no_text}",
      type => "flag", reqd => "no",
      modegli => "2" },

    { name => "sections_index_document_metadata",
      desc => "{buildcol.sections_index_document_metadata}",
      type => "enum", reqd => "no",
      list => $sec_index_list,
      modegli => "2" },

    { name => "out",
      desc => "{buildcol.out}",
      type => "string", reqd => "no",
      deft => "STDERR",
      hiddengli => "yes" },

    # verbosity: no 'deft' entry, parsearg leaves "" as the default (was: "2")
    { name => "verbosity",
      desc => "{buildcol.verbosity}",
      type => "int", reqd => "no",
      modegli => "3" },

    { name => "gli",
      desc => "",
      type => "flag", reqd => "no",
      hiddengli => "yes" },

    { name => "xml",
      desc => "{scripts.xml}",
      type => "flag", reqd => "no",
      hiddengli => "yes" },

    { name => "parallel",
      desc => "{scripts.parallel}",
      type => "flag", reqd => "no",
      hiddengli => "yes" },
];
248
# Top-level description of this script: its name, the resource bundle key of
# its description, and the option table defined above.
my $options = {
    name => "buildcol.pl",
    desc => "{buildcol.desc}",
    args => $arguments,
};
252
253
# File-scoped state shared with main(); all three start out undefined.
my ($collection, $configfilename, $out);

# Signifies "gs2" (the default) or "gs3".
my $gs_mode = "gs2";
261
## @method gsprintf()
# Print a string after looking it up from a locale dependent strings file.
# This function is loosely based on the idea of resource bundles as used in
# Java. It is a thin wrapper: every argument is forwarded, unchanged, to
# gsprintf::gsprintf().
#
# The sub deliberately has no prototype. It is always called with arguments,
# so an empty prototype "()" would reject every call that is not written in
# the prototype-bypassing "&gsprintf(...)" form.
#
# As called from this file the arguments are:
# @param $handle The output stream to print to (e.g. STDERR or $out).
# @param $text The string containing GS keys that should be replaced with
#              their locale dependent equivalents.
# @param ... Optional extra values, passed straight through (presumably
#            substituted into the looked-up string - see gsprintf.pm).
# @return Whatever gsprintf::gsprintf() returns.
#
sub gsprintf
{
    return &gsprintf::gsprintf(@_);
}
## gsprintf() ##
278
# Script entry point: run the build.
main();
280
## @method main()
#
# Parses and validates the arguments to the build process, merges them with
# the settings found in the collection configuration file, and then creates
# the appropriate builder object(s) to do the actual work.
#
# Takes no parameters: the options are read from @ARGV. Dies (after printing
# the usage) when the arguments cannot be parsed or no collection is named.
#
# @note Added true incremental support - John Thompson, DL Consulting Ltd.
# @note There were several bugs regarding using directories other than
#       "import" or "archives" during import and build quashed. - John
#       Thompson, DL Consulting Ltd.
# @note The -incremental flag indicates the build should not regenerate all
#       the index and metadata files, and should instead just append the
#       information found in the archives directory to the existing files.
#       None of that is done here - the flag is simply passed on to the
#       builders.
#
sub main
{
    # command line args
    # - new argument to allow control of index level [jmt12]
    # NOTE: the parsed option values are assigned to these lexicals *by name*
    # through the string eval below, so every option in $arguments needs a
    # variable of the same name in scope here (or at file level, as for $out).
    my ($verbosity, $archivedir, $cachedir, $builddir, $site, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold,
        $incremental, $incremental_mode,
        $remove_empty_classifications,
        $collectdir, $build, $type, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata, $maxnumeric, $indexlevel,
        $parallel);

    my $xml = 0;
    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy each parsed option into the variable that shares its name
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # There is no active 'cachedir' entry in $arguments, so nothing above can
    # have assigned $cachedir. Give it the same "" default the other string
    # options receive so the comparisons below don't act on undef.
    $cachedir = "" unless defined $cachedir;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open so that the file name can never be read as
        # part of the open mode
        open (OUT, ">", $out) ||
            (&gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out) && die);
        # from here on $out holds the *name* of the handle
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    open (FAILLOG, ">>", $faillog) ||
        (&gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog) && die);
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    # Don't know why this didn't already happen, but now collection specific
    # classify and plugins directory also added to include path
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib"); # [jmt12]
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify"); # [jmt12]
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins"); # [jmt12]

    # Read in the collection configuration file.
    my ($collectcfg, $buildtype, $orthogonalbuildtypes);
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    # The sanity checks are deliberately *not* chained onto the test above:
    # they must also be applied when the infodb type came from the default.
    # sanity check - you currently can't have GDBM as the infodb while
    # asking for a parallel build
    if ($parallel && $collectcfg->{'infodbtype'} eq 'gdbm')
    {
        print STDERR "WARNING: Parallel builds not supported by GDBM - reverting to serial build\n";
        $parallel = 0;
    }
    # sanity check - you currently can't have SQLite as the infodb while
    # asking for a parallel build
    elsif ($parallel && $collectcfg->{'infodbtype'} eq 'sqlite')
    {
        print STDERR "WARNING: Parallel builds not current supported by SQLite - reverting to serial build\n";
        $parallel = 0;
    }

    # For the settings below a value given on the command line wins; the
    # collection configuration is only consulted when the option was left
    # unset (exceptions: maxnumeric, debug and gli, where the configuration
    # value, when present, is applied regardless)
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    } else {
        $buildtype = "mg"; #mg is the default
    }

    if (defined $collectcfg->{'orthogonalbuildtypes'}) {
        $orthogonalbuildtypes = $collectcfg->{'orthogonalbuildtypes'};
    }

    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
        $maxnumeric = $collectcfg->{'maxnumeric'};
    }

    if ($maxnumeric < 4 || $maxnumeric > 512) {
        $maxnumeric = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        } else {
            $mode = "all"; # the default
        }
    }
    # - 'index' doesn't make it through parsing so I renamed this option
    #   'indexname' [jmt12]
    if (defined $collectcfg->{'indexname'} && $indexname eq "")
    {
        $indexname = $collectcfg->{'indexname'};
    }
    # - we may also define the index level to build now
    if (defined $collectcfg->{'indexlevel'} && $indexlevel eq "")
    {
        $indexlevel = $collectcfg->{'indexlevel'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }

    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
                                                   $incremental, "building",
                                                   $collectcfg);

    $gli = 0 unless defined $gli;

    # New argument to track whether build is incremental
    $incremental = 0 unless defined $incremental;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    # Modified so that the archivedir, if provided as an argument, is made
    # absolute if it isn't already
    if ($archivedir eq "")
    {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    else
    {
        $archivedir = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    # End Mod
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    if ($builddir eq "") {
        $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building");
        if ($incremental) {
            &gsprintf($out, "{buildcol.incremental_default_builddir}\n");
        }
    }
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/$collection/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir, $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {

        $builderdir = undef;
        if ($buildtype ne "") {
            # caters for extension-based build types, such as 'solr'
            $buildertype = $buildtype."builder";
        }
        else {
            # Default to mgpp
            $buildertype = "mgppbuilder";
        }
    }
    # check for extension specific builders
    # (that will then be run after main builder.pm)

    my @builderdir_list = ($builderdir);
    my @buildertype_list = ($buildertype);

    if (defined $orthogonalbuildtypes) {
        foreach my $obt (@$orthogonalbuildtypes) {

            push(@builderdir_list,undef); # rely on @INC to find it
            push(@buildertype_list,$obt."Builder");
        }
    }

    # Set up array of the main builder.pm, followed by any ones
    # from the extension folders

    my $num_builders = scalar(@buildertype_list);
    my @builders = ();

    for (my $i=0; $i<$num_builders; $i++) {
        my $this_builder;
        my $this_buildertype = $buildertype_list[$i];
        my $this_builderdir = $builderdir_list[$i];

        if ((defined $this_builderdir) && ($this_builderdir ne "")) {
            require "$this_builderdir/$this_buildertype.pm";
        }
        else {
            require "$this_buildertype.pm";
        }

        # The class name is only known at run time, hence the string eval;
        # any error raised by the constructor is re-thrown just below
        eval("\$this_builder = new $this_buildertype(\$site, \$collection, " .
             "\$realarchivedir, \$realbuilddir, \$verbosity, " .
             "\$maxdocs, \$debug, \$keepold, \$incremental, \$incremental_mode, " .
             "\$remove_empty_classifications, " .
             "\$out, \$no_text, \$faillog, \$gli)");
        die "$@" if $@;

        push(@builders,$this_builder);
    }

    # Init phase for builders
    for (my $i=0; $i<$num_builders; $i++) {
        my $this_buildertype = $buildertype_list[$i];
        my $this_builder = $builders[$i];

        $this_builder->init();
        $this_builder->set_maxnumeric($maxnumeric);

        if (($this_buildertype eq "mgppbuilder") && $no_strip_html) {
            $this_builder->set_strip_html(0);
        }
        if ($sections_index_document_metadata ne "never") {
            $this_builder->set_sections_index_document_metadata($sections_index_document_metadata);
        }
    }

    # Run the requested passes
    #
    # The builders are visited with 'foreach my ...' loops: the lexical loop
    # variable means that builder code which overwrites the global $_ cannot
    # clobber the entries of @builders.

    # Parallel Building Support
    # - if parallel building is requested then we subvert the normal 'all' mode
    #   process, instead attempting to create an XML 'recipe' for building this
    #   collection. We then pass this recipe to an Open MPI augmented compiled
    #   executable (which will in turn make multiple calls back to buildcol.pl
    #   according to the instructions in the recipe)!
    if ($parallel)
    {
        print $out "*** parallel building\n";
        # we initially create the recipe as a datastructure to make it easier for
        # each builder to determine what has already been defined
        # - each step of the recipe will have a command as a string and a (possibly
        #   empty) array of steps that depend on this step (possibly recursive)
        print $out "Generating build 'recipe'\n";
        my $recipe = [];
        # pass to each builder to have it populated with appropriate commands
        foreach my $this_builder (@builders) {
            $this_builder->prepare_build_recipe($collection, $recipe);
        }
        # now write the recipe to an XML file, resolving any path macros
        my $max_parallel_tasks = scalar(@{$recipe});
        my $xml_lines = [];
        push(@{$xml_lines},'<?xml version="1.0" standalone="no" ?>');
        push(@{$xml_lines},'<Recipe>');
        foreach my $item (@{$recipe})
        {
            my $max_parallel_child_tasks = &print_recipe($xml_lines, $item);
            if ($max_parallel_child_tasks > $max_parallel_tasks)
            {
                $max_parallel_tasks = $max_parallel_child_tasks;
            }
        }
        push(@{$xml_lines}, '</Recipe>');
        my $recipe_path = &util::get_tmp_filename('.xml');
        open(XMLOUT, ">:utf8", $recipe_path) or die("Error! Failed to open recipe file for writing: " . $recipe_path . "\nReason: " . $!);
        print XMLOUT join("\n", @{$xml_lines});
        close(XMLOUT);
        # determine the 'optimal' number of threads (based on number of processor
        # cores and number of indexes)
        my $number_of_threads = $max_parallel_tasks + 1; # any more is waste
        my $number_of_cores = `grep "processor" /proc/cpuinfo | wc -l`;
        $number_of_cores =~ s/\r?\n//g;
        print $out "Calculating optimal threads => Max parallel tasks: $max_parallel_tasks, Number of cores: $number_of_cores\n";
        # a count of 0 means the cores could not be determined (e.g. there is
        # no /proc/cpuinfo), in which case the thread count is left alone
        if ($number_of_cores =~ /^\d+$/ && $number_of_cores > 0 && $max_parallel_tasks > $number_of_cores)
        {
            # optimal threads for processor bound load (even though it's probably
            # IO load that has us bound)
            $number_of_threads = $number_of_cores + 1;
        }
        print $out "Optimal threads: " . $number_of_threads . "\n";
        # call mpibuildcol executable using mpirun and passing path to recipe
        my $mpirun_cmd = 'mpirun -n ' . $number_of_threads . ' mpibuildcol "' . $recipe_path . '"';
        print $out "Running command: " . $mpirun_cmd . "\n";
        print `$mpirun_cmd`;
        # clean up recipe (currently disabled, so the recipe file is left behind)
        #unlink($recipe_path);
    }
    # Normal mode - run compress text, followed by index building, followed by
    # infodb
    elsif ($mode =~ /^all$/i) {
        foreach my $this_builder (@builders) {
            $this_builder->compress_text($textindex);
        }
        # - note we pass the required indexname and indexlevel (if specified)
        #   to the processor
        foreach my $this_builder (@builders) {
            $this_builder->build_indexes($indexname, $indexlevel);
        }
        foreach my $this_builder (@builders) {
            $this_builder->make_infodatabase();
        }
        foreach my $this_builder (@builders) {
            $this_builder->collect_specific();
        }
    } elsif ($mode =~ /^compress_text$/i) {
        foreach my $this_builder (@builders) {
            $this_builder->compress_text($textindex);
        }
    } elsif ($mode =~ /^build_index$/i) {
        foreach my $this_builder (@builders) {
            $this_builder->build_indexes($indexname, $indexlevel);
        }
    } elsif ($mode =~ /^infodb$/i) {
        foreach my $this_builder (@builders) {
            $this_builder->make_infodatabase();
        }
    } else {
        (&gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode) && die);
    }

    if (!$debug && !$parallel) {
        foreach my $this_builder (@builders) {
            $this_builder->make_auxiliary_files();
        }
    }
    foreach my $this_builder (@builders) {
        $this_builder->deinit();
    }

    # when the build was done in a cache directory, copy the result back
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
770
## @function print_recipe()
#
# Appends the XML for one recipe step - and, recursively, for every step
# nested inside it - to a buffer of XML lines.
#
# @param $xml_lines Reference to the array of XML lines being accumulated.
# @param $item Hash reference describing one step: its 'command' string is
#              written out (with &, < and > escaped) and its optional
#              'children' array holds the steps nested within this one.
# @return The largest number of sibling steps found in any 'children' array
#         at or below this step (0 when the step has no children).
#
sub print_recipe
{
    my ($xml_lines, $item) = @_;

    # open the task and write out its (escaped) command
    push(@{$xml_lines}, '<Task>');
    my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
    my $escaped_command = $item->{'command'};
    $escaped_command =~ s/([&<>])/$entity_for{$1}/g;
    push(@{$xml_lines}, '<Command>' . $escaped_command . '</Command>');

    # nested steps are written inside the task, before it is closed
    my $widest = 0;
    my $children = $item->{'children'};
    if (defined $children)
    {
        $widest = scalar(@{$children});
        foreach my $child (@{$children})
        {
            my $child_widest = print_recipe($xml_lines, $child);
            $widest = $child_widest if ($child_widest > $widest);
        }
    }

    push(@{$xml_lines}, '</Task>');
    return $widest;
}
Note: See TracBrowser for help on using the repository browser.