source: gs2-extensions/parallel-building/trunk/src/bin/script/buildcol.pl@ 24680

Last change on this file since 24680 was 24680, checked in by jmt12, 13 years ago

Removed sanity check for GDBM (where was my sanity - GDBM works, it's just slow because of locking). Added initializer for archiveinf-doc infodb to ensure that any GDBMServer persists throughout parallel building

  • Property svn:executable set to *
File size: 27.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will build a particular collection.
30
31package buildcol;
32
BEGIN
{
    # Greenstone cannot locate any of its modules without these two.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    defined $ENV{'GSDLOS'}   or die "GSDLOS not set\n";

    # Prepends "<root>/<subdir>" to @INC for each sub-directory in turn.
    # Because every directory is unshifted individually, the LAST
    # sub-directory named ends up FIRST in the search path.
    my $prepend_lib_dirs = sub {
        my ($root, @subdirs) = @_;
        foreach my $subdir (@subdirs)
        {
            unshift(@INC, "$root/$subdir");
        }
    };

    # Core Greenstone libraries
    $prepend_lib_dirs->($ENV{'GSDLHOME'},
                        'perllib',
                        'perllib/cpan',
                        'perllib/cpan/XML/XPath',
                        'perllib/plugins',
                        'perllib/classify');

    # Every extension contributes the same four library directories
    my @extension_subdirs = ('perllib',
                             'perllib/cpan',
                             'perllib/plugins',
                             'perllib/classify');

    # Greenstone 2 extensions (colon separated) live under GSDLHOME/ext
    if (defined $ENV{'GSDLEXTS'})
    {
        foreach my $extension (split(/:/, $ENV{'GSDLEXTS'}))
        {
            $prepend_lib_dirs->("$ENV{'GSDLHOME'}/ext/$extension",
                                @extension_subdirs);
        }
    }

    # Greenstone 3 extensions (colon separated) live under GSDL3SRCHOME/ext
    if (defined $ENV{'GSDL3EXTS'})
    {
        foreach my $extension (split(/:/, $ENV{'GSDL3EXTS'}))
        {
            $prepend_lib_dirs->("$ENV{'GSDL3SRCHOME'}/ext/$extension",
                                @extension_subdirs);
        }
    }
}
69
70use colcfg;
71use dbutil;
72use util;
73use scriptutil;
74use FileHandle;
75use gsprintf;
76use printusage;
77use parse2;
78
79use strict;
80no strict 'refs'; # allow filehandles to be variables and vice versa
81no strict 'subs'; # allow barewords (eg STDERR) as function arguments
82
83
# Values accepted by the -mode option. Each entry pairs the value with the
# resource bundle key of its description, i.e. {buildcol.mode.all},
# {buildcol.mode.compress_text}, {buildcol.mode.build_index} and
# {buildcol.mode.infodb}.
my $mode_list = [
    map { +{ 'name' => $_, 'desc' => "{buildcol.mode.$_}" } }
        qw(all compress_text build_index infodb)
];
93
# Values accepted by the -sections_index_document_metadata option. The
# description keys are {buildcol.sections_index_document_metadata.never},
# {buildcol.sections_index_document_metadata.always} and
# {buildcol.sections_index_document_metadata.unless_section_metadata_exists}.
my $sec_index_list = [
    map { +{ 'name' => $_,
             'desc' => "{buildcol.sections_index_document_metadata.$_}" } }
        qw(never always unless_section_metadata_exists)
];
102
# Specification of every command line option understood by buildcol.pl.
# The list is handed to the option parser and to the usage printers, so the
# order of the entries is the order in which they are reported.
#
# Several options deliberately have no 'deft' value: the parser then leaves
# them as "" and main() works out the effective default (possibly from the
# collection configuration file) later on.
my $arguments = [
    { name => 'remove_empty_classifications',
      desc => '{buildcol.remove_empty_classifications}',
      type => 'flag', reqd => 'no', modegli => '2' },

    { name => 'archivedir', desc => '{buildcol.archivedir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },

    { name => 'builddir', desc => '{buildcol.builddir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },

    # The 'cachedir' option is currently disabled:
    # { name => 'cachedir', desc => '{buildcol.cachedir}',
    #   type => 'string', reqd => 'no' },

    # no 'deft' - was: &util::filename_cat ($ENV{'GSDLHOME'}, "collect")
    { name => 'collectdir', desc => '{buildcol.collectdir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },

    { name => 'site', desc => '{buildcol.site}',
      type => 'string', deft => '', reqd => 'no', hiddengli => 'yes' },

    { name => 'debug', desc => '{buildcol.debug}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },

    # no 'deft' - was:
    #   &util::filename_cat("<collectdir>", "colname", "etc", "fail.log")
    { name => 'faillog', desc => '{buildcol.faillog}',
      type => 'string', reqd => 'no', modegli => '3' },

    # Parallel Build Customization
    # - this option was called 'index' but that doesn't get through
    #   parsing [jmt12]
    { name => 'indexname', desc => '{buildcol.index}',
      type => 'string', reqd => 'no', modegli => '3' },

    # Parallel Build Customization
    # - a new option to separate building of index levels [jmt12]
    { name => 'indexlevel', desc => '{buildcol.indexlevel}',
      type => 'string', reqd => 'no', modegli => '3' },

    { name => 'incremental', desc => '{buildcol.incremental}',
      type => 'flag', hiddengli => 'yes' },

    # ('modegli' => "3" is disabled for keepold and removeold)
    { name => 'keepold', desc => '{buildcol.keepold}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },

    { name => 'removeold', desc => '{buildcol.removeold}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },

    { name => 'language', desc => '{scripts.language}',
      type => 'string', reqd => 'no', modegli => '3' },

    { name => 'maxdocs', desc => '{buildcol.maxdocs}',
      type => 'int', reqd => 'no', hiddengli => 'yes' },

    { name => 'maxnumeric', desc => '{buildcol.maxnumeric}',
      type => 'int', reqd => 'no', deft => '4', range => '4,512',
      modegli => '3' },

    # no 'deft' - was: "all"
    { name => 'mode', desc => '{buildcol.mode}',
      type => 'enum', list => $mode_list, reqd => 'no', modegli => '3' },

    { name => 'no_strip_html', desc => '{buildcol.no_strip_html}',
      type => 'flag', reqd => 'no', modegli => '3' },

    { name => 'no_text', desc => '{buildcol.no_text}',
      type => 'flag', reqd => 'no', modegli => '2' },

    { name => 'sections_index_document_metadata',
      desc => '{buildcol.sections_index_document_metadata}',
      type => 'enum', list => $sec_index_list, reqd => 'no',
      modegli => '2' },

    { name => 'out', desc => '{buildcol.out}',
      type => 'string', deft => 'STDERR', reqd => 'no',
      hiddengli => 'yes' },

    # no 'deft' - was: "2"
    { name => 'verbosity', desc => '{buildcol.verbosity}',
      type => 'int', reqd => 'no', modegli => '3' },

    { name => 'gli', desc => '',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },

    { name => 'xml', desc => '{scripts.xml}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },

    { name => 'parallel', desc => '{scripts.parallel}',
      type => 'flag', reqd => 'no', hiddengli => 'yes' },
];
248
# Top level description of this script: its name, the resource bundle key
# of its description and the option specification above. Passed to the
# usage printers.
my $options = {
    name => 'buildcol.pl',
    desc => '{buildcol.desc}',
    args => $arguments,
};
252
253
# File-wide state shared by the subroutines below:
#  - $collection     name of the collection being built
#  - $configfilename path of the collection configuration file
#  - $out            name of the file handle that progress output goes to
my ($collection, $configfilename, $out);

# Which flavour of Greenstone we are building for: "gs2" (the default) or
# "gs3"
my $gs_mode = 'gs2';
261
## @method gsprintf()
# Convenience wrapper that forwards all of its arguments, unchanged, to
# gsprintf::gsprintf() and returns whatever that function returns.
#
# The parameter meanings below are taken from how this script calls the
# function - confirm against gsprintf.pm before relying on them.
#
# @param $handle The file handle (or name of a file handle, e.g. STDERR) to
#                print to.
# @param $text   The string to print. It may contain resource bundle keys
#                in curly braces (e.g. "{buildcol.unknown_mode}") which are
#                replaced by their locale dependent equivalents.
# @param ...     Optional values to substitute into the looked up text.
# @return The return value of gsprintf::gsprintf().
#
# @note This sub used to be declared with an empty prototype, i.e. as taking
#       no arguments at all, which only worked because every caller uses
#       the prototype-bypassing &gsprintf(...) form. The prototype has been
#       removed so that both call styles are valid.
#
sub gsprintf
{
    return &gsprintf::gsprintf(@_);
}
## gsprintf() ##
278
279&main();
280
## @method main()
#
# Entry point of the script. Takes no parameters: everything is read from
# @ARGV, the environment and the collection configuration file.
#
# In order, this function:
#  -# parses the command line (see $arguments) and copies every parsed
#     option into the local variable of the same name,
#  -# handles -language, -xml (print the usage as XML and return) and -gli,
#  -# insists on exactly one remaining argument, the collection name,
#  -# opens the output stream and the fail log,
#  -# reads the collection configuration file and merges it with the
#     command line. For most options the command line wins; 'maxnumeric'
#     is always taken from the configuration file when present there, and
#     'debug' and 'gli' are switched on if either source asks for them,
#  -# works out the archives/building directories and the builder class(es)
#     to use, creates and initialises the builders,
#  -# either generates an XML build 'recipe' and hands it to mpibuildcol
#     via mpirun (-parallel), or runs the passes selected by -mode,
#  -# finishes off (auxiliary files, deinit, copy back of a cached build).
#
# Dies (after printing the usage) if the arguments are invalid or the
# collection cannot be used, and dies if the output file, the fail log or
# the recipe file cannot be written.
#
# @note Added true incremental support - John Thompson, DL Consulting Ltd.
#       The incremental flag is not acted upon here, it is simply passed on
#       to the builders.
# @note There were several bugs regarding using directories other than
#       "import" or "archives" during import and build quashed. - John
#       Thompson, DL Consulting Ltd.
#
sub main
{
    # command line args
    # - $indexlevel is a new argument to allow control of index level [jmt12]
    #
    # NOTE: all of these must stay declared here, even the ones that look
    # unused: the parsed options are assigned to them *by name* in the
    # string eval below.
    my ($verbosity, $archivedir, $cachedir, $builddir, $site, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold,
        $incremental, $incremental_mode,
        $remove_empty_classifications,
        $collectdir, $build, $type, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata, $maxnumeric, $indexlevel,
        $parallel);

    # 'cachedir' is not in the option specification (it is commented out
    # there), so start from "" rather than undef to keep the string
    # comparisons further down well defined.
    $cachedir = "";

    my $xml = 0;
    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV, $arguments, $hashParsingResult, "allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy each parsed option into the variable of the same name (this
    # includes the file level $out and the local $xml). A key without a
    # matching variable makes the eval fail, which is silently ignored.
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    $textindex = "";

    # Set up the output stream. From here on $out holds the *name* of a
    # file handle (used symbolically, hence "no strict 'refs'").
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three argument open: the file name is never interpreted as a mode
        open(OUT, '>', $out) or do {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        };
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    else {
        # the test above ignores case but the handles are called STDERR
        # and STDOUT
        $out = uc($out);
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    open(FAILLOG, '>>', $faillog) or do {
        &gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog);
        die;
    };
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    # Don't know why this didn't already happen, but now collection specific
    # classify and plugins directory also added to include path
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib"); # [jmt12]
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify"); # [jmt12]
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins"); # [jmt12]

    # Read in the collection configuration file.
    my ($collectcfg, $buildtype, $orthogonalbuildtypes);
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    # sanity check - you currently can't have SQLite as the infodb while
    # asking for a parallel build
    elsif ($collectcfg->{'infodbtype'} eq 'sqlite' && $parallel)
    {
        print STDERR "WARNING: Parallel builds not current supported by SQLite - reverting to serial build\n";
        $parallel = 0;
    }

    # From here on: settings not given on the command line are taken from
    # the collection configuration, or failing that set to their default.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    } else {
        $buildtype = "mg"; #mg is the default
    }

    if (defined $collectcfg->{'orthogonalbuildtypes'}) {
        $orthogonalbuildtypes = $collectcfg->{'orthogonalbuildtypes'};
    }

    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # unlike most settings, a maxnumeric in the configuration file replaces
    # the one from the command line
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
        $maxnumeric = $collectcfg->{'maxnumeric'};
    }

    if ($maxnumeric < 4 || $maxnumeric > 512) {
        $maxnumeric = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        } else {
            $mode = "all"; # the default
        }
    }
    # - 'index' doesn't make it through parsing so I renamed this option
    #   'indexname' [jmt12]
    if (defined $collectcfg->{'indexname'} && $indexname eq "")
    {
        $indexname = $collectcfg->{'indexname'};
    }
    # - we may also define the index level to build now
    if (defined $collectcfg->{'indexlevel'} && $indexlevel eq "")
    {
        $indexlevel = $collectcfg->{'indexlevel'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }

    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
                                                   $incremental, "building",
                                                   $collectcfg);

    $gli = 0 unless defined $gli;

    # New argument to track whether build is incremental
    $incremental = 0 unless defined $incremental;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    # Modified so that the archivedir, if provided as an argument, is made
    # absolute if it isn't already
    if ($archivedir eq "")
    {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    else
    {
        $archivedir = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    # End Mod
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    if ($builddir eq "") {
        $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building");
        if ($incremental) {
            &gsprintf($out, "{buildcol.incremental_default_builddir}\n");
        }
    }
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q..\E: the collection name is literal text, not a pattern
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir);
    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {

        $builderdir = undef;
        if ($buildtype ne "") {
            # caters for extension-based build types, such as 'solr'
            $buildertype = $buildtype."builder";
        }
        else {
            # Default to mgpp
            $buildertype = "mgppbuilder";
        }
    }
    # check for extension specific builders
    # (that will then be run after main builder.pm)

    my @builderdir_list = ($builderdir);
    my @buildertype_list = ($buildertype);

    if (defined $orthogonalbuildtypes) {
        foreach my $obt (@$orthogonalbuildtypes) {

            push(@builderdir_list,undef); # rely on @INC to find it
            push(@buildertype_list,$obt."Builder");
        }
    }

    # Set up array of the main builder.pm, followed by any ones
    # from the extension folders

    my $num_builders = scalar(@buildertype_list);
    my @builders = ();

    for (my $i=0; $i<$num_builders; $i++) {
        my $this_buildertype = $buildertype_list[$i];
        my $this_builderdir = $builderdir_list[$i];

        if ((defined $this_builderdir) && ($this_builderdir ne "")) {
            require "$this_builderdir/$this_buildertype.pm";
        }
        else {
            require "$this_buildertype.pm";
        }

        # The class name is only known at run time; calling new() through
        # the name needs no string eval, and any exception thrown by the
        # constructor propagates unchanged.
        my $this_builder
            = $this_buildertype->new($site, $collection,
                                     $realarchivedir, $realbuilddir, $verbosity,
                                     $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
                                     $remove_empty_classifications,
                                     $out, $no_text, $faillog, $gli);

        push(@builders,$this_builder);
    }

    # Init phase for builders (only started once all of them exist)
    for (my $i=0; $i<$num_builders; $i++) {
        my $this_buildertype = $buildertype_list[$i];
        my $this_builder = $builders[$i];

        $this_builder->init();
        $this_builder->set_maxnumeric($maxnumeric);

        if (($this_buildertype eq "mgppbuilder") && $no_strip_html) {
            $this_builder->set_strip_html(0);
        }
        if ($sections_index_document_metadata ne "never") {
            $this_builder->set_sections_index_document_metadata($sections_index_document_metadata);
        }
    }

    # Run the requested passes
    #
    # The builders are visited with a lexical loop variable (not $_), so a
    # builder method that clobbers $_ cannot damage the entries of @builders.

    # Parallel Building Support
    # - if parallel building is requested then we subvert the normal 'all' mode
    #   process, instead attempting to create an XML 'recipe' for building this
    #   collection. We then pass this recipe to an Open MPI augmented compiled
    #   executable (which will in turn make multiple calls back to buildcol.pl
    #   according to the instructions in the recipe)!
    if ($parallel)
    {
        print $out "*** parallel building\n";
        # Some infodb modes (namely GDBMServer at the moment) need to open the
        # connection to the database in such a way that it persists over the
        # child threads. We do this by adding a dummy call to build the file path
        # to archiveinf-doc as it is the database in question. The '1' at the end
        # means launch the server... it will then persist until this block passes
        # out of scope (presumably after all the child mpi processes are done)
        my $arcinfo_doc_filename = &dbutil::get_infodb_file_path($collectcfg->{'infodbtype'}, "archiveinf-doc", $archivedir, 1);

        # we initially create the recipe as a datastructure to make it easier for
        # each builder to determine what has already been defined
        # - each step of the recipe will have a command as a string and a (possibly
        #   empty) array of steps that depend on this step (possibly recursive)
        print $out "Generating build 'recipe'\n";
        my $recipe = [];
        # pass to each builder to have it populated with appropriate commands
        foreach my $builder (@builders) {
            $builder->prepare_build_recipe($collection, $recipe);
        }
        # now write the recipe to an XML file, resolving any path macros
        my $max_parallel_tasks = scalar(@{$recipe});
        my $xml_lines = [];
        push(@{$xml_lines},'<?xml version="1.0" standalone="no" ?>');
        push(@{$xml_lines},'<Recipe>');
        foreach my $item (@{$recipe})
        {
            my $max_parallel_child_tasks = &print_recipe($xml_lines, $item);
            if ($max_parallel_child_tasks > $max_parallel_tasks)
            {
                $max_parallel_tasks = $max_parallel_child_tasks;
            }
        }
        push(@{$xml_lines}, '</Recipe>');
        my $recipe_path = &util::get_tmp_filename('.xml');
        open(XMLOUT, ">:utf8", $recipe_path) or die("Error! Failed to open recipe file for writing: " . $recipe_path . "\nReason: " . $!);
        print XMLOUT join("\n", @{$xml_lines});
        # buffered write errors only show up when the handle is closed
        close(XMLOUT) or die("Error! Failed to write recipe file: " . $recipe_path . "\nReason: " . $!);
        # determine the 'optimal' number of threads (based on number of processor
        # cores and number of indexes)
        my $number_of_threads = $max_parallel_tasks + 1; # any more is waste
        my $number_of_cores = `grep "processor" /proc/cpuinfo | wc -l`;
        $number_of_cores = '' unless defined $number_of_cores;
        $number_of_cores =~ s/\s+//g;
        print $out "Calculating optimal threads => Max parallel tasks: $max_parallel_tasks, Number of cores: $number_of_cores\n";
        # a count of 0 means the cores could not be determined (for example
        # there is no /proc/cpuinfo) - do not limit the threads in that case
        if ($number_of_cores =~ /^\d+$/ && $number_of_cores > 0 && $max_parallel_tasks > $number_of_cores)
        {
            # optimal threads for processor bound load (even though it's probably
            # IO load that has us bound)
            $number_of_threads = $number_of_cores + 1;
        }
        print $out "Optimal threads: " . $number_of_threads . "\n";
        # call mpibuildcol executable using mpirun and passing path to recipe
        my $mpirun_cmd = 'mpirun -n ' . $number_of_threads . ' mpibuildcol "' . $recipe_path . '"';
        print $out "Running command: " . $mpirun_cmd . "\n";
        print `$mpirun_cmd`;
        # report, but do not die on, a failed run so the clean up below still
        # happens
        if ($? != 0)
        {
            print STDERR "WARNING: parallel build command did not complete successfully (\$? = $?)\n";
        }
        # the recipe file is deliberately left in place for now
        #unlink($recipe_path);
    }
    # Normal mode - run compress text, followed by index building, followed by
    # infodb
    elsif ($mode =~ /^all$/i) {
        foreach my $builder (@builders) {
            $builder->compress_text($textindex);
        }
        # - note we pass the required indexname and indexlevel (if specified)
        #   to the processor
        foreach my $builder (@builders) {
            $builder->build_indexes($indexname, $indexlevel);
        }
        foreach my $builder (@builders) {
            $builder->make_infodatabase();
        }
        foreach my $builder (@builders) {
            $builder->collect_specific();
        }
    } elsif ($mode =~ /^compress_text$/i) {
        foreach my $builder (@builders) {
            $builder->compress_text($textindex);
        }
    } elsif ($mode =~ /^build_index$/i) {
        foreach my $builder (@builders) {
            $builder->build_indexes($indexname, $indexlevel);
        }
    } elsif ($mode =~ /^infodb$/i) {
        foreach my $builder (@builders) {
            $builder->make_infodatabase();
        }
    } else {
        &gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        die;
    }

    if (!$debug && !$parallel) {
        foreach my $builder (@builders) {
            $builder->make_auxiliary_files();
        }
    }
    foreach my $builder (@builders) {
        $builder->deinit();
    }

    # if the build was done in a cache directory, copy it to where it was
    # actually asked for
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
771
## @method print_recipe()
#
# Serialises one step of a build recipe, and recursively all of the steps
# that depend on it, as nested <Task> elements.
#
# @param $xml_lines Reference to the array of XML lines being built up. The
#                   lines for this step are appended to it.
# @param $item      Hash reference describing the step: 'command' holds the
#                   command as a string and the optional 'children' holds a
#                   reference to an array of dependent steps (each of the
#                   same form).
# @return The largest number of sibling steps found on any level below
#         $item - that is the greatest number of tasks that could run in
#         parallel once this step is done (0 if there are no children).
#
sub print_recipe
{
    my ($xml_lines, $item) = @_;

    # escape the characters that are special in XML, working on a copy so
    # that the recipe itself is left untouched
    my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
    my $escaped_command = $item->{'command'};
    $escaped_command =~ s/([&<>])/$entity_for{$1}/g;

    push(@{$xml_lines},
         '<Task>',
         '<Command>' . $escaped_command . '</Command>');

    # - dependent steps are nested inside this task
    my $widest_level = 0;
    my $dependents = $item->{'children'};
    if (defined $dependents)
    {
        $widest_level = scalar(@{$dependents});

        foreach my $dependent (@{$dependents})
        {
            my $width_below = &print_recipe($xml_lines, $dependent);
            $widest_level = $width_below if ($width_below > $widest_level);
        }
    }

    # - everything nested has been written, so the task can be closed
    push(@{$xml_lines}, '</Task>');

    return $widest_level;
}
Note: See TracBrowser for help on using the repository browser.