source: main/trunk/greenstone2/bin/script/buildcol.pl@ 21822

Last change on this file since 21822 was 21822, checked in by ak19, 14 years ago

Dr Bainbridge has fixed several perl files that depended on perl 5.8 to work and used to fail with Perl 5.10.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.4 KB
Line 
1#!/usr/bin/perl -w
2
3## @file buildcol.pl
4# This program will build a particular collection.
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation; either version 2 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program; if not, write to the Free Software
21# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22#
23# @author New Zealand Digital Library Project unless otherwise stated
24# @copy 1999 New Zealand Digital Library Project
25#
26package buildcol;
27
# Compile-time set-up of the module search path.
#
# Dies unless both GSDLHOME and GSDLOS are present in the environment, then
# prepends the Greenstone library directories (and the library directories of
# any extensions named in the colon-separated GSDLEXTS / GSDL3EXTS variables)
# to @INC. Every directory is unshifted in turn, so within each group the
# directory listed last ends up being searched first.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Core library directories under GSDLHOME.
    foreach my $subdir ("perllib",
                        "perllib/cpan",
                        "perllib/cpan/XML/XPath",
                        "perllib/plugins",
                        "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$subdir");
    }

    # Library directories that are added for every extension.
    my @ext_subdirs = ("perllib",
                       "perllib/cpan",
                       "perllib/plugins",
                       "perllib/classify");

    # Extensions that live under GSDLHOME/ext.
    if (defined $ENV{'GSDLEXTS'}) {
        foreach my $e (split(/:/, $ENV{'GSDLEXTS'})) {
            my $ext_prefix = "$ENV{'GSDLHOME'}/ext/$e";
            foreach my $subdir (@ext_subdirs) {
                unshift (@INC, "$ext_prefix/$subdir");
            }
        }
    }

    # Extensions that live under GSDL3SRCHOME/ext. These are skipped when
    # GSDL3SRCHOME is not set: interpolating an undefined value would
    # otherwise put root-level "/ext/..." directories at the front of @INC.
    if (defined $ENV{'GSDL3EXTS'} && defined $ENV{'GSDL3SRCHOME'}) {
        foreach my $e (split(/:/, $ENV{'GSDL3EXTS'})) {
            my $ext_prefix = "$ENV{'GSDL3SRCHOME'}/ext/$e";
            foreach my $subdir (@ext_subdirs) {
                unshift (@INC, "$ext_prefix/$subdir");
            }
        }
    }
}
61
62use colcfg;
63use dbutil;
64use util;
65use scriptutil;
66use FileHandle;
67use gsprintf;
68use printusage;
69use parse2;
70
71use strict;
72no strict 'refs'; # allow filehandles to be variables and vice versa
73no strict 'subs'; # allow barewords (eg STDERR) as function arguments
74
75
# Values accepted by the -mode option. Each entry pairs the mode name with
# the "{buildcol.mode.<name>}" key used as its description.
my $mode_list = [
    map { +{ 'name' => $_, 'desc' => "{buildcol.mode.$_}" } }
        qw(all compress_text build_index infodb)
];
85
# Values accepted by the -sections_index_document_metadata option. Each entry
# pairs the value with its
# "{buildcol.sections_index_document_metadata.<value>}" description key.
my $sec_index_list = [
    map { +{ 'name' => $_,
             'desc' => "{buildcol.sections_index_document_metadata.$_}" } }
        qw(never always unless_section_metadata_exists)
];
94
# Table of the command line options understood by this script. It is handed
# to parse2::parse() in main() and, via $options, to the PrintUsage routines.
# Each entry names the option ('name'), the key of its description ('desc')
# and its 'type'; the remaining keys ('reqd', 'deft', 'range', 'list',
# 'modegli', 'hiddengli') are passed through as given.
my $arguments = [
    {   name      => 'remove_empty_classifications',
        desc      => '{buildcol.remove_empty_classifications}',
        type      => 'flag',
        reqd      => 'no',
        modegli   => '2',
    },
    {   name      => 'archivedir',
        desc      => '{buildcol.archivedir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'builddir',
        desc      => '{buildcol.builddir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
#   {   name      => 'cachedir',
#       desc      => '{buildcol.cachedir}',
#       type      => 'string',
#       reqd      => 'no',
#   },
    {   name      => 'collectdir',
        desc      => '{buildcol.collectdir}',
        type      => 'string',
        # parsearg left "" as default
        #deft     => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'site',
        desc      => '{buildcol.site}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'debug',
        desc      => '{buildcol.debug}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'faillog',
        desc      => '{buildcol.faillog}',
        type      => 'string',
        # parsearg left "" as default
        #deft     => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        reqd      => 'no',
        modegli   => '3',
    },
    {   name      => 'index',
        desc      => '{buildcol.index}',
        type      => 'string',
        reqd      => 'no',
        modegli   => '3',
    },
    {   name      => 'incremental',
        desc      => '{buildcol.incremental}',
        type      => 'flag',
        hiddengli => 'yes',
    },
    {   name      => 'keepold',
        desc      => '{buildcol.keepold}',
        type      => 'flag',
        reqd      => 'no',
        #modegli  => '3',
        hiddengli => 'yes',
    },
    {   name      => 'removeold',
        desc      => '{buildcol.removeold}',
        type      => 'flag',
        reqd      => 'no',
        #modegli  => '3',
        hiddengli => 'yes',
    },
    {   name      => 'language',
        desc      => '{scripts.language}',
        type      => 'string',
        reqd      => 'no',
        modegli   => '3',
    },
    {   name      => 'maxdocs',
        desc      => '{buildcol.maxdocs}',
        type      => 'int',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'maxnumeric',
        desc      => '{buildcol.maxnumeric}',
        type      => 'int',
        reqd      => 'no',
        deft      => '4',
        range     => '4,512',
        modegli   => '3',
    },
    {   name      => 'mode',
        desc      => '{buildcol.mode}',
        type      => 'enum',
        list      => $mode_list,
        # parsearg left "" as default
        #deft     => 'all',
        reqd      => 'no',
        modegli   => '3',
    },
    {   name      => 'no_strip_html',
        desc      => '{buildcol.no_strip_html}',
        type      => 'flag',
        reqd      => 'no',
        modegli   => '3',
    },
    {   name      => 'no_text',
        desc      => '{buildcol.no_text}',
        type      => 'flag',
        reqd      => 'no',
        modegli   => '2',
    },
    {   name      => 'sections_index_document_metadata',
        desc      => '{buildcol.sections_index_document_metadata}',
        type      => 'enum',
        list      => $sec_index_list,
        reqd      => 'no',
        modegli   => '2',
    },
    {   name      => 'out',
        desc      => '{buildcol.out}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'verbosity',
        desc      => '{buildcol.verbosity}',
        type      => 'int',
        # parsearg left "" as default
        #deft     => '2',
        reqd      => 'no',
        modegli   => '3',
    },
    {   name      => 'gli',
        desc      => '',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'xml',
        desc      => '{scripts.xml}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];
226
# Description of this script as a whole (name, description key and option
# table); main() passes it to the PrintUsage routines.
my $options = {
    name => 'buildcol.pl',
    desc => '{buildcol.desc}',
    args => $arguments,
};


# globals
my ($collection, $configfilename, $out);

# used to signify "gs2"(default) or "gs3"
my $gs_mode = 'gs2';
239
## @method gsprintf()
# Convenience wrapper: forwards every argument, unchanged, to
# gsprintf::gsprintf() and returns whatever that routine returns.
#
# The callers in this file pass, in order:
#   - the output handle (a bareword such as STDERR, or a string holding a
#     handle name such as "buildcol::OUT"),
#   - a string containing {resource.key} placeholders,
#   - optionally, further values for the message.
# Looking the keys up in the locale dependent strings file and printing the
# result is presumably done by the gsprintf module - that code is not
# visible from this file.
#
# Declared without a prototype: the routine is always called with arguments,
# so an empty "()" prototype would reject any call not written with a
# leading "&".
#
# @return Whatever gsprintf::gsprintf() returns.
#
sub gsprintf
{
    return &gsprintf::gsprintf(@_);
}
## gsprintf() ##
256
# Script entry point: run the build, passing no arguments.
main();
258
## @method main()
#
# Parses and validates the arguments to the build process, merges in the
# settings found in the collection configuration file, then creates the
# appropriate builder object and runs the requested build stages.
#
# Takes no parameters: options are read from @ARGV, and exactly one
# non-option argument (the collection name) must be left over after option
# parsing. Usage is printed and the script dies when option parsing fails,
# when the collection argument is missing or -h is given, or when the
# collection cannot be selected. It also dies when the output file or the
# fail log cannot be opened. With -xml the usage is printed as XML and the
# routine returns without building anything.
#
# @note Added true incremental support - John Thompson, DL Consulting Ltd.
# @note There were several bugs regarding using directories other than
#       "import" or "archives" during import and build quashed. - John
#       Thompson, DL Consulting Ltd.
# @note The -incremental option is not acted on here beyond being checked
#       against -removeold/-keepold; it is handed to the builder, which is
#       expected to append to the existing index and metadata files instead
#       of regenerating them.
#
sub main
{
    # command line args
    my ($verbosity, $archivedir, $cachedir, $builddir, $site, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold,
        $incremental, $incremental_mode,
        $remove_empty_classifications,
        $collectdir, $build, $type, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata, $maxnumeric);

    my $xml = 0;
    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy every parsed option into the variable of the same name (the
    # lexicals declared above, plus the file-scoped $out). This has to stay
    # a string eval because the variable name is only known at run time;
    # keys without a matching variable are silently ignored.
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # The option is called "index", so the loop above fills in $index,
    # whereas the rest of this routine works with $indexname. Carry the
    # value across, otherwise an index named on the command line is ignored.
    if (defined $index && $index ne "") {
        $indexname = $index;
    }
    # Neither of these is guaranteed to have been set by the option parser
    # (the cachedir option is commented out in the option table), so give
    # them a defined value before they are compared against "".
    $indexname = "" unless defined $indexname;
    $cachedir = "" unless defined $cachedir;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Three-argument open, so that the file name is never interpreted
        # as an open mode or a pipe. The die is unconditional: it must not
        # depend on what gsprintf returns.
        unless (open (OUT, ">", $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    else {
        # The test above ignores case, but only the upper case names are
        # real filehandles.
        $out = uc($out);
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    unless (open (FAILLOG, ">>", $faillog)) {
        &gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog);
        die;
    }
    # From here on $faillog holds the name of the handle, not the file name
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    # Don't know why this didn't already happen, but now collection specific
    # classify and plugins directory also added to include path
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify");
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins");

    # Read in the collection configuration file.
    my ($collectcfg, $buildtype);
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }

    # For most settings below, a value given on the command line wins, then
    # the value from the collection configuration, then a built-in default.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    } else {
        $buildtype = "mg"; #mg is the default
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # Unlike the settings above, a maxnumeric value in the collection
    # configuration replaces the command line value.
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
        $maxnumeric = $collectcfg->{'maxnumeric'};
    }

    if ($maxnumeric < 4 || $maxnumeric > 512) {
        $maxnumeric = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        } else {
            $mode = "all"; # the default
        }
    }
    if (defined $collectcfg->{'index'} && $indexname eq "") {
        $indexname = $collectcfg->{'index'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }

    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
                                                   $incremental, "building",
                                                   $collectcfg);

    $gli = 0 unless defined $gli;

    # New argument to track whether build is incremental
    $incremental = 0 unless defined $incremental;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    # Modified so that the archivedir, if provided as an argument, is made
    # absolute if it isn't already
    if ($archivedir eq "")
    {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    else
    {
        $archivedir = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    # End Mod
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    if ($builddir eq "") {
        $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building");
        if ($incremental) {
            &gsprintf($out, "{buildcol.incremental_default_builddir}\n");
        }
    }
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/$collection/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir, $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "lucene") {
            $buildertype = "lucenebuilder";
        }
        elsif ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # The class name is only known at run time; a method call on the string
    # does the job without a string eval, and a failing constructor still
    # ends the script with its own message.
    $builder = $buildertype->new($collection,
                                 $realarchivedir, $realbuilddir, $verbosity,
                                 $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
                                 $remove_empty_classifications,
                                 $out, $no_text, $faillog, $gli);

    $builder->init();
    $builder->set_maxnumeric($maxnumeric);

    if (($buildertype eq "mgppbuilder") && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($sections_index_document_metadata ne "never") {
        $builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }

    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        # Defensive only: $mode has already been restricted to the four
        # values above.
        &gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        die;
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # When building in a cache directory, copy the result back over the
    # real build directory.
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
## main() ##
606
607
Note: See TracBrowser for help on using the repository browser.