source: trunk/gsdl/bin/script/buildcol.pl@ 12484

Last change on this file since 12484 was 12425, checked in by mdewsnip, 18 years ago

Fixed a bug where buildcol would try to continue when invalid arguments were passed in.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl -- This program will build a particular collection
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# 11/04/03 Added usage datastructure - John Thompson
29
30package buildcol;
31
# Compile-time setup: verify the Greenstone environment variables this
# script depends on, then put the Greenstone Perl library directories at
# the front of @INC so the "use" statements below can find their modules.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    # Each directory is unshifted in turn, so the last one listed ends up
    # first in @INC (classify, plugins, cpan, perllib).
    foreach my $libdir ("perllib", "perllib/cpan", "perllib/plugins", "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$libdir");
    }
}
40
41use colcfg;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf;
46use printusage;
47use parse2;
48
49use strict;
50no strict 'refs'; # allow filehandles to be variables and vice versa
51no strict 'subs'; # allow barewords (eg STDERR) as function arguments
52
# Permitted values for the -mode option; each entry pairs the value with
# the resource-bundle key of its description.
my $mode_list = [
    { 'name' => 'all',           'desc' => '{buildcol.mode.all}' },
    { 'name' => 'compress_text', 'desc' => '{buildcol.mode.compress_text}' },
    { 'name' => 'build_index',   'desc' => '{buildcol.mode.build_index}' },
    { 'name' => 'infodb',        'desc' => '{buildcol.mode.infodb}' },
];
62
# Permitted values for the -sections_index_document_metadata option; each
# entry pairs the value with the resource-bundle key of its description.
my $sec_index_list = [
    { 'name' => 'never',
      'desc' => '{buildcol.sections_index_document_metadata.never}' },
    { 'name' => 'always',
      'desc' => '{buildcol.sections_index_document_metadata.always}' },
    { 'name' => 'unless_section_metadata_exists',
      'desc' => '{buildcol.sections_index_document_metadata.unless_section_metadata_exists}' },
];
71
# Command-line argument specification handed to parse2::parse() and to the
# PrintUsage routines.  Each entry names one option; 'desc' is a
# resource-bundle key, 'type' is one of flag/string/int/enum, and the
# 'modegli' / 'hiddengli' keys are carried through unchanged for the GLI.
# Entry order is preserved from the original table.
my $arguments = [
    { 'name'      => 'remove_empty_classifications',
      'desc'      => '{buildcol.remove_empty_classifications}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'modegli'   => '3' },

    { 'name'      => 'archivedir',
      'desc'      => '{buildcol.archivedir}',
      'type'      => 'string',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'      => 'builddir',
      'desc'      => '{buildcol.builddir}',
      'type'      => 'string',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    # The cachedir option is disabled:
    # { 'name'    => 'cachedir',
    #   'desc'    => '{buildcol.cachedir}',
    #   'type'    => 'string',
    #   'reqd'    => 'no' },

    # collectdir: no 'deft' on purpose (parsearg left "" as default); the
    # former default was
    # &util::filename_cat ($ENV{'GSDLHOME'}, "collect").
    { 'name'      => 'collectdir',
      'desc'      => '{buildcol.collectdir}',
      'type'      => 'string',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'      => 'create_images',
      'desc'      => '{buildcol.create_images}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'modegli'   => '4' },

    { 'name'      => 'debug',
      'desc'      => '{buildcol.debug}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    # faillog: no 'deft' on purpose (parsearg left "" as default); the
    # former default was
    # &util::filename_cat("<collectdir>", "colname", "etc", "fail.log").
    { 'name'      => 'faillog',
      'desc'      => '{buildcol.faillog}',
      'type'      => 'string',
      'reqd'      => 'no',
      'modegli'   => '4' },

    { 'name'      => 'index',
      'desc'      => '{buildcol.index}',
      'type'      => 'string',
      'reqd'      => 'no',
      'modegli'   => '3' },

    { 'name'      => 'keepold',
      'desc'      => '{buildcol.keepold}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'modegli'   => '3' },

    { 'name'      => 'removeold',
      'desc'      => '{buildcol.removeold}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'modegli'   => '3' },

    { 'name'      => 'language',
      'desc'      => '{scripts.language}',
      'type'      => 'string',
      'reqd'      => 'no',
      'modegli'   => '4' },

    { 'name'      => 'maxdocs',
      'desc'      => '{buildcol.maxdocs}',
      'type'      => 'int',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'      => 'maxnumeric',
      'desc'      => '{buildcol.maxnumeric}',
      'type'      => 'int',
      'reqd'      => 'no',
      'deft'      => '4',
      'range'     => '4,512',
      'modegli'   => '3' },

    # mode: no 'deft' on purpose (parsearg left "" as default; was "all").
    { 'name'      => 'mode',
      'desc'      => '{buildcol.mode}',
      'type'      => 'enum',
      'list'      => $mode_list,
      'reqd'      => 'no',
      'modegli'   => '4' },

    { 'name'      => 'no_strip_html',
      'desc'      => '{buildcol.no_strip_html}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'modegli'   => '4' },

    { 'name'      => 'no_text',
      'desc'      => '{buildcol.no_text}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'modegli'   => '3' },

    { 'name'      => 'sections_index_document_metadata',
      'desc'      => '{buildcol.sections_index_document_metadata}',
      'type'      => 'enum',
      'list'      => $sec_index_list,
      'reqd'      => 'no',
      'modegli'   => '3' },

    { 'name'      => 'out',
      'desc'      => '{buildcol.out}',
      'type'      => 'string',
      'deft'      => 'STDERR',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    # verbosity: no 'deft' on purpose (parsearg left "" as default; was "2").
    { 'name'      => 'verbosity',
      'desc'      => '{buildcol.verbosity}',
      'type'      => 'int',
      'reqd'      => 'no',
      'modegli'   => '4' },

    { 'name'      => 'gli',
      'desc'      => '',
      'type'      => 'flag',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },

    { 'name'      => 'xml',
      'desc'      => '{scripts.xml}',
      'type'      => 'flag',
      'reqd'      => 'no',
      'hiddengli' => 'yes' },
];
195
# Top-level description of this script, passed to the PrintUsage routines.
my $options = {
    'name' => 'buildcol.pl',
    'desc' => '{buildcol.desc}',
    'args' => $arguments,
};
199
200
# File-level state shared between main() and create_images():
#   $collection      - collection name returned by util::use_collection
#   $configfilename  - path of the collection's etc/collect.cfg
#   $out             - name of the filehandle that progress output goes to
my ($collection, $configfilename, $out);
205
# Local shorthand: forwards every argument unchanged to gsprintf::gsprintf
# and hands back whatever that returns.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
210
211
212
&main();

# main -- entry point of buildcol.pl.
#
# Parses the command line against $arguments, merges in settings from the
# collection's etc/collect.cfg, selects a builder class (a collection
# specific "<collection>builder", or the lucene / mgpp / mg builder) and
# drives it through the phases selected by -mode.
#
# Takes no parameters; reads @ARGV and %ENV.  Sets the file-level
# $collection, $configfilename and $out.  Dies after printing the usage
# text when argument parsing does not leave exactly one argument (the
# collection name), and dies when the output file, fail log or
# configuration file cannot be opened/found or the mode is unknown.
sub main
{
    # command line args
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold, $remove_empty_classifications,
        $create_images, $collectdir, $build, $type, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata, $maxnumeric);

    my $xml = 0;

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # Exactly one argument (the collection name) must be left after
    # parsing.  More than one means the user supplied too many arguments;
    # the original comment states that parse2::parse returns 0 on error.
    # NOTE(review): this check runs before -xml is handled, so "-xml" with
    # no collection name appears to be rejected here -- confirm against
    # parse2::parse and the GLI's usage of -xml.
    if ($intArgLeftinAfterParsing != 1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical of the same name.  A table
    # of references replaces the former string eval; as before, result
    # keys that do not correspond to one of these variables are ignored.
    my %variable_for = (
        'verbosity'                        => \$verbosity,
        'archivedir'                       => \$archivedir,
        'cachedir'                         => \$cachedir,
        'builddir'                         => \$builddir,
        'maxdocs'                          => \$maxdocs,
        'debug'                            => \$debug,
        'mode'                             => \$mode,
        'indexname'                        => \$indexname,
        'removeold'                        => \$removeold,
        'keepold'                          => \$keepold,
        'remove_empty_classifications'     => \$remove_empty_classifications,
        'create_images'                    => \$create_images,
        'collectdir'                       => \$collectdir,
        'build'                            => \$build,
        'type'                             => \$type,
        'textindex'                        => \$textindex,
        'no_strip_html'                    => \$no_strip_html,
        'no_text'                          => \$no_text,
        'faillog'                          => \$faillog,
        'gli'                              => \$gli,
        'index'                            => \$index,
        'language'                         => \$language,
        'sections_index_document_metadata' => \$sections_index_document_metadata,
        'maxnumeric'                       => \$maxnumeric,
        'xml'                              => \$xml,
        'out'                              => \$out,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        next unless exists $variable_for{$strVariable};
        ${$variable_for{$strVariable}} = $hashParsingResult->{$strVariable};
    }

    # The option is called -index but the rest of this routine works with
    # $indexname; without this hand-over a -index given on the command
    # line never reached build_indexes().
    if ((!defined $indexname || $indexname eq "") && defined $index && $index =~ /\S/) {
        $indexname = $index;
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # Three-argument open so the file name is never parsed for a mode.
        if (!open (OUT, ">", $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        # $out holds a filehandle *name* from here on (see "no strict 'refs'").
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if (!defined $faillog || $faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    if (!open (FAILLOG, ">>", $faillog)) {
        &gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog);
        die;
    }
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # read the configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    my ($collectcfg, $buildtype);

    if (!-e $configfilename) {
        # Always stop here, whatever gsprintf returns.
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);

    # For the settings below a value given on the command line wins; the
    # collect.cfg value is the fallback, then the built-in default.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    } else {
        $buildtype = "mg"; #mg is the default
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    # $cachedir has no command line option at present, so it may be undefined.
    if (defined $collectcfg->{'cachedir'} && (!defined $cachedir || $cachedir eq "")) {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # Unlike the settings above, a maxnumeric in collect.cfg overrides the
    # command line value.
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
        $maxnumeric = $collectcfg->{'maxnumeric'};
    }

    # out-of-range values fall back to 4
    if ($maxnumeric < 4 || $maxnumeric > 512) {
        $maxnumeric = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        } else {
            $mode = "all"; # the default
        }
    }
    if (defined $collectcfg->{'index'} && (!defined $indexname || $indexname eq "")) {
        $indexname = $collectcfg->{'index'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }


    if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
        $create_images = 1;
    }
    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    # anything unrecognised is treated as "never"
    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, "building", $collectcfg);

    $gli = 0 unless defined $gli;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
        my $collection_name = $collection;
        $collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'}
            if defined $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'};
        &create_images ($collection_name);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/$collection/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir, $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the lucene, mgpp or mg builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "lucene") {
            $buildertype = "lucenebuilder";
        }
        elsif ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # Class-method call on the class name held in $buildertype; this
    # replaces a string eval around "new $buildertype(...)".  Any
    # exception from the constructor still ends the script.
    $builder = $buildertype->new($collection,
                                 $realarchivedir, $realbuilddir, $verbosity,
                                 $maxdocs, $debug, $keepold, $remove_empty_classifications,
                                 $out, $no_text, $faillog, $gli);

    $builder->init();
    $builder->set_maxnumeric($maxnumeric);

    if (($buildertype eq "mgppbuilder") && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($sections_index_document_metadata ne "never") {
        $builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }

    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        # Always stop here, whatever gsprintf returns.
        &gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        die;
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # When building took place in the cache directory, copy the result
    # back over the requested build directory.
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
515
# create_images -- generate the default collection icons and point
# collect.cfg at them.
#
# Parameter:
#   $collection_name - text passed to the image script via -text.
#
# Runs bin/script/gimp/title_icon-1.2.pl twice (large and small icon) to
# write <collection>.gif and <collection>sm.gif into the collection's
# images directory, then rewrites the "collectionmeta iconcollection" and
# "collectionmeta iconcollectionsmall" lines of $configfilename (adding
# them if absent).  Uses the file-level $out, $collection and
# $configfilename.  Problems are reported on $out and the routine returns
# early; it never dies.  Returns nothing useful.
sub create_images {
    my ($collection_name) = @_;

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon-1.2.pl");
    if (!-e $image_script) {
        &gsprintf($out, "{buildcol.no_image_script}", $image_script);
        &gsprintf($out, "{buildcol.no_default_images}\n\n");
        return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images
    # NOTE(review): these commands go through the shell with the collection
    # name interpolated, and their exit status is not checked; a name
    # containing a double quote or shell metacharacters would break (or
    # alter) the command.  Left unchanged because the list form of system()
    # may not launch the .pl script the same way on every platform -- confirm
    # before tightening.
    system ("$image_script -size 1.5 -image_dir \"$imagedir\" -filename $collection.gif -text \"$collection_name\"");
    system ("$image_script -image_dir \"$imagedir\" -filename ${collection}sm.gif -text \"$collection_name\"");

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    my $cfg_in;
    if (!open ($cfg_in, "<", $configfilename)) {
        &gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
        &gsprintf($out, "{buildcol.unlinked_col_images}\n");
        return;
    }

    my $icon_line   = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
    my $iconsm_line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";

    # Rebuild the file contents in memory, replacing any existing icon lines.
    my $file = "";
    my $found = 0; my $foundsm = 0;
    while (defined (my $line = <$cfg_in>)) {
        if ($line =~ /collectionmeta\s+iconcollection\s+/) {
            $line = $icon_line;
            $found = 1;
        } elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
            $line = $iconsm_line;
            $foundsm = 1;
        }
        $file .= $line;
    }
    close $cfg_in;

    $file .= $icon_line if !$found;
    $file .= $iconsm_line if !$foundsm;

    my $cfg_out;
    if (!open ($cfg_out, ">", $configfilename)) {
        &gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
        &gsprintf($out, "{buildcol.unlinked_col_images}\n");
        return;
    }

    # Buffered write errors may only surface at close, so check both; a
    # failure here means collect.cfg may be incomplete.
    my $written = print {$cfg_out} $file;
    my $closed  = close $cfg_out;
    if (!$written || !$closed) {
        &gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
        &gsprintf($out, "{buildcol.unlinked_col_images}\n");
    }
    return;
}
Note: See TracBrowser for help on using the repository browser.