source: trunk/gsdl/bin/script/buildcol.pl@ 12364

Last change on this file since 12364 was 12342, checked in by kjdon, 18 years ago

added modegli=3 to maxnumeric option

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl -- This program will build a particular collection
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# 11/04/03 Added usage datastructure - John Thompson
29
30package buildcol;
31
BEGIN {
    # Both environment variables must be set before anything else is loaded.
    foreach my $required ('GSDLHOME', 'GSDLOS') {
        die "$required not set\n" unless defined $ENV{$required};
    }

    # Put the Greenstone library directories at the front of the module
    # search path; the most specific directories are searched first.
    my $perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift (@INC,
             "$perllib/classify",
             "$perllib/plugins",
             "$perllib/cpan",
             $perllib);
}
40
41use colcfg;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf;
46use printusage;
47use parse2;
48
49use strict;
50no strict 'refs'; # allow filehandles to be variables and vice versa
51no strict 'subs'; # allow barewords (eg STDERR) as function arguments
52
# Values accepted by the -mode option.  Each entry pairs the value with the
# resource-bundle key of its description.
my $mode_list = [
    map { +{ 'name' => $_, 'desc' => "{buildcol.mode.$_}" } }
        qw(all compress_text build_index infodb)
];

# Values accepted by the -sections_index_document_metadata option, in the
# same name/description form.
my $sec_index_list = [
    map { +{ 'name' => $_,
             'desc' => "{buildcol.sections_index_document_metadata.$_}" } }
        qw(never always unless_section_metadata_exists)
];
71
# Command line options understood by buildcol.pl.  Every entry gives the
# option name, the resource-bundle key of its description, its type and
# whether it is required; 'deft', 'range' and 'list' are given where they
# apply, and each entry carries either a 'modegli' or a 'hiddengli' value.
my $arguments = [
    { name => 'remove_empty_classifications',
      desc => '{buildcol.remove_empty_classifications}',
      type => 'flag',   reqd => 'no', modegli => '3' },

    { name => 'archivedir',
      desc => '{buildcol.archivedir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },

    { name => 'builddir',
      desc => '{buildcol.builddir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },

    # Disabled option, kept for reference:
    # { name => 'cachedir',
    #   desc => '{buildcol.cachedir}',
    #   type => 'string', reqd => 'no' },

    # parsearg left "" as default for collectdir
    # (deft => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"))
    { name => 'collectdir',
      desc => '{buildcol.collectdir}',
      type => 'string', reqd => 'no', hiddengli => 'yes' },

    { name => 'create_images',
      desc => '{buildcol.create_images}',
      type => 'flag',   reqd => 'no', modegli => '4' },

    { name => 'debug',
      desc => '{buildcol.debug}',
      type => 'flag',   reqd => 'no', hiddengli => 'yes' },

    # parsearg left "" as default for faillog
    # (deft => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"))
    { name => 'faillog',
      desc => '{buildcol.faillog}',
      type => 'string', reqd => 'no', modegli => '4' },

    { name => 'index',
      desc => '{buildcol.index}',
      type => 'string', reqd => 'no', modegli => '3' },

    { name => 'keepold',
      desc => '{buildcol.keepold}',
      type => 'flag',   reqd => 'no', modegli => '3' },

    { name => 'removeold',
      desc => '{buildcol.removeold}',
      type => 'flag',   reqd => 'no', modegli => '3' },

    { name => 'language',
      desc => '{scripts.language}',
      type => 'string', reqd => 'no', modegli => '4' },

    { name => 'maxdocs',
      desc => '{buildcol.maxdocs}',
      type => 'int',    reqd => 'no', hiddengli => 'yes' },

    { name => 'maxnumeric',
      desc => '{buildcol.maxnumeric}',
      type => 'int',    reqd => 'no',
      deft => '4',      range => '4,512', modegli => '3' },

    # parsearg left "" as default for mode (deft => "all")
    { name => 'mode',
      desc => '{buildcol.mode}',
      type => 'enum',   list => $mode_list,
      reqd => 'no',     modegli => '4' },

    { name => 'no_strip_html',
      desc => '{buildcol.no_strip_html}',
      type => 'flag',   reqd => 'no', modegli => '4' },

    { name => 'no_text',
      desc => '{buildcol.no_text}',
      type => 'flag',   reqd => 'no', modegli => '3' },

    { name => 'sections_index_document_metadata',
      desc => '{buildcol.sections_index_document_metadata}',
      type => 'enum',   list => $sec_index_list,
      reqd => 'no',     modegli => '3' },

    { name => 'out',
      desc => '{buildcol.out}',
      type => 'string', deft => 'STDERR',
      reqd => 'no',     hiddengli => 'yes' },

    # parsearg left "" as default for verbosity (deft => "2")
    { name => 'verbosity',
      desc => '{buildcol.verbosity}',
      type => 'int',    reqd => 'no', modegli => '4' },

    { name => 'gli',
      desc => '',
      type => 'flag',   reqd => 'no', hiddengli => 'yes' },

    { name => 'xml',
      desc => '{scripts.xml}',
      type => 'flag',   reqd => 'no', hiddengli => 'yes' },
];

# Top-level usage description handed to the PrintUsage routines.
my $options = {
    name => 'buildcol.pl',
    desc => '{buildcol.desc}',
    args => $arguments,
};
199
200
# File-level state shared by main() and create_images():
#   $collection     - name of the collection being built
#   $configfilename - path of the collection's collect.cfg
#   $out            - name of the handle progress messages are written to
my ($collection, $configfilename, $out);
205
# Local shorthand so the rest of this file can call gsprintf() directly;
# all arguments are forwarded to gsprintf::gsprintf and its result is
# handed back to the caller.
sub gsprintf
{
    my @print_args = @_;
    return gsprintf::gsprintf(@print_args);
}
210
211
212
main();

# main -- script entry point.
#
# Parses the command line, fills in anything not given there from the
# collection's etc/collect.cfg, picks a builder class (a collection
# specific one if present, otherwise the lucene, mgpp or mg builder) and
# runs the build steps selected by -mode.  Dies on unrecoverable errors
# (bad arguments, unknown collection, unreadable config, unopenable logs).
sub main
{
    # command line args ($build and $type are not used below but are kept
    # because options are assigned to these variables by name, see the
    # eval loop further down)
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold, $remove_empty_classifications,
        $create_images, $collectdir, $build, $type, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata, $maxnumeric);

    my $xml = 0;

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    # At most one argument (the collection name) may be left over after
    # parsing; more than that means the user supplied too many arguments.
    # NOTE(review): the original comment says parse signals an error through
    # its return value -- confirm against parse2 whether that case needs
    # handling here as well.
    if ($intArgLeftinAfterParsing > 1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy every parsed option into the lexical variable of the same name.
    # The variable names come from the option table, so the eval string
    # must keep referring to $strVariable and $hashParsingResult.
    foreach my $strVariable (keys %$hashParsingResult)
    {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # The option is called -index, so the loop above stores it in $index,
    # while everything below works with $indexname.  Without this the
    # command line value would be silently ignored.
    $indexname = $index if defined $index && $index ne "";

    # There is currently no -cachedir option, so $cachedir can only come
    # from collect.cfg; start from "" so the comparisons below are defined.
    $cachedir = "" unless defined $cachedir;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open: the file name is never parsed for a mode
        unless (open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = "buildcol::OUT";
        $close_out = 1;
    } else {
        # the match above is case-insensitive but the value is used as a
        # handle name, which must be upper case
        $out = uc($out);
    }
    $out->autoflush(1);

    # get and check the collection.  Pass the collection explicitly rather
    # than flattening @ARGV, otherwise an empty @ARGV would make
    # $collectdir be taken as the collection name.
    my $collection_arg = @ARGV ? $ARGV[0] : "";
    if (($collection = &util::use_collection($collection_arg, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    unless (open (FAILLOG, '>>', $faillog)) {
        &gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog);
        die;
    }
    # from here on $faillog holds the handle name, not the file name
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # read the configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    my ($collectcfg, $buildtype);

    if (!-e $configfilename) {
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);

    # For the settings below a value given on the command line wins; the
    # collect.cfg value is used otherwise, then the built-in default.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    } else {
        $buildtype = "mg"; #mg is the default
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    # unlike the settings above, a maxnumeric value in collect.cfg replaces
    # whatever was parsed from the command line
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
        $maxnumeric = $collectcfg->{'maxnumeric'};
    }

    # out of range values fall back to the minimum
    if ($maxnumeric < 4 || $maxnumeric > 512) {
        $maxnumeric = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        } else {
            $mode = "all"; # the default
        }
    }
    if (defined $collectcfg->{'index'} && (!defined $indexname || $indexname eq "")) {
        $indexname = $collectcfg->{'index'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }

    if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
        $create_images = 1;
    }
    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, "building", $collectcfg);

    $gli = 0 unless defined $gli;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
        my $collection_name = $collection;
        $collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'}
            if defined $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'};
        &create_images ($collection_name);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q..\E: the collection name is matched literally, not as a pattern
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir, $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the lucene, mgpp or mg builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "lucene") {
            $buildertype = "lucenebuilder";
        }
        elsif ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # The class name is only known at run time; a method call on the name
    # does the same job as the former string eval of "new CLASS(...)".
    $builder = eval {
        $buildertype->new($collection,
                          $realarchivedir, $realbuilddir, $verbosity,
                          $maxdocs, $debug, $keepold, $remove_empty_classifications,
                          $out, $no_text, $faillog, $gli);
    };
    die "$@" if $@;

    $builder->init();
    $builder->set_maxnumeric($maxnumeric);

    if (($buildertype eq "mgppbuilder") && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($sections_index_document_metadata ne "never") {
        $builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }

    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        &gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        die;
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # when building in a cache directory, copy the result back
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
514
# create_images -- generate the default collection icons and link them
# into collect.cfg.
#
# Runs the gimp title_icon script twice (large and small icon) with
# $collection_name as the text, writing "<collection>.gif" and
# "<collection>sm.gif" into the collection's images directory, then
# rewrites the iconcollection / iconcollectionsmall collectionmeta lines
# of the config file (appending them if absent).
#
# Every failure path reports through gsprintf to $out and returns without
# dying; the exit status of the image script itself is not checked.
sub create_images {
    my ($collection_name) = @_;

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon-1.2.pl");
    if (!-e $image_script) {
        &gsprintf($out, "{buildcol.no_image_script}", $image_script);
        &gsprintf($out, "{buildcol.no_default_images}\n\n");
        return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images.  The list form of system() hands each argument to
    # the script as-is, so spaces in the script path and quotes or shell
    # metacharacters in the collection name cannot break or alter the command.
    system ($image_script, "-size", "1.5", "-image_dir", $imagedir,
            "-filename", "$collection.gif", "-text", $collection_name);
    system ($image_script, "-image_dir", $imagedir,
            "-filename", "${collection}sm.gif", "-text", $collection_name);

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    my $icon_line   = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
    my $iconsm_line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";

    my $cfg_in;
    if (!open ($cfg_in, '<', $configfilename)) {
        &gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
        &gsprintf($out, "{buildcol.unlinked_col_images}\n");
        return;
    }

    # read the whole file, replacing any existing icon lines on the way
    my $file = "";
    my $found = 0; my $foundsm = 0;
    while (defined (my $line = <$cfg_in>)) {
        if ($line =~ /collectionmeta\s+iconcollection\s+/) {
            $line = $icon_line;
            $found = 1;
        } elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
            $line = $iconsm_line;
            $foundsm = 1;
        }
        $file .= $line;
    }
    close $cfg_in;

    # add whichever icon line the file did not already have
    $file .= $icon_line if !$found;
    $file .= $iconsm_line if !$foundsm;

    my $cfg_out;
    if (!open ($cfg_out, '>', $configfilename)) {
        &gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
        &gsprintf($out, "{buildcol.unlinked_col_images}\n");
        return;
    }
    print $cfg_out $file;
    close $cfg_out;
}
Note: See TracBrowser for help on using the repository browser.