source: trunk/gsdl/bin/script/buildcol.pl@ 11747

Last change on this file since 11747 was 10472, checked in by kjdon, 19 years ago

added new option sections_index_document_metadata never|always|unless_section_metadata_exists - at section level can index document level metadata

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.9 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl -- This program will build a particular collection
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# 11/04/03 Added usage datastructure - John Thompson
29
30package buildcol;
31
32BEGIN {
33 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
34 die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
35 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
36 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
37 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
38 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
39}
40
41use colcfg;
42use util;
43use scriptutil;
44use FileHandle;
45use gsprintf;
46use printusage;
47use parse2;
48
49use strict;
50no strict 'refs'; # allow filehandles to be variables and vice versa
51no strict 'subs'; # allow barewords (eg STDERR) as function arguments
52
# Allowed values for the -mode option.
my $mode_list = [
    { name => 'all',           desc => '{buildcol.mode.all}' },
    { name => 'compress_text', desc => '{buildcol.mode.compress_text}' },
    { name => 'build_index',   desc => '{buildcol.mode.build_index}' },
    { name => 'infodb',        desc => '{buildcol.mode.infodb}' },
];

# Allowed values for the -sections_index_document_metadata option.
my $sec_index_list = [
    { name => 'never',
      desc => '{buildcol.sections_index_document_metadata.never}' },
    { name => 'always',
      desc => '{buildcol.sections_index_document_metadata.always}' },
    { name => 'unless_section_metadata_exists',
      desc => '{buildcol.sections_index_document_metadata.unless_section_metadata_exists}' },
];

# Command line options understood by this script.  Each entry gives the
# option name, the resource-bundle key of its description, its type, and
# either a 'modegli' level or a 'hiddengli' marker.  Options without a
# 'deft' entry have no default declared here; main() fills those in from
# collect.cfg or from hard-coded fallbacks.
my $arguments = [
    { name     => 'remove_empty_classifications',
      desc     => '{buildcol.remove_empty_classifications}',
      type     => 'flag',
      reqd     => 'no',
      modegli  => '3' },
    { name      => 'archivedir',
      desc      => '{buildcol.archivedir}',
      type      => 'string',
      reqd      => 'no',
      hiddengli => 'yes' },
    { name      => 'builddir',
      desc      => '{buildcol.builddir}',
      type      => 'string',
      reqd      => 'no',
      hiddengli => 'yes' },
    # The cachedir option is currently disabled:
    # { name => 'cachedir',
    #   desc => '{buildcol.cachedir}',
    #   type => 'string',
    #   reqd => 'no' },
    { name      => 'collectdir',
      desc      => '{buildcol.collectdir}',
      type      => 'string',
      # parsearg left "" as default
      # deft => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
      reqd      => 'no',
      hiddengli => 'yes' },
    { name     => 'create_images',
      desc     => '{buildcol.create_images}',
      type     => 'flag',
      reqd     => 'no',
      modegli  => '4' },
    { name      => 'debug',
      desc      => '{buildcol.debug}',
      type      => 'flag',
      reqd      => 'no',
      hiddengli => 'yes' },
    { name     => 'faillog',
      desc     => '{buildcol.faillog}',
      type     => 'string',
      # parsearg left "" as default
      # deft => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
      reqd     => 'no',
      modegli  => '4' },
    { name     => 'index',
      desc     => '{buildcol.index}',
      type     => 'string',
      reqd     => 'no',
      modegli  => '3' },
    { name     => 'keepold',
      desc     => '{buildcol.keepold}',
      type     => 'flag',
      reqd     => 'no',
      modegli  => '3' },
    { name     => 'removeold',
      desc     => '{buildcol.removeold}',
      type     => 'flag',
      reqd     => 'no',
      modegli  => '3' },
    { name     => 'language',
      desc     => '{scripts.language}',
      type     => 'string',
      reqd     => 'no',
      modegli  => '4' },
    { name      => 'maxdocs',
      desc      => '{buildcol.maxdocs}',
      type      => 'int',
      reqd      => 'no',
      hiddengli => 'yes' },
    { name     => 'mode',
      desc     => '{buildcol.mode}',
      type     => 'enum',
      list     => $mode_list,
      # parsearg left "" as default
      # deft => "all",
      reqd     => 'no',
      modegli  => '4' },
    { name     => 'no_strip_html',
      desc     => '{buildcol.no_strip_html}',
      type     => 'flag',
      reqd     => 'no',
      modegli  => '4' },
    { name     => 'no_text',
      desc     => '{buildcol.no_text}',
      type     => 'flag',
      reqd     => 'no',
      modegli  => '3' },
    { name     => 'sections_index_document_metadata',
      desc     => '{buildcol.sections_index_document_metadata}',
      type     => 'enum',
      list     => $sec_index_list,
      reqd     => 'no',
      modegli  => '3' },
    { name      => 'out',
      desc      => '{buildcol.out}',
      type      => 'string',
      deft      => 'STDERR',
      reqd      => 'no',
      hiddengli => 'yes' },
    { name     => 'verbosity',
      desc     => '{buildcol.verbosity}',
      type     => 'int',
      # parsearg left "" as default
      # deft => "2",
      reqd     => 'no',
      modegli  => '4' },
    { name      => 'gli',
      desc      => '',
      type      => 'flag',
      reqd      => 'no',
      hiddengli => 'yes' },
    { name      => 'xml',
      desc      => '{scripts.xml}',
      type      => 'flag',
      reqd      => 'no',
      hiddengli => 'yes' },
];

# Top-level usage description handed to the PrintUsage routines.
my $options = {
    name => 'buildcol.pl',
    desc => '{buildcol.desc}',
    args => $arguments,
};
192
193
194# globals
195my $collection;
196my $configfilename;
197my $out;
198
# Local shorthand so the rest of this script can write gsprintf(...)
# instead of the fully qualified gsprintf::gsprintf(...).  All arguments
# are forwarded untouched and the callee's result is handed back as is.
sub gsprintf {
    return &gsprintf::gsprintf(@_);
}
203
204
205
206&main();
207
# main()
#
# Entry point of the script.  In order, it:
#   1. parses the command line against $arguments,
#   2. opens the output stream and the fail log,
#   3. locates the collection and reads its etc/collect.cfg,
#   4. fills in every option not given on the command line from
#      collect.cfg, falling back to built-in defaults,
#   5. loads the builder class (collection specific if one exists,
#      otherwise lucenebuilder / mgppbuilder / mgbuilder),
#   6. runs the build steps selected by -mode, and
#   7. copies a cached build back into place if a cache directory is used.
#
# Takes no parameters and returns nothing useful.  Dies (after printing
# usage or an error message) on bad arguments, a missing collection,
# unopenable output/fail-log files, a missing collect.cfg, a builder that
# cannot be loaded or constructed, or an unknown mode.
sub main
{
    # command line args
    my ($verbosity, $archivedir, $cachedir, $builddir, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold, $remove_empty_classifications,
        $create_images, $collectdir, $build, $type, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata);

    my $xml = 0;

    # Neither of these has a command line option of its own, so nothing
    # would otherwise define them before they are compared with "" below.
    $cachedir  = "";
    $indexname = "";

    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");
    # If more than one argument is left after parsing, the user supplied
    # too many arguments (only the collection name should remain).
    if ($intArgLeftinAfterParsing > 1) {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy each parsed option into the lexical variable of the same name.
    # NOTE(review): this is a string eval on the option name; a key with
    # no matching variable fails inside the eval and is silently ignored.
    foreach my $strVariable (keys %$hashParsingResult) {
        eval "\$$strVariable = \$hashParsingResult->{\"\$strVariable\"}";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8;
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # three-argument open so that odd characters in the file name
        # cannot be mistaken for an open mode
        if (!open (OUT, '>', $out)) {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = "buildcol::OUT";
        $close_out = 1;
    } else {
        # the check above is case-insensitive but the handle names are
        # not, so normalise e.g. "stderr" to "STDERR"
        $out = uc($out);
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &util::use_collection(@ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    if (!open (FAILLOG, '>>', $faillog)) {
        &gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog);
        die;
    }
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");

    # read the configuration file
    $configfilename = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "etc", "collect.cfg");
    my ($collectcfg, $buildtype);

    if (!-e $configfilename) {
        &gsprintf($out, "{common.cannot_find_cfg_file}\n", $configfilename);
        die;
    }

    $collectcfg = &colcfg::read_collect_cfg ($configfilename);

    # From here on: anything not given on the command line is taken from
    # collect.cfg, and failing that from a built-in default.
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    } else {
        $buildtype = "mg"; #mg is the default
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        } else {
            $mode = "all"; # the default
        }
    }
    # The command line option is called "index" and therefore lands in
    # $index, while the rest of this routine works with $indexname.
    # Carry the value across so that -index actually takes effect.
    if (defined $index && $index ne "") {
        $indexname = $index;
    }
    if (defined $collectcfg->{'index'} && $indexname eq "") {
        $indexname = $collectcfg->{'index'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }

    if (defined $collectcfg->{'create_images'} && $collectcfg->{'create_images'} =~ /^true$/i) {
        $create_images = 1;
    }
    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold) = &scriptutil::check_removeold_and_keepold($removeold, $keepold, "building", $collectcfg);

    $gli = 0 unless defined $gli;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # create default images if required
    if ($create_images) {
        my $collection_name = $collection;
        $collection_name = $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'}
            if defined $collectcfg->{'collectionmeta'}->{'collectionname'}->{'default'};
        &create_images ($collection_name);
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives") if $archivedir eq "";
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;
    $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building") if $builddir eq "";
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q..\E: the collection name is literal text, not a pattern
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir, $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the lucene, mgpp or mg builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "lucene") {
            $buildertype = "lucenebuilder";
        }
        elsif ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # Construct the builder through an ordinary class-method call on the
    # class name; the block eval keeps the original "catch and re-die with
    # the stringified error" behaviour.
    $builder = eval {
        $buildertype->new($collection,
                          $realarchivedir, $realbuilddir, $verbosity,
                          $maxdocs, $debug, $keepold, $remove_empty_classifications,
                          $out, $no_text, $faillog, $gli);
    };
    die "$@" if $@;

    $builder->init();

    if (($buildertype eq "mgppbuilder") && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($sections_index_document_metadata ne "never") {
        $builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }

    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        &gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        die;
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # when building in a cache directory, move the result back into place
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
497
# create_images($collection_name)
#
# Generates the default collection icons and points collect.cfg at them.
#
#   $collection_name - text to render on the icons (passed to the image
#                      script as its -text argument).
#
# Uses the file-level $collection, $configfilename and $out variables.
# Runs bin/script/gimp/title_icon-1.2.pl under GSDLHOME twice to produce
# "<collection>.gif" (with -size 1.5) and "<collection>sm.gif" in the
# collection's images directory, then rewrites collect.cfg so that its
# "collectionmeta iconcollection" and "collectionmeta iconcollectionsmall"
# lines refer to those files, appending either line if it was absent.
#
# Returns nothing.  If the image script is missing, or collect.cfg cannot
# be opened for reading or writing, a message is printed to $out and the
# routine returns early.
sub create_images {
    my ($collection_name) = @_;

    my $image_script = &util::filename_cat ($ENV{'GSDLHOME'}, "bin", "script", "gimp", "title_icon-1.2.pl");
    if (!-e $image_script) {
        &gsprintf($out, "{buildcol.no_image_script}", $image_script);
        &gsprintf($out, "{buildcol.no_default_images}\n\n");
        return;
    }

    my $imagedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "images");

    &util::mk_all_dir ($imagedir);

    # create the images
    # List-form system() hands each argument to the script verbatim, so a
    # collection name containing quotes, spaces or shell metacharacters can
    # neither break the command line nor be interpreted by a shell.
    system ($image_script, "-size", "1.5", "-image_dir", $imagedir,
            "-filename", "$collection.gif", "-text", $collection_name);
    system ($image_script, "-image_dir", $imagedir,
            "-filename", "${collection}sm.gif", "-text", $collection_name);

    # update the collect.cfg configuration file (this will need
    # to be changed when the config file format changes)
    my $icon_line   = "collectionmeta iconcollection _httpprefix_/collect/$collection/images/$collection.gif\n";
    my $iconsm_line = "collectionmeta iconcollectionsmall _httpprefix_/collect/$collection/images/${collection}sm.gif\n";

    my $cfg_in;
    if (!open ($cfg_in, '<', $configfilename)) {
        &gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
        &gsprintf($out, "{buildcol.unlinked_col_images}\n");
        return;
    }

    # Copy the file into memory, replacing any existing icon lines.
    my $file = "";
    my $found = 0;
    my $foundsm = 0;
    while (defined (my $line = <$cfg_in>)) {
        if ($line =~ /collectionmeta\s+iconcollection\s+/) {
            $line = $icon_line;
            $found = 1;
        } elsif ($line =~ /collectionmeta\s+iconcollectionsmall\s+/) {
            $line = $iconsm_line;
            $foundsm = 1;
        }
        $file .= $line;
    }
    close $cfg_in;

    # Append whichever icon lines the file did not already contain.
    $file .= $icon_line if !$found;
    $file .= $iconsm_line if !$foundsm;

    my $cfg_out;
    if (!open ($cfg_out, '>', $configfilename)) {
        &gsprintf($out, "{buildcol.cannot_open_cfg_file}\n", $configfilename);
        &gsprintf($out, "{buildcol.unlinked_col_images}\n");
        return;
    }
    print $cfg_out $file;
    close $cfg_out;
}
Note: See TracBrowser for help on using the repository browser.