root/main/trunk/greenstone2/bin/script/buildcol.pl @ 22743

Revision 22743, 19.5 KB (checked in by mdewsnip, 10 years ago)

Consistency changes to copyright statement.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# buildcol.pl --
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This program will build a particular collection.
30
31package buildcol;
32
# Compile-time setup of the module search path.
#
# Requires the GSDLHOME and GSDLOS environment variables and dies if either
# is missing. The core Greenstone perllib directories are pushed onto the
# front of @INC, followed by the perllib directories of every extension
# named in the colon-separated GSDLEXTS (rooted at GSDLHOME) and GSDL3EXTS
# (rooted at GSDL3SRCHOME) variables. Directories are unshifted in the order
# listed, so the last one listed ends up first in @INC.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    my @core_subdirs = ("perllib",
                        "perllib/cpan",
                        "perllib/cpan/XML/XPath",
                        "perllib/plugins",
                        "perllib/classify");
    foreach my $subdir (@core_subdirs) {
        unshift (@INC, "$ENV{'GSDLHOME'}/$subdir");
    }

    my @ext_subdirs = ("perllib",
                       "perllib/cpan",
                       "perllib/plugins",
                       "perllib/classify");

    # Each pair is (variable listing the extensions, variable naming the
    # installation that contains their "ext" directory).
    foreach my $ext_source (['GSDLEXTS', 'GSDLHOME'], ['GSDL3EXTS', 'GSDL3SRCHOME']) {
        my ($list_var, $home_var) = @$ext_source;
        next unless defined $ENV{$list_var};

        # Without the home directory the paths would be rooted at "/ext/...",
        # which is never what was intended, so skip them instead.
        if (!defined $ENV{$home_var}) {
            warn "$list_var is set but $home_var is not; ignoring these extensions\n";
            next;
        }

        foreach my $e (split(/:/, $ENV{$list_var})) {
            my $ext_prefix = "$ENV{$home_var}/ext/$e";
            foreach my $subdir (@ext_subdirs) {
                unshift (@INC, "$ext_prefix/$subdir");
            }
        }
    }
}
66
67use colcfg;
68use dbutil;
69use util;
70use scriptutil;
71use FileHandle;
72use gsprintf;
73use printusage;
74use parse2;
75
76use strict;
77no strict 'refs'; # allow filehandles to be variables and vice versa
78no strict 'subs'; # allow barewords (eg STDERR) as function arguments
79
80
# Allowed values for the -mode option. Each entry pairs the value with the
# resource-bundle key that holds its description.
my $mode_list = [
    map { +{ 'name' => $_, 'desc' => "{buildcol.mode.$_}" } }
        qw(all compress_text build_index infodb)
];
90
# Allowed values for the -sections_index_document_metadata option. Each entry
# pairs the value with the resource-bundle key that holds its description.
my $sec_index_list = [
    map { +{ 'name' => $_,
             'desc' => "{buildcol.sections_index_document_metadata.$_}" } }
        qw(never always unless_section_metadata_exists)
];
99
# Table of the command-line options this script accepts. It is handed to
# parse2::parse() in main() and, via $options, to the usage printers.
# 'desc' values are resource-bundle keys; 'modegli' and 'hiddengli' are
# passed through untouched for whatever consumes the usage output.
my $arguments = [
    {   name    => 'remove_empty_classifications',
        desc    => '{buildcol.remove_empty_classifications}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '2',
    },
    {   name      => 'archivedir',
        desc      => '{buildcol.archivedir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'builddir',
        desc      => '{buildcol.builddir}',
        type      => 'string',
        reqd      => 'no',
        hiddengli => 'yes',
    },
#   {   name => 'cachedir',
#       desc => '{buildcol.cachedir}',
#       type => 'string',
#       reqd => 'no',
#   },
    {   name      => 'collectdir',
        desc      => '{buildcol.collectdir}',
        type      => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'site',
        desc      => '{buildcol.site}',
        type      => 'string',
        deft      => '',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'debug',
        desc      => '{buildcol.debug}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'faillog',
        desc    => '{buildcol.faillog}',
        type    => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        reqd    => 'no',
        modegli => '3',
    },
    {   name    => 'index',
        desc    => '{buildcol.index}',
        type    => 'string',
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'incremental',
        desc      => '{buildcol.incremental}',
        type      => 'flag',
        hiddengli => 'yes',
    },
    {   name      => 'keepold',
        desc      => '{buildcol.keepold}',
        type      => 'flag',
        reqd      => 'no',
        #'modegli' => "3",
        hiddengli => 'yes',
    },
    {   name      => 'removeold',
        desc      => '{buildcol.removeold}',
        type      => 'flag',
        reqd      => 'no',
        #'modegli' => "3",
        hiddengli => 'yes',
    },
    {   name    => 'language',
        desc    => '{scripts.language}',
        type    => 'string',
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'maxdocs',
        desc      => '{buildcol.maxdocs}',
        type      => 'int',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'maxnumeric',
        desc    => '{buildcol.maxnumeric}',
        type    => 'int',
        reqd    => 'no',
        deft    => '4',
        range   => '4,512',
        modegli => '3',
    },
    {   name    => 'mode',
        desc    => '{buildcol.mode}',
        type    => 'enum',
        list    => $mode_list,
        # parsearg left "" as default
        #'deft' => "all",
        reqd    => 'no',
        modegli => '3',
    },
    {   name    => 'no_strip_html',
        desc    => '{buildcol.no_strip_html}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '3',
    },
    {   name    => 'no_text',
        desc    => '{buildcol.no_text}',
        type    => 'flag',
        reqd    => 'no',
        modegli => '2',
    },
    {   name    => 'sections_index_document_metadata',
        desc    => '{buildcol.sections_index_document_metadata}',
        type    => 'enum',
        list    => $sec_index_list,
        reqd    => 'no',
        modegli => '2',
    },
    {   name      => 'out',
        desc      => '{buildcol.out}',
        type      => 'string',
        deft      => 'STDERR',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name    => 'verbosity',
        desc    => '{buildcol.verbosity}',
        type    => 'int',
        # parsearg left "" as default
        #'deft' => "2",
        reqd    => 'no',
        modegli => '3',
    },
    {   name      => 'gli',
        desc      => '',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
    {   name      => 'xml',
        desc      => '{scripts.xml}',
        type      => 'flag',
        reqd      => 'no',
        hiddengli => 'yes',
    },
];
231
# Script description passed to the PrintUsage routines: the script name, the
# resource-bundle key of its description, and the option table above.
my $options = {
    name => 'buildcol.pl',
    desc => '{buildcol.desc}',
    args => $arguments,
};
235
236
# File-scoped state that main() fills in: the collection name, the path of
# the collection configuration file, and the output handle (name).
my ($collection, $configfilename, $out);

# Which Greenstone flavour is in use: "gs2" (the default) or "gs3".
my $gs_mode = "gs2";
244
## @method gsprintf()
#  Print a string after looking it up from a locale-dependent strings file.
#  This function is loosely based on the idea of resource bundles as used in
#  Java. It is a thin wrapper: every argument is forwarded unchanged to
#  gsprintf::gsprintf() and that call's return value is returned.
#
#  The calls in this file pass the arguments below.
#
#  @param  $handle The stream to print to (e.g. STDERR or the -out handle).
#  @param  $text   The string containing {resource.key} placeholders that
#                  should be replaced with their locale-dependent equivalents.
#  @param  ...     Optional further values (e.g. a file name) handed on to
#                  gsprintf::gsprintf().
#  @return Whatever gsprintf::gsprintf() returns.
#
#  Declared without a prototype: the former empty prototype "()" claimed the
#  sub takes no arguments, so any call not written with a leading "&" would
#  have been rejected at compile time.
#
sub gsprintf
{
    return &gsprintf::gsprintf(@_);
}
## gsprintf() ##
261
# Script entry point: run the build with an empty argument list.
main();
263
## @method main()
#
#  Parses and validates the command-line arguments, merges them with the
#  settings read from the collection configuration file, and then creates
#  the appropriate builder object to do the actual work.
#
#  Steps, in order:
#    - parse @ARGV against $arguments; print usage and die on a parse error,
#      on -h, or when there is not exactly one leftover argument (the
#      collection name);
#    - with -xml, print the XML usage description and return;
#    - open the -out stream and the fail log (the fail log is appended to);
#    - read the collection configuration and use it to fill in any option
#      that was not given on the command line;
#    - pick a builder class (a collection-specific custom builder if one
#      exists, otherwise lucenebuilder, mgppbuilder or mgbuilder) and run the
#      stages selected by -mode;
#    - when a cache directory is in use, copy the result back to the build
#      directory.
#
#  @note Added true incremental support - John Thompson, DL Consulting Ltd.
#  @note There were several bugs regarding using directories other than
#        "import" or "archives" during import and build quashed. - John
#        Thompson, DL Consulting Ltd.
#  @note The -incremental flag is not acted on here beyond validation by
#        scriptutil::check_removeold_and_keepold(); it is passed to the
#        builder's constructor.
#
#  Takes no parameters and returns nothing useful; failures are reported by
#  dying.
#
sub main
{
    # command line args
    my ($verbosity, $archivedir, $cachedir, $builddir, $site, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold,
        $incremental, $incremental_mode,
        $remove_empty_classifications,
        $collectdir, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata, $maxnumeric);

    my $xml = 0;
    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy each parsed option into the variable of the same name. An explicit
    # table of references is used instead of building assignment statements
    # with string eval; names without a matching variable are ignored.
    my %variable_for = (
        'verbosity'                        => \$verbosity,
        'archivedir'                       => \$archivedir,
        'cachedir'                         => \$cachedir,
        'builddir'                         => \$builddir,
        'site'                             => \$site,
        'maxdocs'                          => \$maxdocs,
        'debug'                            => \$debug,
        'mode'                             => \$mode,
        'indexname'                        => \$indexname,
        'removeold'                        => \$removeold,
        'keepold'                          => \$keepold,
        'incremental'                      => \$incremental,
        'incremental_mode'                 => \$incremental_mode,
        'remove_empty_classifications'     => \$remove_empty_classifications,
        'collectdir'                       => \$collectdir,
        'textindex'                        => \$textindex,
        'no_strip_html'                    => \$no_strip_html,
        'no_text'                          => \$no_text,
        'faillog'                          => \$faillog,
        'gli'                              => \$gli,
        'index'                            => \$index,
        'language'                         => \$language,
        'sections_index_document_metadata' => \$sections_index_document_metadata,
        'maxnumeric'                       => \$maxnumeric,
        'xml'                              => \$xml,
        'out'                              => \$out,
    );
    foreach my $strVariable (keys %$hashParsingResult)
    {
        my $target = $variable_for{$strVariable};
        next unless defined $target;
        ${$target} = $hashParsingResult->{$strVariable};
    }

    # The option is called -index but everything below works with
    # $indexname, so carry the command-line value across. Previously the
    # value given with -index was never used.
    if ((!defined $indexname || $indexname eq "") && defined $index) {
        $indexname = $index;
    }
    $indexname = "" unless defined $indexname;

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    $textindex = "";
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        # The handle must stay a package bareword: its name is what gets
        # stored in $out and handed to the builder.
        open (OUT, '>', $out) or do {
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        };
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    open (FAILLOG, '>>', $faillog) or do {
        &gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog);
        die;
    };
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    # Don't know why this didn't already happen, but now collection specific
    # classify and plugins directory also added to include path
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify");
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins");

    # Read in the collection configuration file.
    my ($collectcfg, $buildtype);
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }

    # From here on, a value from the configuration file is only used when
    # the corresponding command-line option was left at its empty default
    # (maxnumeric, debug and gli are the exceptions: the file always wins).
    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        } else {
            $verbosity = 2; # the default
        }
    }
    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    } elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    } else {
        $buildtype = "mg"; #mg is the default
    }
    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }
    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        } else {
            $maxdocs = -1; # the default
        }
    }
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
        $maxnumeric = $collectcfg->{'maxnumeric'};
    }

    # Fall back to 4 when the value is outside the range the option allows
    if ($maxnumeric < 4 || $maxnumeric > 512) {
        $maxnumeric = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }
    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        } else {
            $mode = "all"; # the default
        }
    }
    if (defined $collectcfg->{'index'} && $indexname eq "") {
        $indexname = $collectcfg->{'index'};
    }
    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }

    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
                                                   $incremental, "building",
                                                   $collectcfg);

    $gli = 0 unless defined $gli;

    # New argument to track whether build is incremental
    $incremental = 0 unless defined $incremental;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    # Modified so that the archivedir, if provided as an argument, is made
    # absolute if it isn't already
    if ($archivedir eq "")
    {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    else
    {
        $archivedir = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    # End Mod
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    if ($builddir eq "") {
        $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building");
        if ($incremental) {
            &gsprintf($out, "{buildcol.incremental_default_builddir}\n");
        }
    }
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/$collection/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);

    } else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir,  $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "custombuilder";
    } elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    } else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "lucene") {
            $buildertype = "lucenebuilder";
        }
        elsif ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # Class-method call on the package named in $buildertype; any exception
    # raised by the constructor propagates to the caller.
    $builder = $buildertype->new($collection,
                                 $realarchivedir, $realbuilddir, $verbosity,
                                 $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
                                 $remove_empty_classifications,
                                 $out, $no_text, $faillog, $gli);

    $builder->init();
    $builder->set_maxnumeric($maxnumeric);

    if (($buildertype eq "mgppbuilder") && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($sections_index_document_metadata ne "never") {
        $builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }

    # Run the build stages selected by -mode
    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    } elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    } elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    } elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    } else {
        &gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        die;
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # The two directories only differ when a cache directory was used
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
## main() ##
611
612
Note: See TracBrowser for help on using the browser.