root/main/trunk/greenstone2/bin/script/buildcol.pl @ 21822

Revision 21822, 19.4 KB (checked in by ak19, 11 years ago)

Dr Bainbridge has fixed several perl files that depended on perl 5.8 to work and used to fail with Perl 5.10.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3## @file buildcol.pl
4# This program will build a particular collection.
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# This program is free software; you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation; either version 2 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program; if not, write to the Free Software
21# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22#
23# @author New Zealand Digital Library Project unless otherwise stated
24# @copy 1999 New Zealand Digital Library Project
25#
26package buildcol;
27
# Compile-time setup of the module search path.
#
# Requires GSDLHOME and GSDLOS to be set in the environment, then prepends the
# core Greenstone perllib directories, followed by the perllib directories of
# every extension named in the colon-separated GSDLEXTS (rooted at GSDLHOME)
# and GSDL3EXTS (rooted at GSDL3SRCHOME) variables, to @INC.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};

    # Order matters: each directory is unshifted in turn, so the last one
    # listed ends up first in @INC.
    foreach my $subdir ("perllib", "perllib/cpan", "perllib/cpan/XML/XPath",
                        "perllib/plugins", "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$subdir");
    }

    # Each entry pairs the variable listing extension names with the
    # installation root those extensions live under.
    my @extension_sources = (
        [ 'GSDLEXTS',  $ENV{'GSDLHOME'} ],
        [ 'GSDL3EXTS', $ENV{'GSDL3SRCHOME'} ],
    );

    foreach my $source (@extension_sources) {
        my ($list_var, $home) = @$source;

        next unless defined $ENV{$list_var};
        # Without a root directory the paths would degenerate to "/ext/..."
        # at the filesystem root, so skip them rather than pollute @INC.
        next unless defined $home;

        foreach my $e (split(/:/, $ENV{$list_var})) {
            my $ext_prefix = "$home/ext/$e";

            foreach my $subdir ("perllib", "perllib/cpan",
                                "perllib/plugins", "perllib/classify") {
                unshift (@INC, "$ext_prefix/$subdir");
            }
        }
    }
}
61
62use colcfg;
63use dbutil;
64use util;
65use scriptutil;
66use FileHandle;
67use gsprintf;
68use printusage;
69use parse2;
70
71use strict;
72no strict 'refs'; # allow filehandles to be variables and vice versa
73no strict 'subs'; # allow barewords (eg STDERR) as function arguments
74
75
# Permitted values for the -mode option. Each entry pairs the option value
# with the resource-bundle key that holds its description.
my $mode_list = [
    map { +{ 'name' => $_, 'desc' => "{buildcol.mode.$_}" } }
        qw(all compress_text build_index infodb)
];
85
# Permitted values for the -sections_index_document_metadata option, each
# paired with the resource-bundle key that holds its description.
my $sec_index_list = [
    map { +{ 'name' => $_,
             'desc' => "{buildcol.sections_index_document_metadata.$_}" } }
        qw(never always unless_section_metadata_exists)
];
94
# Description of every command line option accepted by this script. The list
# is handed to parse2::parse() for parsing and, via $options, to the
# PrintUsage routines for usage output. 'desc' values are resource-bundle
# keys rather than literal text.
my $arguments = [
    {   'name'    => 'remove_empty_classifications',
        'desc'    => '{buildcol.remove_empty_classifications}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '2',
    },
    {   'name'      => 'archivedir',
        'desc'      => '{buildcol.archivedir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'builddir',
        'desc'      => '{buildcol.builddir}',
        'type'      => 'string',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
#   {   'name' => "cachedir",
#       'desc' => "{buildcol.cachedir}",
#       'type' => "string",
#       'reqd' => "no" },
    {   'name'      => 'collectdir',
        'desc'      => '{buildcol.collectdir}',
        'type'      => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat ($ENV{'GSDLHOME'}, "collect"),
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'site',
        'desc'      => '{buildcol.site}',
        'type'      => 'string',
        'deft'      => '',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'debug',
        'desc'      => '{buildcol.debug}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'    => 'faillog',
        'desc'    => '{buildcol.faillog}',
        'type'    => 'string',
        # parsearg left "" as default
        #'deft' => &util::filename_cat("<collectdir>", "colname", "etc", "fail.log"),
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {   'name'    => 'index',
        'desc'    => '{buildcol.index}',
        'type'    => 'string',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {   'name'      => 'incremental',
        'desc'      => '{buildcol.incremental}',
        'type'      => 'flag',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'keepold',
        'desc'      => '{buildcol.keepold}',
        'type'      => 'flag',
        'reqd'      => 'no',
        #'modegli' => "3",
        'hiddengli' => 'yes',
    },
    {   'name'      => 'removeold',
        'desc'      => '{buildcol.removeold}',
        'type'      => 'flag',
        'reqd'      => 'no',
        #'modegli' => "3",
        'hiddengli' => 'yes',
    },
    {   'name'    => 'language',
        'desc'    => '{scripts.language}',
        'type'    => 'string',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {   'name'      => 'maxdocs',
        'desc'      => '{buildcol.maxdocs}',
        'type'      => 'int',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'    => 'maxnumeric',
        'desc'    => '{buildcol.maxnumeric}',
        'type'    => 'int',
        'reqd'    => 'no',
        'deft'    => '4',
        'range'   => '4,512',
        'modegli' => '3',
    },
    {   'name'    => 'mode',
        'desc'    => '{buildcol.mode}',
        'type'    => 'enum',
        'list'    => $mode_list,
        # parsearg left "" as default
        #'deft' => "all",
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {   'name'    => 'no_strip_html',
        'desc'    => '{buildcol.no_strip_html}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {   'name'    => 'no_text',
        'desc'    => '{buildcol.no_text}',
        'type'    => 'flag',
        'reqd'    => 'no',
        'modegli' => '2',
    },
    {   'name'    => 'sections_index_document_metadata',
        'desc'    => '{buildcol.sections_index_document_metadata}',
        'type'    => 'enum',
        'list'    => $sec_index_list,
        'reqd'    => 'no',
        'modegli' => '2',
    },
    {   'name'      => 'out',
        'desc'      => '{buildcol.out}',
        'type'      => 'string',
        'deft'      => 'STDERR',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'    => 'verbosity',
        'desc'    => '{buildcol.verbosity}',
        'type'    => 'int',
        # parsearg left "" as default
        #'deft' => "2",
        'reqd'    => 'no',
        'modegli' => '3',
    },
    {   'name'      => 'gli',
        'desc'      => '',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
    {   'name'      => 'xml',
        'desc'      => '{scripts.xml}',
        'type'      => 'flag',
        'reqd'      => 'no',
        'hiddengli' => 'yes',
    },
];
226
# Top-level description of this script (name, description key and argument
# list) as passed to the PrintUsage routines.
my $options = {
    'name' => 'buildcol.pl',
    'desc' => '{buildcol.desc}',
    'args' => $arguments,
};
230
231
# File-level state shared with main(): the collection name, the path of its
# configuration file, and the name of the output handle.
my ($collection, $configfilename, $out);

# Signifies which Greenstone flavour is in use: "gs2" (the default) or "gs3".
my $gs_mode = 'gs2';
239
## @method gsprintf()
#  Print a string after looking it up in a locale-dependent strings file.
#  This function is loosely based on the idea of resource bundles as used
#  in Java.
#
#  This is a thin wrapper: every argument is forwarded unchanged to
#  gsprintf::gsprintf(), and that routine's return value is passed back.
#  It is declared without a prototype because it does take arguments.
#
#  As called from this file the arguments are:
#  @param  $handle The output stream (e.g. STDERR, or the name held in $out).
#  @param  $text   The string containing GS keys, written as "{key}", that
#                  should be replaced with their locale-dependent
#                  equivalents.
#  @param  ...     Any further values to accompany the string (e.g. a file
#                  name for an error message).
#  @return Whatever gsprintf::gsprintf() returns.
#
sub gsprintf
{
    return &gsprintf::gsprintf(@_);
}
## gsprintf() ##
256
257&main();
258
## @method main()
#
#  Parses and validates the command line arguments, merges them with the
#  settings read from the collection configuration file, then creates the
#  appropriate builder object and runs the requested build mode.
#
#  Takes no parameters: all input comes from @ARGV, %ENV and the collection
#  configuration file. Dies (after printing usage or an error message) when
#  the arguments are invalid or an output file cannot be opened.
#
#  @note Added true incremental support - John Thompson, DL Consulting Ltd.
#        None of the incremental work is done here - the incremental
#        settings are simply passed on to the builder.
#  @note There were several bugs regarding using directories other than
#        "import" or "archives" during import and build quashed. - John
#        Thompson, DL Consulting Ltd.
#
sub main
{
    # command line args
    my ($verbosity, $archivedir, $cachedir, $builddir, $site, $maxdocs,
        $debug, $mode, $indexname, $removeold, $keepold,
        $incremental, $incremental_mode,
        $remove_empty_classifications,
        $collectdir, $textindex,
        $no_strip_html, $no_text, $faillog, $gli, $index, $language,
        $sections_index_document_metadata, $maxnumeric);

    my $xml = 0;
    my $hashParsingResult = {};
    # general options available to all plugins
    my $intArgLeftinAfterParsing = parse2::parse(\@ARGV,$arguments,$hashParsingResult,"allow_extra_options");

    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1) {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # Copy each parsed option into the variable of the same name. An explicit
    # lookup table is used instead of a string eval so that option names are
    # never executed as code; names without an entry are ignored.
    my %variable_for = (
        'archivedir'                       => \$archivedir,
        'builddir'                         => \$builddir,
        'cachedir'                         => \$cachedir,
        'collectdir'                       => \$collectdir,
        'debug'                            => \$debug,
        'faillog'                          => \$faillog,
        'gli'                              => \$gli,
        'incremental'                      => \$incremental,
        'index'                            => \$index,
        'indexname'                        => \$indexname,
        'keepold'                          => \$keepold,
        'language'                         => \$language,
        'maxdocs'                          => \$maxdocs,
        'maxnumeric'                       => \$maxnumeric,
        'mode'                             => \$mode,
        'no_strip_html'                    => \$no_strip_html,
        'no_text'                          => \$no_text,
        'out'                              => \$out,
        'remove_empty_classifications'     => \$remove_empty_classifications,
        'removeold'                        => \$removeold,
        'sections_index_document_metadata' => \$sections_index_document_metadata,
        'site'                             => \$site,
        'verbosity'                        => \$verbosity,
        'xml'                              => \$xml,
    );
    foreach my $strVariable (keys %$hashParsingResult) {
        my $target = $variable_for{$strVariable};
        next unless defined $target;
        $$target = $hashParsingResult->{$strVariable};
    }

    # The option is called "index" but the rest of this routine works with
    # $indexname, so carry the command line value across (otherwise the
    # option would be silently ignored).
    if (!defined $indexname || $indexname eq "") {
        $indexname = defined $index ? $index : "";
    }

    # If $language has been specified, load the appropriate resource bundle
    # (Otherwise, the default resource bundle will be loaded automatically)
    if ($language && $language =~ /\S/) {
        &gsprintf::load_language_specific_resource_bundle($language);
    }

    if ($xml) {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return;
    }

    if ($gli) { # the gli wants strings to be in UTF-8
        &gsprintf::output_strings_in_UTF8();
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, cos
    # -xml arg doesn't need a collection name
    # Or if the user specified -h, then we output the usage also
    if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/)) {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    $textindex = "";

    # $out holds the *name* of the output handle. Anything other than
    # STDERR/STDOUT is treated as a file name and opened as buildcol::OUT.
    my $close_out = 0;
    if ($out !~ /^(STDERR|STDOUT)$/i) {
        unless (open (OUT, '>', $out)) {
            # Report and die as two separate steps so the die cannot be
            # skipped if the message routine returns a false value.
            &gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            die;
        }
        $out = "buildcol::OUT";
        $close_out = 1;
    }
    $out->autoflush(1);

    # get and check the collection
    if (($collection = &colcfg::use_collection($site, @ARGV, $collectdir)) eq "") {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    if ($faillog eq "") {
        $faillog = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    unless (open (FAILLOG, '>>', $faillog)) {
        &gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillog);
        die;
    }
    $faillog = 'buildcol::FAILLOG';
    $faillog->autoflush(1);

    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib");
    # Don't know why this didn't already happen, but now collection specific
    # classify and plugins directory also added to include path
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/classify");
    unshift (@INC, "$ENV{'GSDLCOLLECTDIR'}/perllib/plugins");

    # Read in the collection configuration file.
    my ($collectcfg, $buildtype);
    ($configfilename, $gs_mode) = &colcfg::get_collect_cfg_name($out);
    $collectcfg = &colcfg::read_collection_cfg ($configfilename, $gs_mode);

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'})) {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }

    if ($verbosity !~ /\d+/) {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/) {
            $verbosity = $collectcfg->{'verbosity'};
        }
        else {
            $verbosity = 2; # the default
        }
    }

    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'}) {
        $buildtype = $collectcfg->{'buildtype'};
    }
    elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'}) {
        $buildtype = "mgpp";
    }
    else {
        $buildtype = "mg"; #mg is the default
    }

    if (defined $collectcfg->{'archivedir'} && $archivedir eq "") {
        $archivedir = $collectcfg->{'archivedir'};
    }
    if (defined $collectcfg->{'cachedir'} && $cachedir eq "") {
        $cachedir = $collectcfg->{'cachedir'};
    }
    if (defined $collectcfg->{'builddir'} && $builddir eq "") {
        $builddir = $collectcfg->{'builddir'};
    }

    if ($maxdocs !~ /\-?\d+/) {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/) {
            $maxdocs = $collectcfg->{'maxdocs'};
        }
        else {
            $maxdocs = -1; # the default
        }
    }

    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/) {
        $maxnumeric = $collectcfg->{'maxnumeric'};
    }

    # fall back to the default when outside the permitted range
    if ($maxnumeric < 4 || $maxnumeric > 512) {
        $maxnumeric = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i) {
        $debug = 1;
    }

    if ($mode !~ /^(all|compress_text|build_index|infodb)$/) {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/) {
            $mode = $collectcfg->{'mode'};
        }
        else {
            $mode = "all"; # the default
        }
    }

    if (defined $collectcfg->{'index'} && $indexname eq "") {
        $indexname = $collectcfg->{'index'};
    }

    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
        if ($collectcfg->{'no_text'} =~ /^true$/i) {
            $no_text = 1;
        }
    }
    if (defined $collectcfg->{'no_strip_html'} && $no_strip_html == 0) {
        if ($collectcfg->{'no_strip_html'} =~ /^true$/i) {
            $no_strip_html = 1;
        }
    }
    if (defined $collectcfg->{'remove_empty_classifications'} && $remove_empty_classifications == 0) {
        if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i) {
            $remove_empty_classifications = 1;
        }
    }

    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
        $textindex = $collectcfg->{'textcompress'};
    }
    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i) {
        $gli = 1;
    }

    if ($sections_index_document_metadata !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'}) {
        $sections_index_document_metadata = $collectcfg->{'sections_index_document_metadata'};
    }

    if ($sections_index_document_metadata !~ /^(never|always|unless_section_metadata_exists)$/) {
        $sections_index_document_metadata = "never";
    }

    ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($removeold, $keepold,
                                                   $incremental, "building",
                                                   $collectcfg);

    $gli = 0 unless defined $gli;

    # New argument to track whether build is incremental
    $incremental = 0 unless defined $incremental;

    print STDERR "<Build>\n" if $gli;

    #set the text index
    if (($buildtype eq "mgpp") || ($buildtype eq "lucene")) {
        if ($textindex eq "") {
            $textindex = "text";
        }
    }
    else {
        $textindex = "section:text";
    }

    # fill in the default archives and building directories if none
    # were supplied, turn all \ into / and remove trailing /

    my ($realarchivedir, $realbuilddir);
    # Modified so that the archivedir, if provided as an argument, is made
    # absolute if it isn't already
    if ($archivedir eq "") {
        $archivedir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    else {
        $archivedir = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $archivedir);
    }
    # End Mod
    $archivedir =~ s/[\\\/]+/\//g;
    $archivedir =~ s/\/$//;

    if ($builddir eq "") {
        $builddir = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building");
        if ($incremental) {
            &gsprintf($out, "{buildcol.incremental_default_builddir}\n");
        }
    }
    $builddir =~ s/[\\\/]+/\//g;
    $builddir =~ s/\/$//;

    # update the archive cache if needed
    if ($cachedir) {
        &gsprintf($out, "{buildcol.updating_archive_cache}\n")
            if ($verbosity >= 1);

        $cachedir =~ s/[\\\/]+$//;
        # \Q...\E so the collection name is matched literally, not as a regex
        $cachedir .= "/collect/$collection" unless
            $cachedir =~ /collect\/\Q$collection\E/;

        $realarchivedir = "$cachedir/archives";
        $realbuilddir = "$cachedir/building";
        &util::mk_all_dir ($realarchivedir);
        &util::mk_all_dir ($realbuilddir);
        &util::cachedir ($archivedir, $realarchivedir, $verbosity);
    }
    else {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }

    # build it in realbuilddir
    &util::mk_all_dir ($realbuilddir);

    my ($buildertype, $builderdir, $builder);
    # if a builder class has been created for this collection, use it
    # otherwise, use the mg or mgpp builder
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildertype = "custombuilder";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "custombuilder";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm") {
        $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildertype = "${collection}builder";
    }
    else {
        $builderdir = "$ENV{'GSDLHOME'}/perllib";
        if ($buildtype eq "lucene") {
            $buildertype = "lucenebuilder";
        }
        elsif ($buildtype eq "mgpp") {
            $buildertype = "mgppbuilder";
        }
        else {
            $buildertype = "mgbuilder";
        }
    }

    require "$builderdir/$buildertype.pm";

    # $buildertype holds the class name; call its constructor directly.
    # Any exception raised by the constructor propagates to the caller.
    $builder = $buildertype->new($collection,
                                 $realarchivedir, $realbuilddir, $verbosity,
                                 $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
                                 $remove_empty_classifications,
                                 $out, $no_text, $faillog, $gli);

    $builder->init();
    $builder->set_maxnumeric($maxnumeric);

    if (($buildertype eq "mgppbuilder") && $no_strip_html) {
        $builder->set_strip_html(0);
    }
    if ($sections_index_document_metadata ne "never") {
        $builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }

    if ($mode =~ /^all$/i) {
        $builder->compress_text($textindex);
        $builder->build_indexes($indexname);
        $builder->make_infodatabase();
        $builder->collect_specific();
    }
    elsif ($mode =~ /^compress_text$/i) {
        $builder->compress_text($textindex);
    }
    elsif ($mode =~ /^build_index$/i) {
        $builder->build_indexes($indexname);
    }
    elsif ($mode =~ /^infodb$/i) {
        $builder->make_infodatabase();
    }
    else {
        &gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        die;
    }

    $builder->make_auxiliary_files() if !$debug;
    $builder->deinit();

    # when building in a cache directory, copy the result back into place
    if (($realbuilddir ne $builddir) && !$debug) {
        &gsprintf($out, "{buildcol.copying_back_cached_build}\n")
            if ($verbosity >= 1);
        &util::rm_r ($builddir);
        &util::cp_r ($realbuilddir, $builddir);
    }

    close OUT if $close_out;
    close FAILLOG;

    print STDERR "</Build>\n" if $gli;
}
## main() ##
606
607
Note: See TracBrowser for help on using the browser.