[27304] | 1 | ###############################################################################
|
---|
| 2 | #
|
---|
| 3 | # buildcolutils.pm -- index and build the collection. The buildtime counterpart
|
---|
| 4 | # of inexport.pl
|
---|
| 5 | #
|
---|
| 6 | # A component of the Greenstone digital library software
|
---|
| 7 | # from the New Zealand Digital Library Project at the
|
---|
| 8 | # University of Waikato, New Zealand.
|
---|
| 9 | #
|
---|
| 10 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 11 | #
|
---|
| 12 | # This program is free software; you can redistribute it and/or modify
|
---|
| 13 | # it under the terms of the GNU General Public License as published by
|
---|
| 14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 15 | # (at your option) any later version.
|
---|
| 16 | #
|
---|
| 17 | # This program is distributed in the hope that it will be useful,
|
---|
| 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 20 | # GNU General Public License for more details.
|
---|
| 21 | #
|
---|
| 22 | # You should have received a copy of the GNU General Public License
|
---|
| 23 | # along with this program; if not, write to the Free Software
|
---|
| 24 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 25 | #
|
---|
| 26 | ###############################################################################
|
---|
| 27 |
|
---|
| 28 | package buildcolutils;
|
---|
| 29 |
|
---|
[28801] | 30 | #use strict;
|
---|
| 31 | #no strict 'refs';
|
---|
| 32 |
|
---|
[28087] | 33 | use File::Basename;
|
---|
| 34 |
|
---|
[27304] | 35 | use colcfg;
|
---|
| 36 | use dbutil;
|
---|
| 37 | use util;
|
---|
| 38 | use FileUtils;
|
---|
| 39 | use scriptutil;
|
---|
| 40 | use gsprintf;
|
---|
| 41 | use printusage;
|
---|
| 42 | use parse2;
|
---|
| 43 |
|
---|
| 44 | ## @method new()
|
---|
| 45 | #
|
---|
| 46 | # Parses up and validates the arguments to the build process before creating
|
---|
| 47 | # the appropriate build process to do the actual work
|
---|
| 48 | #
|
---|
| 49 | # @note Added true incremental support - John Thompson, DL Consulting Ltd.
|
---|
| 50 | # @note There were several bugs regarding using directories other than
|
---|
| 51 | # "import" or "archives" during import and build quashed. - John
|
---|
| 52 | # Thompson, DL Consulting Ltd.
|
---|
| 53 | #
|
---|
| 54 | # @param $incremental If true indicates this build should not regenerate all
|
---|
| 55 | # the index and metadata files, and should instead just
|
---|
| 56 | # append the information found in the archives directory
|
---|
| 57 | # to the existing files. If this requires some complex
|
---|
| 58 | # work so as to correctly insert into a classifier so be
|
---|
| 59 | # it. Of course none of this is done here - instead the
|
---|
| 60 | # incremental argument is passed to the document
|
---|
| 61 | # processor.
|
---|
| 62 | #
|
---|
## @method new()
#
# Constructor: parses the build-script arguments into a fresh option hash,
# handles the usage-only invocations (-xml, -h), sets up the output handle
# and records the collection name.
#
# @param $argv    array reference holding the script arguments; after
#                 parsing, its first remaining element is taken as the
#                 collection name and shifted off
# @param $options option description structure; its 'args' entry is passed
#                 to parse2::parse() and the structure itself to the
#                 PrintUsage routines
# @param $opt_listall_options accepted but not used by this constructor
#
# @return the option hash blessed into $class. With -xml the XML usage is
#         printed and the object is returned before the output handle and
#         the collection name have been set up.
#
# Dies after printing usage when parsing fails (-1), when the first remaining
# argument looks like -h, or when the number of remaining arguments is not
# exactly one. Also dies when the requested output file cannot be opened.
#
sub new
{
    my $class = shift(@_);
    my ($argv, $options, $opt_listall_options) = @_;

    my $self = {'builddir' => undef,
                'buildtype' => undef,
                'close_faillog' => 0,
                'close_out' => 0,
                'mode' => '',
                'orthogonalbuildtypes' => undef,
                'realbuilddir' => undef,
                'textindex' => '',
                'xml' => 0
               };

    # general options available to all plugins
    my $arguments = $options->{'args'};
    my $intArgLeftinAfterParsing = &parse2::parse($argv, $arguments, $self, "allow_extra_options");
    # If parse returns -1 then something has gone wrong
    if ($intArgLeftinAfterParsing == -1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}", 1);
        print STDERR "Something went wrong during parsing the arguments. Scroll up for details.\n";
        die "\n";
    }

    # If a language has been specified, load the appropriate resource bundle
    # (otherwise the default resource bundle will be loaded automatically)
    if ($self->{'language'} && $self->{'language'} =~ /\S/)
    {
        &gsprintf::load_language_specific_resource_bundle($self->{'language'});
    }

    # -xml only asks for a machine readable description of the options, so
    # print it and stop before anything collection specific is required
    if ($self->{'xml'})
    {
        &PrintUsage::print_xml_usage($options);
        print "\n";
        return bless($self, $class);
    }

    # the gli wants strings to be in UTF-8
    # (read the parsed option from $self: there is no lexical $gli in scope)
    if ($self->{'gli'})
    {
        &gsprintf::output_strings_in_UTF8();
    }

    # If the user specified -h, then we output the usage
    if (@$argv && $argv->[0] =~ /^\-+h/)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}");
        die "\n";
    }

    # now check that we had exactly one leftover arg, which should be
    # the collection name. We don't want to do this earlier, because the
    # -xml arg doesn't need a collection name
    if ($intArgLeftinAfterParsing != 1)
    {
        &PrintUsage::print_txt_usage($options, "{buildcol.params}", 1);
        print STDERR "There should be one argument left after parsing the script args: the collection name.\n";
        die "\n";
    }

    my $out = $self->{'out'};
    if (defined $out && $out =~ /^(STDERR|STDOUT)$/i)
    {
        # the match is case insensitive but the handle name is not, so use
        # the canonical upper-case name when calling methods on it
        $out = uc($out);
    }
    else
    {
        # three-argument open: the file name is never interpreted as a mode
        if (!open(OUT, '>', $out))
        {
            &gsprintf::gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
            # die regardless of what the message printer returned
            die;
        }
        $out = "buildcolutils::OUT";
        $self->{'close_out'} = 1;
    }
    $out->autoflush(1);
    $self->{'out'} = $out;

    # @ARGV should be only one item, the name of the collection
    $self->{'collection'} = shift(@{$argv});

    return bless($self, $class);
}
|
---|
| 145 | # new()
|
---|
| 146 |
|
---|
| 147 | # newCGI()?
|
---|
| 148 |
|
---|
| 149 | # @function get_collection
|
---|
| 150 | #
|
---|
# @function get_collection
#
# Accessor for the collection name stored under the 'collection' key of the
# object (undef when no name has been stored).
#
sub get_collection
{
    my ($self) = @_;

    my $collection_name = $self->{'collection'};
    return $collection_name;
}
|
---|
| 156 | # get_collection()
|
---|
| 157 |
|
---|
| 158 | # @function read_collection_cfg
|
---|
| 159 | #
|
---|
# @function read_collection_cfg
#
# Selects the collection, works out the Greenstone version, extends @INC with
# the collection's perllib directories, opens the fail log for appending and
# reads the collection configuration file.
#
# @param $collection name of the collection; passed to colcfg::use_collection()
#                    together with the object's 'site' and 'collectdir'
# @param $options    accepted for interface compatibility; not used here
#
# @return the list ($config_filename, $collect_cfg) as obtained from
#         colcfg::get_collect_cfg_name() and colcfg::read_collection_cfg()
#
# Sets 'gs_version', 'faillog' (handle name), 'faillogname' (file name) and
# 'close_faillog' on the object. Dies when the collection cannot be selected
# or the fail log cannot be opened.
#
sub read_collection_cfg
{
    my $self = shift(@_);
    my ($collection, $options) = @_;

    my $collectdir = $self->{'collectdir'};
    my $site = $self->{'site'};
    my $out = $self->{'out'};

    # get and check the collection
    $collection = &colcfg::use_collection($site, $collection, $collectdir);
    if ($collection eq "")
    {
        die "\n";
    }

    # set gs_version 2/3: a non-empty site means Greenstone 3
    $self->{'gs_version'} = ((defined $site) && ($site ne "")) ? "3" : "2";

    # add collection's perllib dir into include path in case we have collection
    # specific modules
    foreach my $subdirs ([], ['classify'], ['plugins'])
    {
        &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', @$subdirs));
    }

    # check that we can open the faillog
    my $faillogname = $self->{'faillog'};
    if (!defined $faillogname || $faillogname eq "")
    {
        $faillogname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
    }
    # note that we're appending to the faillog here (import.pl clears it each time)
    # this could potentially create a situation where the faillog keeps being added
    # to over multiple builds (if the import process is being skipped)
    if (!open(FAILLOG, '>>', $faillogname))
    {
        &gsprintf::gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillogname);
        # die regardless of what the message printer returned
        die;
    }
    my $faillog = 'buildcolutils::FAILLOG';
    $faillog->autoflush(1);
    $self->{'faillog'} = $faillog;
    # keep the file name (not the handle name) under 'faillogname'
    $self->{'faillogname'} = $faillogname;
    $self->{'close_faillog'} = 1;

    # Read in the collection configuration file.
    my $gs_mode = "gs" . $self->{'gs_version'}; #gs2 or gs3
    my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
    my $collect_cfg = &colcfg::read_collection_cfg($config_filename, $gs_mode);

    return ($config_filename, $collect_cfg);
}
|
---|
| 213 | # read_collection_cfg()
|
---|
| 214 |
|
---|
| 215 | # @function set_collection_options
|
---|
| 216 | # This function copies across values for arguments from the collection
|
---|
| 217 | # configuration file if they are not already provided by the user, then
|
---|
| 218 | # sets reasonable defaults for any required arguments that remains without
|
---|
| 219 | # a value.
|
---|
# @function set_collection_options
#
# Copies option values from the collection configuration onto the object
# where the user did not supply them, then fills in defaults for required
# options that are still without a value.
#
# @param $collectcfg hash reference with the collection configuration; its
#                    'infodbtype' entry is filled in with the default when
#                    it is not defined
#
# Options resolved here: infodbtype, verbosity, buildtype, textindex,
# orthogonalbuildtypes, archivedir, builddir, cachedir, maxdocs, maxnumeric,
# debug, mode, indexname, indexlevel, no_text, no_strip_html,
# store_metadata_coverage, remove_empty_classifications, gli,
# sections_index_document_metadata, sections_sort_on_document_metadata,
# removeold, keepold, incremental and incremental_mode.
#
sub set_collection_options
{
    my $self = shift @_;
    my ($collectcfg) = @_;

    # an option counts as "not supplied" when it is undefined or empty
    my $is_unset = sub { return (!defined $_[0] || $_[0] eq ""); };

    # If the infodbtype value wasn't defined in the collect.cfg file, use the default
    if (!defined($collectcfg->{'infodbtype'}))
    {
        $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
    }
    # - kept on the object so collectcfg doesn't have to be passed around as well
    $self->{'infodbtype'} = $collectcfg->{'infodbtype'};

    if ($self->{'verbosity'} !~ /\d+/)
    {
        if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/)
        {
            $self->{'verbosity'} = $collectcfg->{'verbosity'};
        }
        else
        {
            $self->{'verbosity'} = 2; # the default
        }
    }

    # we use searchtype for determining buildtype, but for old versions, use buildtype
    if (defined $collectcfg->{'buildtype'})
    {
        $self->{'buildtype'} = $collectcfg->{'buildtype'};
    }
    elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'})
    {
        $self->{'buildtype'} = "mgpp";
    }
    else
    {
        $self->{'buildtype'} = "mg"; #mg is the default
    }

    if ($self->{'buildtype'} eq "mgpp" && defined $collectcfg->{'textcompress'})
    {
        $self->{'textindex'} = $collectcfg->{'textcompress'};
    }

    # is it okay to always clobber or possible remain undefined? [jmt12]
    if (defined $collectcfg->{'orthogonalbuildtypes'})
    {
        $self->{'orthogonalbuildtypes'} = $collectcfg->{'orthogonalbuildtypes'};
    }

    # - resolve (and possibly set to default) archivedir
    if (defined $collectcfg->{'archivedir'} && $is_unset->($self->{'archivedir'}))
    {
        $self->{'archivedir'} = $collectcfg->{'archivedir'};
    }
    if ($is_unset->($self->{'archivedir'}))
    {
        $self->{'archivedir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
    }
    else
    {
        # an archivedir that was provided is made absolute if it isn't already
        $self->{'archivedir'} = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $self->{'archivedir'});
    }
    $self->{'archivedir'} = &FileUtils::sanitizePath($self->{'archivedir'});

    # - resolve (and possibly set to default) builddir
    if (defined $collectcfg->{'builddir'} && $is_unset->($self->{'builddir'}))
    {
        $self->{'builddir'} = $collectcfg->{'builddir'};
    }
    if ($is_unset->($self->{'builddir'}))
    {
        $self->{'builddir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'building');
        # read the flag and the output handle from the object: the lexical
        # $incremental is only declared further down and there is no $out here
        if ($self->{'incremental'})
        {
            &gsprintf::gsprintf($self->{'out'}, "{buildcol.incremental_default_builddir}\n");
        }
    }
    else
    {
        # make absolute if not already
        $self->{'builddir'} = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $self->{'builddir'});
    }
    $self->{'builddir'} = &FileUtils::sanitizePath($self->{'builddir'});

    if (defined $collectcfg->{'cachedir'} && $is_unset->($self->{'cachedir'}))
    {
        $self->{'cachedir'} = $collectcfg->{'cachedir'};
    }

    if ($self->{'maxdocs'} !~ /\-?\d+/)
    {
        if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/)
        {
            $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
        }
        else
        {
            $self->{'maxdocs'} = -1; # the default
        }
    }

    # always clobbers? [jmt12]
    if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/)
    {
        $self->{'maxnumeric'} = $collectcfg->{'maxnumeric'};
    }
    if ($self->{'maxnumeric'} < 4 || $self->{'maxnumeric'} > 512)
    {
        $self->{'maxnumeric'} = 4;
    }

    if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i)
    {
        $self->{'debug'} = 1;
    }

    if ($self->{'mode'} !~ /^(all|compress_text|build_index|infodb|extra)$/)
    {
        if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb|extra)$/)
        {
            $self->{'mode'} = $collectcfg->{'mode'};
        }
        else
        {
            $self->{'mode'} = "all"; # the default
        }
    }

    # Presumably 'index' from the collect.cfg still works [jmt12]
    if (defined $collectcfg->{'index'} && $is_unset->($self->{'indexname'}))
    {
        $self->{'indexname'} = $collectcfg->{'index'};
    }
    # - 'index' from the command line doesn't make it through parsing so I
    #   renamed this option 'indexname' [jmt12]
    if (defined $collectcfg->{'indexname'} && $is_unset->($self->{'indexname'}))
    {
        $self->{'indexname'} = $collectcfg->{'indexname'};
    }
    # - we may also define the index level to build now [jmt12]
    if (defined $collectcfg->{'indexlevel'} && $is_unset->($self->{'indexlevel'}))
    {
        $self->{'indexlevel'} = $collectcfg->{'indexlevel'};
    }

    # boolean switches: a "true" in the configuration turns the switch on
    # unless it is already on
    foreach my $flag ('no_text', 'no_strip_html', 'store_metadata_coverage', 'remove_empty_classifications')
    {
        if (defined $collectcfg->{$flag} && $self->{$flag} == 0 && $collectcfg->{$flag} =~ /^true$/i)
        {
            $self->{$flag} = 1;
        }
    }

    if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i)
    {
        $self->{'gli'} = 1;
    }
    if (!defined $self->{'gli'})
    {
        $self->{'gli'} = 0;
    }

    # both options take the configuration value when blank and fall back to
    # 'never' when the resulting value is not one of the recognised words
    foreach my $option ('sections_index_document_metadata', 'sections_sort_on_document_metadata')
    {
        if ($self->{$option} !~ /\S/ && defined $collectcfg->{$option})
        {
            $self->{$option} = $collectcfg->{$option};
        }
        if ($self->{$option} !~ /^(never|always|unless_section_metadata_exists)$/)
        {
            $self->{$option} = 'never';
        }
    }

    my ($removeold, $keepold, $incremental, $incremental_mode)
        = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                   $self->{'incremental'}, 'building',
                                                   $collectcfg);
    $self->{'removeold'} = $removeold;
    $self->{'keepold'} = $keepold;
    $self->{'incremental'} = $incremental;
    $self->{'incremental_mode'} = $incremental_mode;

    # New argument to track whether build is incremental
    if (!defined $self->{'incremental'})
    {
        $self->{'incremental'} = 0;
    }

    #set the text index
    if (($self->{'buildtype'} eq 'mgpp') || ($self->{'buildtype'} eq 'lucene') || ($self->{'buildtype'} eq 'solr'))
    {
        if ($self->{'textindex'} eq '')
        {
            $self->{'textindex'} = 'text';
        }
    }
    else
    {
        $self->{'textindex'} = 'section:text';
    }
}
|
---|
| 460 | # set_collection_options()
|
---|
| 461 |
|
---|
| 462 | # @function prepare_builders
|
---|
| 463 | #
|
---|
# @function prepare_builders
#
# Works out where to build, which builder classes to use, constructs one
# builder object per class and runs the init phase on each of them.
#
# @param $config_filename accepted for interface compatibility; not used here
# @param $collectcfg      accepted for interface compatibility; not used here
#
# @return reference to the array of initialised builder objects. The array is
#         empty when the mode is "extra" and no orthogonal build types are
#         configured.
#
# Stores 'realarchivedir' and 'realbuilddir' on the object. With a cachedir
# the archives are synchronised into the cache and the build happens there.
# Dies when a builder module cannot be loaded or its constructor fails.
#
sub prepare_builders
{
    my $self = shift @_;
    my ($config_filename, $collectcfg) = @_;

    my $archivedir = $self->{'archivedir'};
    my $builddir = $self->{'builddir'};
    my $buildtype = $self->{'buildtype'};
    my $cachedir = $self->{'cachedir'};
    my $collection = $self->{'collection'};
    my $debug = $self->{'debug'};
    my $faillog = $self->{'faillog'};
    my $gli = $self->{'gli'};
    my $incremental = $self->{'incremental'};
    my $incremental_mode = $self->{'incremental_mode'};
    my $keepold = $self->{'keepold'};
    my $maxdocs = $self->{'maxdocs'};
    my $maxnumeric = $self->{'maxnumeric'};
    my $no_strip_html = $self->{'no_strip_html'};
    my $no_text = $self->{'no_text'};
    my $orthogonalbuildtypes = $self->{'orthogonalbuildtypes'};
    my $out = $self->{'out'};
    my $remove_empty_classifications = $self->{'remove_empty_classifications'};
    my $sections_index_document_metadata = $self->{'sections_index_document_metadata'};
    my $sections_sort_on_document_metadata = $self->{'sections_sort_on_document_metadata'};
    my $site = $self->{'site'};
    my $store_metadata_coverage = $self->{'store_metadata_coverage'};
    my $verbosity = $self->{'verbosity'};

    if ($gli)
    {
        print STDERR "<Build>\n";
    }

    my ($realarchivedir, $realbuilddir);
    # update the archive cache if needed
    if ($cachedir)
    {
        if ($verbosity >= 1)
        {
            &gsprintf::gsprintf($out, "{buildcol.updating_archive_cache}\n");
        }

        $cachedir =~ s/[\\\/]+$//;
        # quote the collection name so it is matched literally
        if ($cachedir !~ /collect[\/\\]\Q$collection\E/)
        {
            $cachedir = &FileUtils::filenameConcatenate($cachedir, 'collect', $collection);
        }

        $realarchivedir = &FileUtils::filenameConcatenate($cachedir, 'archives');
        $realbuilddir = &FileUtils::filenameConcatenate($cachedir, 'building');
        &FileUtils::makeAllDirectories($realarchivedir);
        &FileUtils::makeAllDirectories($realbuilddir);
        &FileUtils::synchronizeDirectory($archivedir, $realarchivedir, $verbosity);
    }
    else
    {
        $realarchivedir = $archivedir;
        $realbuilddir = $builddir;
    }
    $self->{'realarchivedir'} = $realarchivedir;
    $self->{'realbuilddir'} = $realbuilddir;

    # build it in realbuilddir
    &FileUtils::makeAllDirectories($realbuilddir);

    # if a builder class has been created for this collection, use it
    # (the first candidate whose module file exists wins)
    my $collect_home = $ENV{'GSDLCOLLECTDIR'};
    my @custom_candidates = (
        ["$collect_home/custom/${collection}/perllib", "custombuilder"],
        ["$collect_home/perllib", "custombuilder"],
        ["$collect_home/perllib", $collection . 'builder'],
    );
    my ($buildertype, $builderdir);
    foreach my $candidate (@custom_candidates)
    {
        my ($candidate_dir, $candidate_type) = @$candidate;
        if (-e "$candidate_dir/$candidate_type.pm")
        {
            ($builderdir, $buildertype) = ($candidate_dir, $candidate_type);
            last;
        }
    }
    # otherwise, use the builder that goes with the build type
    if (!defined $buildertype)
    {
        $builderdir = undef;
        if (defined $buildtype && $buildtype ne '')
        {
            # caters for extension-based build types, such as 'solr'
            $buildertype = $buildtype . 'builder';
        }
        else
        {
            # Default to mgpp
            $buildertype = 'mgppbuilder';
        }
    }

    # Set up the list of the main builder, followed by any extension
    # specific builders (these are run after the main builder)
    my @builderdir_list = ($builderdir);
    my @buildertype_list = ($buildertype);

    if ($self->{'mode'} eq "extra")
    {
        # knock out the main builder type, by resetting the lists to be empty
        @builderdir_list = ();
        @buildertype_list = ();
    }

    if (defined $orthogonalbuildtypes)
    {
        foreach my $obt (@$orthogonalbuildtypes)
        {
            push(@builderdir_list, undef); # rely on @INC to find it
            push(@buildertype_list, $obt . "Builder");
        }
    }

    # Load each builder module and construct the builder object
    my @builders = ();
    foreach my $i (0 .. $#buildertype_list)
    {
        my $this_buildertype = $buildertype_list[$i];
        my $this_builderdir = $builderdir_list[$i];

        if ((defined $this_builderdir) && ($this_builderdir ne ""))
        {
            require "$this_builderdir/$this_buildertype.pm";
        }
        else
        {
            require "$this_buildertype.pm";
        }

        # explicit class-method call: no string eval and no reliance on how
        # "new Class(...)" is parsed in a package that defines its own new()
        my $this_builder = $this_buildertype->new($site, $collection,
                                                  $realarchivedir, $realbuilddir, $verbosity,
                                                  $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
                                                  $remove_empty_classifications,
                                                  $out, $no_text, $faillog, $gli);

        push(@builders, $this_builder);
    }

    # Init phase for builders
    foreach my $i (0 .. $#builders)
    {
        my $this_buildertype = $buildertype_list[$i];
        my $this_builder = $builders[$i];

        $this_builder->init();
        $this_builder->set_maxnumeric($maxnumeric);

        if (($this_buildertype eq "mgppbuilder") && $no_strip_html)
        {
            $this_builder->set_strip_html(0);
        }

        if ($sections_index_document_metadata ne "never")
        {
            $this_builder->set_sections_index_document_metadata($sections_index_document_metadata);
        }
        if (($this_buildertype eq "lucenebuilder" || $this_buildertype eq "solrbuilder") && $sections_sort_on_document_metadata ne "never")
        {
            $this_builder->set_sections_sort_on_document_metadata($sections_sort_on_document_metadata);
        }

        if ($store_metadata_coverage)
        {
            $this_builder->set_store_metadata_coverage(1);
        }
    }
    return \@builders;
}
|
---|
| 650 |
|
---|
# @function build_collection
#
# Runs the build passes selected by the object's 'mode' on every builder.
# Each pass is completed on all builders before the next pass starts.
#
#   all, extra     compress_text, build_indexes, make_infodatabase,
#                  collect_specific
#   compress_text  compress_text only
#   build_index    build_indexes only
#   infodb         make_infodatabase only
#
# @param $builders reference to the array of builder objects
#
# Prints {buildcol.unknown_mode} and dies for any other mode.
#
sub build_collection
{
    my $self = shift(@_);
    my @builders = @{shift(@_)};

    my $indexlevel = $self->{'indexlevel'};
    my $indexname = $self->{'indexname'};
    my $mode = $self->{'mode'};
    my $textindex = $self->{'textindex'};

    # one small closure per pass, applied to a single builder
    my %run_pass = (
        'compress_text'    => sub { $_[0]->compress_text($textindex); },
        # - we pass the required indexname and indexlevel (if specified) to
        #   the processor [jmt12]
        'build_index'      => sub { $_[0]->build_indexes($indexname, $indexlevel); },
        'infodb'           => sub { $_[0]->make_infodatabase(); },
        'collect_specific' => sub { $_[0]->collect_specific(); },
    );

    # Work out the requested passes
    my @passes;
    if ($mode =~ /^(all|extra)$/i)
    {
        @passes = ('compress_text', 'build_index', 'infodb', 'collect_specific');
    }
    elsif ($mode =~ /^compress_text$/i)
    {
        @passes = ('compress_text');
    }
    elsif ($mode =~ /^build_index$/i)
    {
        @passes = ('build_index');
    }
    elsif ($mode =~ /^infodb$/i)
    {
        @passes = ('infodb');
    }
    else
    {
        &gsprintf::gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
        # die regardless of what the message printer returned
        die;
    }

    # A plain loop: the builders are only called, never replaced, and no
    # result list needs to be built
    foreach my $pass (@passes)
    {
        foreach my $builder (@builders)
        {
            $run_pass{$pass}->($builder);
        }
    }
}
|
---|
| 696 | # build_collection()
|
---|
| 697 |
|
---|
| 698 | # @function build_auxiliary_files
|
---|
| 699 | #
|
---|
# @function build_auxiliary_files
#
# Asks every builder to create its auxiliary files, unless the object's
# 'debug' option is set, in which case nothing is done.
#
# @param $builders reference to the array of builder objects
#
sub build_auxiliary_files
{
    my $self = shift(@_);
    my @builders = @{shift(@_)};

    if (!$self->{'debug'})
    {
        # plain loop rather than map: only the side effect is wanted
        foreach my $builder (@builders)
        {
            $builder->make_auxiliary_files();
        }
    }
}
|
---|
| 709 | # build_auxiliary_files()
|
---|
| 710 |
|
---|
| 711 | # @function complete_builders
|
---|
| 712 | #
|
---|
# @function complete_builders
#
# Finishes a build: calls deinit() on every builder, copies a cached build
# back into the build directory, and copies rss-items.rdf from the archives
# directory into the build directory when that file exists.
#
# @param $builders reference to the array of builder objects
#
# The cached build is copied back only when 'realbuilddir' differs from
# 'builddir' and 'debug' is not set; the existing build directory content is
# removed first. Prints "</Build>" on STDERR when 'gli' is set.
#
sub complete_builders
{
    my $self = shift(@_);
    my @builders = @{shift(@_)};

    foreach my $builder (@builders)
    {
        $builder->deinit();
    }

    if (($self->{'realbuilddir'} ne $self->{'builddir'}) && !$self->{'debug'})
    {
        if ($self->{'verbosity'} >= 1)
        {
            # the output handle lives on the object; there is no $out here
            &gsprintf::gsprintf($self->{'out'}, "{buildcol.copying_back_cached_build}\n");
        }
        &FileUtils::removeFilesRecursive($self->{'builddir'});
        &FileUtils::copyFilesRecursive($self->{'realbuilddir'}, $self->{'builddir'});
    }

    # for RSS support: Need rss-items.rdf file in index folder
    # check if a file called rss-items.rdf exists in archives, then copy it into the building folder
    # so that when building is moved to index, this file will then also be in index as desired
    # NOTE(review): $collection_dir is not used below; the call is kept in
    # case the helper has side effects - confirm and remove
    my $collection_dir = &util::resolve_collection_dir($self->{'collectdir'},
                                                       $self->{'collection'},
                                                       $self->{'site'});
    my $rss_items_rdf_file = &FileUtils::filenameConcatenate($self->{'archivedir'}, 'rss-items.rdf');
    # @todo FileUtils
    if (defined $self->{'builddir'} && &FileUtils::directoryExists($self->{'builddir'}) && &FileUtils::fileExists($rss_items_rdf_file))
    {
        if ($self->{'verbosity'} >= 1)
        {
            my $archivedir_tail = "'" . basename($self->{'archivedir'}) . "'";
            my $builddir_tail = "'" . basename($self->{'builddir'}) . "'";

            &gsprintf::gsprintf($self->{'out'}, "{buildcol.copying_rss_items_rdf}\n", $archivedir_tail, $builddir_tail);
        }
        &FileUtils::copyFiles($rss_items_rdf_file, $self->{'builddir'});
    }

    if ($self->{'gli'})
    {
        print STDERR "</Build>\n";
    }
}
|
---|
| 755 | # complete_builders()
|
---|
| 756 |
|
---|
| 757 | # @function activate_collection
|
---|
| 758 | #
|
---|
# @function activate_collection
#
# If the object's 'activate' option is set, runs activate.pl for the
# collection now that building is complete. Nothing is done otherwise.
#
# The options collectdir, builddir, site, verbosity, removeold, keepold and
# incremental are forwarded to activate.pl when they are set. The script is
# started through the perl executable reported by util::get_perl_exec().
#
# On failure an error is printed on STDERR and the process exits with -1.
#
sub activate_collection
{
    my $self = shift(@_);

    # if buildcol.pl was run with -activate, need to run activate.pl
    # now that building's complete
    if ($self->{'activate'})
    {
        my @activate_argv = ();
        push(@activate_argv, '-collectdir', $self->{'collectdir'}) if ($self->{'collectdir'});
        push(@activate_argv, '-builddir', $self->{'builddir'}) if ($self->{'builddir'});
        push(@activate_argv, '-site', $self->{'site'}) if ($self->{'site'});
        # 0 is a meaningful verbosity, so test for presence rather than truth
        my $verbosity = $self->{'verbosity'};
        push(@activate_argv, '-verbosity', $verbosity) if (defined $verbosity && $verbosity ne '');
        push(@activate_argv, '-removeold') if ($self->{'removeold'});
        push(@activate_argv, '-keepold') if ($self->{'keepold'});
        push(@activate_argv, '-incremental') if ($self->{'incremental'});

        my $perl_exec = &util::get_perl_exec();
        my $collection = $self->get_collection();

        # quoted form of the command, used only for the error message
        my $quoted_argv = join(' ', map { "\"$_\"" } @activate_argv);
        my $activatecol_cmd = '"' . $perl_exec . '" -S activate.pl ' . $quoted_argv . ' "' . $collection . '"';

        # list form: every argument reaches activate.pl exactly as given,
        # without being re-interpreted by a shell
        my $activatecol_status = system($perl_exec, '-S', 'activate.pl', @activate_argv, $collection);

        if ($activatecol_status != 0)
        {
            print STDERR "Error: Failed to run: $activatecol_cmd\n";
            # $! is only meaningful when the command could not be started
            print STDERR " $!\n" if ($activatecol_status == -1);
            exit(-1);
        }
    }
}
|
---|
| 787 |
|
---|
| 788 | # @function deinit()
|
---|
| 789 | #
|
---|
# @function deinit()
#
# Closes the OUT and FAILLOG handles, each only when the corresponding
# 'close_out' / 'close_faillog' flag on the object is set.
#
sub deinit
{
    my ($self) = @_;

    # the output file, if one was opened
    close(OUT) if ($self->{'close_out'});

    # the fail log, if one was opened
    if ($self->{'close_faillog'})
    {
        close(FAILLOG);
    }
}
|
---|
| 803 | # deinit()
|
---|
| 804 |
|
---|
| 805 | 1;
|
---|