[27304] | 1 | ###############################################################################
|
---|
| 2 | #
|
---|
| 3 | # buildcolutils.pm -- index and build the collection. The buildtime counterpart
|
---|
| 4 | # of inexport.pl
|
---|
| 5 | #
|
---|
| 6 | # A component of the Greenstone digital library software
|
---|
| 7 | # from the New Zealand Digital Library Project at the
|
---|
| 8 | # University of Waikato, New Zealand.
|
---|
| 9 | #
|
---|
| 10 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 11 | #
|
---|
| 12 | # This program is free software; you can redistribute it and/or modify
|
---|
| 13 | # it under the terms of the GNU General Public License as published by
|
---|
| 14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 15 | # (at your option) any later version.
|
---|
| 16 | #
|
---|
| 17 | # This program is distributed in the hope that it will be useful,
|
---|
| 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 20 | # GNU General Public License for more details.
|
---|
| 21 | #
|
---|
| 22 | # You should have received a copy of the GNU General Public License
|
---|
| 23 | # along with this program; if not, write to the Free Software
|
---|
| 24 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 25 | #
|
---|
| 26 | ###############################################################################
|
---|
| 27 |
|
---|
| 28 | package buildcolutils;
|
---|
| 29 |
|
---|
| 30 | use colcfg;
|
---|
| 31 | use dbutil;
|
---|
| 32 | use util;
|
---|
| 33 | use FileUtils;
|
---|
| 34 | use scriptutil;
|
---|
| 35 | use gsprintf;
|
---|
| 36 | use printusage;
|
---|
| 37 | use parse2;
|
---|
| 38 |
|
---|
## @method new()
#
#  Parses and validates the arguments to the build process and returns a
#  blessed object holding the parsed option values. The builders that do the
#  actual work are created later, by prepare_builders().
#
#  @note Added true incremental support - John Thompson, DL Consulting Ltd.
#  @note Several bugs regarding the use of directories other than "import"
#        or "archives" during import and build were fixed. - John Thompson,
#        DL Consulting Ltd.
#
#  @param  $argv     Reference to the argument list. It is handed to
#                    parse2::parse() and, afterwards, its first element is
#                    taken as the collection name.
#  @param  $options  Option description; its 'args' entry is the argument
#                    specification given to parse2::parse(), and the whole
#                    structure is passed to the PrintUsage functions.
#  @param  $opt_listall_options  Accepted but not used by this constructor.
#  @return the blessed object. When the 'xml' option is set the usage is
#          printed as XML and the object is returned immediately, before the
#          collection name and output handle are processed.
#
#  Dies (after printing the text usage) when parse2::parse() returns -1, when
#  the number of leftover arguments is not exactly one, or when the first
#  element of @ARGV looks like a help flag. Also dies when the requested
#  output file cannot be opened.
#
sub new
{
  my $class = shift(@_);
  my ($argv, $options, $opt_listall_options) = @_;

  my $self = {'builddir' => undef,
              'buildtype' => undef,
              'close_faillog' => 0,
              'close_out' => 0,
              'mode' => '',
              'orthogonalbuildtypes' => undef,
              'realbuilddir' => undef,
              'textindex' => '',
              'xml' => 0
             };

  # general options available to all plugins
  my $arguments = $options->{'args'};
  my $intArgLeftinAfterParsing = &parse2::parse($argv, $arguments, $self, "allow_extra_options");
  # If parse returns -1 then something has gone wrong
  if ($intArgLeftinAfterParsing == -1)
  {
    &PrintUsage::print_txt_usage($options, "{buildcol.params}");
    die "\n";
  }

  # If a language has been specified, load the appropriate resource bundle
  # (Otherwise, the default resource bundle will be loaded automatically)
  if ($self->{'language'} && $self->{'language'} =~ /\S/)
  {
    &gsprintf::load_language_specific_resource_bundle($self->{'language'});
  }

  # Do we need 'listall' support in buildcol? If so, copy code from inexport
  # later [jmt12]

  # -xml only asks for the usage description, so no collection name is
  # required and nothing else needs to be set up
  if ($self->{'xml'})
  {
    &PrintUsage::print_xml_usage($options);
    print "\n";
    return bless($self, $class);
  }

  # the gli wants strings to be in UTF-8
  # - the flag lives on the object; there is no $gli variable in this scope,
  #   so testing a bare $gli would never be true
  if ($self->{'gli'})
  {
    &gsprintf::output_strings_in_UTF8;
  }

  # now check that we had exactly one leftover arg, which should be
  # the collection name. We don't want to do this earlier, cos
  # -xml arg doesn't need a collection name
  # Or if the user specified -h, then we output the usage also
  if ($intArgLeftinAfterParsing != 1 || (@ARGV && $ARGV[0] =~ /^\-+h/))
  {
    &PrintUsage::print_txt_usage($options, "{buildcol.params}");
    die "\n";
  }

  my $out = $self->{'out'};
  if ($out !~ /^(STDERR|STDOUT)$/i)
  {
    # Three-argument open so that the file name can never be interpreted as
    # part of the mode; the die no longer depends on what gsprintf returns
    if (!open(OUT, '>', $out))
    {
      &gsprintf::gsprintf(STDERR, "{common.cannot_open_output_file}\n", $out);
      die;
    }
    # From here on the handle is referred to by its qualified name
    $out = "buildcolutils::OUT";
    $self->{'close_out'} = 1;
  }
  $out->autoflush(1);
  $self->{'out'} = $out;

  # the argument list should now hold only one item, the name of the
  # collection
  $self->{'collection'} = shift(@{$argv});

  return bless($self, $class);
}
|
---|
| 133 | # new()
|
---|
| 134 |
|
---|
| 135 | # newCGI()?
|
---|
| 136 |
|
---|
# @function get_collection
#
# Accessor: returns the value stored under the 'collection' key of the
# object (undef when no value has been stored there).
#
sub get_collection
{
  my ($self) = @_;

  my $collection_name = $self->{'collection'};
  return $collection_name;
}
|
---|
| 144 | # get_collection()
|
---|
| 145 |
|
---|
# @function read_collection_cfg
#
# Selects the collection via colcfg::use_collection(), records the Greenstone
# version on the object, extends the include path with the collection's
# perllib directories, opens the fail log for appending and finally reads the
# collection configuration file.
#
# @param  $collection  Collection name as given by the caller
# @param  $options     Option description, only used to print the usage when
#                      the collection cannot be used
# @return a two element list: (configuration file name, configuration as
#         returned by colcfg::read_collection_cfg())
#
# Sets 'gs_version', 'faillog' (qualified handle name), 'faillogname' (path
# of the log file) and 'close_faillog' on the object. Dies when
# use_collection() returns an empty string or the fail log cannot be opened.
#
sub read_collection_cfg
{
  my $self = shift(@_);
  my ($collection, $options) = @_;

  my $collectdir = $self->{'collectdir'};
  my $site = $self->{'site'};
  my $out = $self->{'out'};

  # get and check the collection
  if (($collection = &colcfg::use_collection($site, $collection, $collectdir)) eq "")
  {
    &PrintUsage::print_txt_usage($options, "{buildcol.params}");
    die "\n";
  }

  # set gs_version 2/3 - a non-empty site means gs3
  $self->{'gs_version'} = "2";
  if ((defined $site) && ($site ne ""))
  {
    $self->{'gs_version'} = "3";
  }

  # add collection's perllib dir into include path in case we have collection
  # specific modules
  &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib'));
  &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'classify'));
  &util::augmentINC(&FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'perllib', 'plugins'));

  # check that we can open the faillog
  my $faillogname = $self->{'faillog'};
  if (!defined $faillogname || $faillogname eq "")
  {
    $faillogname = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "etc", "fail.log");
  }
  # note that we're appending to the faillog here (import.pl clears it each time)
  # this could potentially create a situation where the faillog keeps being added
  # to over multiple builds (if the import process is being skipped)
  # - three-argument open; the die no longer depends on what gsprintf returns
  if (!open(FAILLOG, '>>', $faillogname))
  {
    &gsprintf::gsprintf(STDERR, "{common.cannot_open_fail_log}\n", $faillogname);
    die;
  }
  my $faillog = 'buildcolutils::FAILLOG';
  $faillog->autoflush(1);
  $self->{'faillog'} = $faillog;
  # keep the path of the log file, not the handle name
  $self->{'faillogname'} = $faillogname;
  $self->{'close_faillog'} = 1;

  # Read in the collection configuration file.
  my $gs_mode = "gs".$self->{'gs_version'}; #gs2 or gs3
  my $config_filename = &colcfg::get_collect_cfg_name($out, $gs_mode);
  my $collect_cfg = &colcfg::read_collection_cfg($config_filename, $gs_mode);

  return ($config_filename, $collect_cfg);
}
|
---|
| 201 | # read_collection_cfg()
|
---|
| 202 |
|
---|
# @function set_collection_options
# This function copies across values for arguments from the collection
# configuration file if they are not already provided by the user, then
# sets reasonable defaults for any required arguments that remain without
# a value.
#
# @param $collectcfg  Hash reference holding the collection configuration.
#                     Its 'infodbtype' entry is filled in with the default
#                     when missing.
#
# All results are stored on the object; the return value is not meaningful.
#
sub set_collection_options
{
  my $self = shift @_;
  my ($collectcfg) = @_;

  # If the infodbtype value wasn't defined in the collect.cfg file, use the default
  if (!defined($collectcfg->{'infodbtype'}))
  {
    $collectcfg->{'infodbtype'} = &dbutil::get_default_infodb_type();
  }
  # - just so I don't have to pass collectcfg around as well
  $self->{'infodbtype'} = $collectcfg->{'infodbtype'};

  if ($self->{'verbosity'} !~ /\d+/)
  {
    if (defined $collectcfg->{'verbosity'} && $collectcfg->{'verbosity'} =~ /\d+/)
    {
      $self->{'verbosity'} = $collectcfg->{'verbosity'};
    }
    else
    {
      $self->{'verbosity'} = 2; # the default
    }
  }

  # we use searchtype for determining buildtype, but for old versions, use buildtype
  if (defined $collectcfg->{'buildtype'})
  {
    $self->{'buildtype'} = $collectcfg->{'buildtype'};
  }
  elsif (defined $collectcfg->{'searchtypes'} || defined $collectcfg->{'searchtype'})
  {
    $self->{'buildtype'} = "mgpp";
  }
  else
  {
    $self->{'buildtype'} = "mg"; #mg is the default
  }

  if ($self->{'buildtype'} eq "mgpp" && defined $collectcfg->{'textcompress'})
  {
    $self->{'textindex'} = $collectcfg->{'textcompress'};
  }

  # is it okay to always clobber or possible remain undefined? [jmt12]
  if (defined $collectcfg->{'orthogonalbuildtypes'})
  {
    $self->{'orthogonalbuildtypes'} = $collectcfg->{'orthogonalbuildtypes'};
  }

  # - resolve (and possibly set to default) archivedir
  if (defined $collectcfg->{'archivedir'} && $self->{'archivedir'} eq "")
  {
    $self->{'archivedir'} = $collectcfg->{'archivedir'};
  }
  # Modified so that the archivedir, if provided as an argument, is made
  # absolute if it isn't already
  if ($self->{'archivedir'} eq "")
  {
    $self->{'archivedir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
  }
  else
  {
    # the value to resolve is the one stored on the object; there is no
    # $archivedir variable in this scope
    $self->{'archivedir'} = &util::make_absolute($ENV{'GSDLCOLLECTDIR'}, $self->{'archivedir'});
  }
  # End Mod
  $self->{'archivedir'} = &FileUtils::sanitizePath($self->{'archivedir'});

  # - resolve (and possibly set to default) builddir
  if (defined $collectcfg->{'builddir'} && $self->{'builddir'} eq "")
  {
    $self->{'builddir'} = $collectcfg->{'builddir'};
  }
  if ($self->{'builddir'} eq "")
  {
    $self->{'builddir'} = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, 'building');
    # the incremental flag and the output handle are read from the object;
    # at this point it is the flag as supplied, before it is normalised by
    # check_removeold_and_keepold() further down
    if ($self->{'incremental'})
    {
      &gsprintf::gsprintf($self->{'out'}, "{buildcol.incremental_default_builddir}\n");
    }
  }
  # - why don't we make builddir absolute similar to archivedir?
  $self->{'builddir'} = &FileUtils::sanitizePath($self->{'builddir'});

  if (defined $collectcfg->{'cachedir'} && $self->{'cachedir'} eq "")
  {
    $self->{'cachedir'} = $collectcfg->{'cachedir'};
  }

  if ($self->{'maxdocs'} !~ /\-?\d+/)
  {
    if (defined $collectcfg->{'maxdocs'} && $collectcfg->{'maxdocs'} =~ /\-?\d+/)
    {
      $self->{'maxdocs'} = $collectcfg->{'maxdocs'};
    }
    else
    {
      $self->{'maxdocs'} = -1; # the default
    }
  }

  # always clobbers? [jmt12]
  if (defined $collectcfg->{'maxnumeric'} && $collectcfg->{'maxnumeric'} =~ /\d+/)
  {
    $self->{'maxnumeric'} = $collectcfg->{'maxnumeric'};
  }
  # values outside 4..512 fall back to 4
  if ($self->{'maxnumeric'} < 4 || $self->{'maxnumeric'} > 512)
  {
    $self->{'maxnumeric'} = 4;
  }

  if (defined $collectcfg->{'debug'} && $collectcfg->{'debug'} =~ /^true$/i)
  {
    $self->{'debug'} = 1;
  }

  if ($self->{'mode'} !~ /^(all|compress_text|build_index|infodb)$/)
  {
    if (defined $collectcfg->{'mode'} && $collectcfg->{'mode'} =~ /^(all|compress_text|build_index|infodb)$/)
    {
      $self->{'mode'} = $collectcfg->{'mode'};
    }
    else
    {
      $self->{'mode'} = "all"; # the default
    }
  }

  # Presumably 'index' from the collect.cfg still works [jmt12]
  if (defined $collectcfg->{'index'} && $self->{'indexname'} eq "")
  {
    $self->{'indexname'} = $collectcfg->{'index'};
  }
  # - 'index' from the command line doesn't make it through parsing so I
  #   renamed this option 'indexname' [jmt12]
  if (defined $collectcfg->{'indexname'} && $self->{'indexname'} eq "")
  {
    $self->{'indexname'} = $collectcfg->{'indexname'};
  }
  # - we may also define the index level to build now [jmt12]
  if (defined $collectcfg->{'indexlevel'} && $self->{'indexlevel'} eq "")
  {
    $self->{'indexlevel'} = $collectcfg->{'indexlevel'};
  }

  # Boolean switches: a configuration value of "true" (any case) turns the
  # switch on when it has not already been turned on
  if (defined $collectcfg->{'no_text'} && $self->{'no_text'} == 0)
  {
    if ($collectcfg->{'no_text'} =~ /^true$/i)
    {
      $self->{'no_text'} = 1;
    }
  }

  if (defined $collectcfg->{'no_strip_html'} && $self->{'no_strip_html'} == 0)
  {
    if ($collectcfg->{'no_strip_html'} =~ /^true$/i)
    {
      $self->{'no_strip_html'} = 1;
    }
  }

  if (defined $collectcfg->{'store_metadata_coverage'} && $self->{'store_metadata_coverage'} == 0)
  {
    if ($collectcfg->{'store_metadata_coverage'} =~ /^true$/i)
    {
      $self->{'store_metadata_coverage'} = 1;
    }
  }

  if (defined $collectcfg->{'remove_empty_classifications'} && $self->{'remove_empty_classifications'} == 0)
  {
    if ($collectcfg->{'remove_empty_classifications'} =~ /^true$/i)
    {
      $self->{'remove_empty_classifications'} = 1;
    }
  }

  if (defined $collectcfg->{'gli'} && $collectcfg->{'gli'} =~ /^true$/i)
  {
    $self->{'gli'} = 1;
  }
  if (!defined $self->{'gli'})
  {
    $self->{'gli'} = 0;
  }

  if ($self->{'sections_index_document_metadata'} !~ /\S/ && defined $collectcfg->{'sections_index_document_metadata'})
  {
    $self->{'sections_index_document_metadata'} = $collectcfg->{'sections_index_document_metadata'};
  }

  # anything other than the three recognised values becomes 'never'
  if ($self->{'sections_index_document_metadata'} !~ /^(never|always|unless_section_metadata_exists)$/)
  {
    $self->{'sections_index_document_metadata'} = 'never';
  }

  if ($self->{'sections_sort_on_document_metadata'} !~ /\S/ && defined $collectcfg->{'sections_sort_on_document_metadata'})
  {
    $self->{'sections_sort_on_document_metadata'} = $collectcfg->{'sections_sort_on_document_metadata'};
  }

  if ($self->{'sections_sort_on_document_metadata'} !~ /^(never|always|unless_section_metadata_exists)$/)
  {
    $self->{'sections_sort_on_document_metadata'} = 'never';
  }

  my ($removeold, $keepold, $incremental, $incremental_mode)
      = &scriptutil::check_removeold_and_keepold($self->{'removeold'}, $self->{'keepold'},
                                                 $self->{'incremental'}, 'building',
                                                 $collectcfg);
  $self->{'removeold'} = $removeold;
  $self->{'keepold'} = $keepold;
  $self->{'incremental'} = $incremental;
  $self->{'incremental_mode'} = $incremental_mode;

  # New argument to track whether build is incremental
  if (!defined $self->{'incremental'})
  {
    $self->{'incremental'} = 0;
  }

  #set the text index
  if (($self->{'buildtype'} eq 'mgpp') || ($self->{'buildtype'} eq 'lucene') || ($self->{'buildtype'} eq 'solr'))
  {
    if ($self->{'textindex'} eq '')
    {
      $self->{'textindex'} = 'text';
    }
  }
  else
  {
    $self->{'textindex'} = 'section:text';
  }
}
|
---|
| 445 | # set_collection_options()
|
---|
| 446 |
|
---|
# @function prepare_builders
#
# Works out the real archive and build directories (optionally below a cache
# directory), decides which builder class to use, loads and constructs the
# main builder plus one builder per orthogonal build type, and runs the
# initialisation phase on each of them.
#
# @param  $config_filename  Not used in this function
# @param  $collectcfg       Not used in this function
# @return reference to an array holding the builder objects, main builder
#         first
#
# Stores 'realarchivedir' and 'realbuilddir' on the object. Dies when a
# builder module cannot be loaded or its constructor fails.
#
sub prepare_builders
{
  my $self = shift @_;
  my ($config_filename,$collectcfg) = @_;

  my $archivedir = $self->{'archivedir'};
  my $builddir = $self->{'builddir'};
  my $buildtype = $self->{'buildtype'};
  my $cachedir = $self->{'cachedir'};
  my $collection = $self->{'collection'};
  my $debug = $self->{'debug'};
  my $faillog = $self->{'faillog'};
  my $gli = $self->{'gli'};
  my $incremental = $self->{'incremental'};
  my $incremental_mode = $self->{'incremental_mode'};
  my $keepold = $self->{'keepold'};
  my $maxdocs = $self->{'maxdocs'};
  my $maxnumeric = $self->{'maxnumeric'};
  my $no_strip_html = $self->{'no_strip_html'};
  my $no_text = $self->{'no_text'};
  my $orthogonalbuildtypes = $self->{'orthogonalbuildtypes'};
  my $out = $self->{'out'};
  my $remove_empty_classifications = $self->{'remove_empty_classifications'};
  my $sections_index_document_metadata = $self->{'sections_index_document_metadata'};
  my $sections_sort_on_document_metadata = $self->{'sections_sort_on_document_metadata'};
  my $site = $self->{'site'};
  my $store_metadata_coverage = $self->{'store_metadata_coverage'};
  my $verbosity = $self->{'verbosity'};

  if ($gli)
  {
    print STDERR "<Build>\n";
  }

  my ($realarchivedir, $realbuilddir);
  # update the archive cache if needed
  if ($cachedir)
  {
    if ($verbosity >= 1)
    {
      &gsprintf::gsprintf($out, "{buildcol.updating_archive_cache}\n");
    }

    $cachedir =~ s/[\\\/]+$//;
    # The collection name is matched literally (\Q...\E) so that regex
    # metacharacters in the name cannot change or break the match
    if ($cachedir !~ /collect[\/\\]\Q$collection\E/)
    {
      $cachedir = &FileUtils::filenameConcatenate($cachedir, 'collect', $collection);
    }

    $realarchivedir = &FileUtils::filenameConcatenate($cachedir, 'archives');
    $realbuilddir = &FileUtils::filenameConcatenate($cachedir, 'building');
    &FileUtils::makeAllDirectories($realarchivedir);
    &FileUtils::makeAllDirectories($realbuilddir);
    &util::cachedir($archivedir, $realarchivedir, $verbosity);
  }
  else
  {
    $realarchivedir = $archivedir;
    $realbuilddir = $builddir;
  }
  $self->{'realarchivedir'} = $realarchivedir;
  $self->{'realbuilddir'} = $realbuilddir;

  # build it in realbuilddir
  &FileUtils::makeAllDirectories($realbuilddir);

  my ($buildertype, $builderdir);
  # if a builder class has been created for this collection, use it
  # otherwise, use the builder that matches the build type
  if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuilder.pm")
  {
    $builderdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
    $buildertype = "custombuilder";
  }
  elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuilder.pm")
  {
    $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
    $buildertype = "custombuilder";
  }
  elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}builder.pm")
  {
    $builderdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
    $buildertype = $collection . 'builder';
  }
  else
  {
    $builderdir = undef;
    if ($buildtype ne '')
    {
      # caters for extension-based build types, such as 'solr'
      $buildertype = $buildtype . 'builder';
    }
    else
    {
      # Default to mgpp
      $buildertype = 'mgppbuilder';
    }
  }

  # Set up the list of builders: the main builder first, followed by the
  # extension specific (orthogonal) builders, which run after it
  my @builderdir_list = ($builderdir);
  my @buildertype_list = ($buildertype);

  if (defined $orthogonalbuildtypes)
  {
    foreach my $obt (@$orthogonalbuildtypes)
    {
      push(@builderdir_list,undef); # rely on @INC to find it
      push(@buildertype_list,$obt."Builder");
    }
  }

  my $num_builders = scalar(@buildertype_list);
  my @builders = ();

  # Load and construct each builder
  for (my $i=0; $i<$num_builders; $i++)
  {
    my $this_buildertype = $buildertype_list[$i];
    my $this_builderdir = $builderdir_list[$i];

    if ((defined $this_builderdir) && ($this_builderdir ne ""))
    {
      require "$this_builderdir/$this_buildertype.pm";
    }
    else
    {
      require "$this_buildertype.pm";
    }

    # Construct through a normal class method call on the class name rather
    # than by building Perl source text and evaluating it. The block eval
    # keeps the previous behaviour of re-raising a constructor failure as a
    # string.
    my $this_builder = eval
    {
      $this_buildertype->new($site, $collection,
                             $realarchivedir, $realbuilddir, $verbosity,
                             $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
                             $remove_empty_classifications,
                             $out, $no_text, $faillog, $gli);
    };
    die "$@" if $@;

    push(@builders,$this_builder);
  }

  # Init phase for builders
  for (my $i=0; $i<$num_builders; $i++)
  {
    my $this_buildertype = $buildertype_list[$i];
    my $this_builder = $builders[$i];

    $this_builder->init();
    $this_builder->set_maxnumeric($maxnumeric);

    # only the mgpp builder is told about no_strip_html
    if (($this_buildertype eq "mgppbuilder") && $no_strip_html)
    {
      $this_builder->set_strip_html(0);
    }

    if ($sections_index_document_metadata ne "never")
    {
      $this_builder->set_sections_index_document_metadata($sections_index_document_metadata);
    }
    # only the lucene builder is told about sort-on-document-metadata
    if ($this_buildertype eq "lucenebuilder" && $sections_sort_on_document_metadata ne "never")
    {
      $this_builder->set_sections_sort_on_document_metadata($sections_sort_on_document_metadata);
    }

    if ($store_metadata_coverage)
    {
      $this_builder->set_store_metadata_coverage(1);
    }
  }
  return \@builders;
}
|
---|
| 627 |
|
---|
# Runs the build passes selected by the object's 'mode' on every builder.
#
# @param $builders_ref  Reference to an array of builder objects
#
# Modes (matched case-insensitively):
#   all           - compress_text, build_indexes, make_infodatabase and
#                   collect_specific, in that order
#   compress_text - compress_text only
#   build_index   - build_indexes only
#   infodb        - make_infodatabase only
# Each pass is run on all builders before the next pass starts. Any other
# mode is reported on STDERR and the function dies.
#
sub build_collection
{
  my $self = shift(@_);
  my @builders = @{shift(@_)};

  my $indexlevel = $self->{'indexlevel'};
  my $indexname = $self->{'indexname'};
  my $mode = $self->{'mode'};
  my $textindex = $self->{'textindex'};

  # Plain loops are used for the passes: the methods are called for their
  # side effects only, and a lexical loop variable leaves both @builders and
  # $_ untouched whatever the methods return.
  if ($mode =~ /^all$/i)
  {
    foreach my $builder (@builders)
    {
      $builder->compress_text($textindex);
    }
    # - we pass the required indexname and indexlevel (if specified) to the
    #   processor [jmt12]
    foreach my $builder (@builders)
    {
      $builder->build_indexes($indexname, $indexlevel);
    }
    foreach my $builder (@builders)
    {
      $builder->make_infodatabase();
    }
    foreach my $builder (@builders)
    {
      $builder->collect_specific();
    }
  }
  elsif ($mode =~ /^compress_text$/i)
  {
    foreach my $builder (@builders)
    {
      $builder->compress_text($textindex);
    }
  }
  elsif ($mode =~ /^build_index$/i)
  {
    foreach my $builder (@builders)
    {
      $builder->build_indexes($indexname, $indexlevel);
    }
  }
  elsif ($mode =~ /^infodb$/i)
  {
    foreach my $builder (@builders)
    {
      $builder->make_infodatabase();
    }
  }
  else
  {
    # Always die for an unknown mode, whatever gsprintf returns
    &gsprintf::gsprintf(STDERR, "{buildcol.unknown_mode}\n", $mode);
    die;
  }
}
|
---|
| 673 | # build_collection()
|
---|
| 674 |
|
---|
# @function build_auxiliary_files
#
# Asks every builder to make its auxiliary files. Nothing is done when the
# object's 'debug' flag is set.
#
# @param $builders_ref  Reference to an array of builder objects
#
sub build_auxiliary_files
{
  my ($self, $builders_ref) = @_;
  my @builder_list = @{$builders_ref};

  # debug runs skip the auxiliary files altogether
  return if ($self->{'debug'});

  foreach my $builder (@builder_list)
  {
    $builder->make_auxiliary_files();
  }
}
|
---|
| 686 | # build_auxiliary_files()
|
---|
| 687 |
|
---|
# @function complete_builders
#
# Finishes a build: de-initialises every builder, copies a cached build back
# to the build directory when the two differ (not in debug mode), copies
# rss-items.rdf from the archives directory into the build directory when it
# exists, and closes the <Build> element written to STDERR for the GLI.
#
# @param $builders_ref  Reference to an array of builder objects
#
sub complete_builders
{
  my $self = shift(@_);
  my @builders = @{shift(@_)};

  # output handle for progress messages; it has to be read from the object,
  # there is no $out variable in this scope otherwise
  my $out = $self->{'out'};

  foreach my $builder (@builders)
  {
    $builder->deinit();
  }

  # The build was made in a cache directory: replace the build directory
  # with the cached result
  if (($self->{'realbuilddir'} ne $self->{'builddir'}) && !$self->{'debug'})
  {
    if ($self->{'verbosity'} >= 1)
    {
      &gsprintf::gsprintf($out, "{buildcol.copying_back_cached_build}\n");
    }
    &util::rm_r($self->{'builddir'});
    &util::cp_r($self->{'realbuilddir'}, $self->{'builddir'});
  }

  # for RSS support: Need rss-items.rdf file in index folder
  # check if a file called rss-items.rdf exists in archives, then copy it into the building folder
  # so that when building is moved to index, this file will then also be in index as desired
  # NOTE(review): $collection_dir is not used below; the call is kept because
  # resolve_collection_dir() is not visible from here - confirm it can go
  my $collection_dir = &util::resolve_collection_dir($self->{'collectdir'},
                                                     $self->{'collection'},
                                                     $self->{'site'});
  my $rss_items_rdf_file = &FileUtils::filenameConcatenate($self->{'archivedir'}, 'rss-items.rdf');
  # @todo FileUtils
  if (defined $self->{'builddir'} && &FileUtils::directoryExists($self->{'builddir'}) && &FileUtils::fileExists($rss_items_rdf_file))
  {
    if ($self->{'verbosity'} >= 1)
    {
      &gsprintf::gsprintf($out, "{buildcol.copying_rss_items_rdf}\n");
    }
    &FileUtils::copyFiles($rss_items_rdf_file, $self->{'builddir'});
  }

  if ($self->{'gli'})
  {
    print STDERR "</Build>\n";
  }
}
|
---|
| 729 | # complete_builders()
|
---|
| 730 |
|
---|
# @function activate_collection
#
# If the object's 'activate' option is set, runs activate.pl for the
# collection now that building is complete, passing on the collectdir,
# builddir, site, verbosity, removeold, keepold and incremental options that
# are set on the object.
#
# Prints an error to STDERR and exits the process with -1 when the command
# returns a non-zero status.
#
sub activate_collection
{
  my $self = shift(@_);
  # if buildcol.pl was run with -activate, need to run activate.pl
  # now that building's complete
  if ($self->{'activate'})
  {
    my @activate_argv = ();
    push(@activate_argv, '-collectdir', $self->{'collectdir'}) if ($self->{'collectdir'});
    push(@activate_argv, '-builddir', $self->{'builddir'}) if ($self->{'builddir'});
    push(@activate_argv, '-site', $self->{'site'}) if ($self->{'site'});
    push(@activate_argv, '-verbosity', $self->{'verbosity'}) if ($self->{'verbosity'});
    push(@activate_argv, '-removeold') if ($self->{'removeold'});
    push(@activate_argv, '-keepold') if ($self->{'keepold'});
    push(@activate_argv, '-incremental') if ($self->{'incremental'});
    # every argument is wrapped in double quotes for the shell
    my $quoted_argv = join(' ', map { "\"$_\"" } @activate_argv);
    # the collection name comes from the object; there is no $collection
    # variable in this scope, so using one would pass an empty name
    my $activatecol_cmd = '"' . &util::get_perl_exec(). '" -S activate.pl ' . $quoted_argv . ' "' . $self->{'collection'} . '"';
    my $activatecol_status = system($activatecol_cmd)/256;

    if ($activatecol_status != 0)
    {
      print STDERR "Error: Failed to run: $activatecol_cmd\n";
      print STDERR "       $!\n" if ($! ne '');
      exit(-1);
    }
  }
}
|
---|
| 761 |
|
---|
# @function deinit()
#
# Closes the OUT and FAILLOG file handles, each only when the matching
# 'close_out' / 'close_faillog' flag is set on the object.
#
sub deinit
{
  my ($self) = @_;

  close(OUT) if ($self->{'close_out'});
  close(FAILLOG) if ($self->{'close_faillog'});
}
|
---|
| 777 | # deinit()
|
---|
| 778 |
|
---|
| 779 | 1;
|
---|