source: main/trunk/greenstone2/perllib/basebuilder.pm@ 32594

Last change on this file since 32594 was 32539, checked in by ak19, 5 years ago

New plugin parameter site_name (only set for GS3) that is passed to plugin::load_plugins() (but not to plugin::load_plugin_for_info()/gsdlinfo mode) by inexport.pm during import.pl and by basebuilder.pm during buildcol.pl. Like the gs_version parameter, it is parsed by plugins/PrintInfo.pm and will appear before gs_version (to preserve the way things were being parsed until now)

  • Property svn:keywords set to Author Date Id Revision
File size: 27.1 KB
Line 
1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use arcinfo;
32use classify;
33use cfgread;
34use colcfg;
35use dbutil;
36use oaiinfo;
37use plugin;
38use util;
39use FileUtils;
40
41
# Turn autoflush on for both standard streams while this module is in use,
# so that mgpp doesn't get out of sync with plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush off again on both standard streams at shutdown.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Package-level document size limit (not referenced elsewhere in this file).
our $maxdocsize = 12000;

# Which Greenstone flavour is being built for: "gs2" (the default) or "gs3".
# new() switches this to "gs3" when it is given a non-empty site name.
our $gs_mode = "gs2";
58
# Constructor for a collection builder.
#
# Positional arguments after the class name: site (undef for Greenstone 2),
# collection, source_dir, build_dir, verbosity, maxdocs, debug, keepold,
# incremental, incremental_mode, remove_empty_classifications, outhandle
# (defaults to STDERR), no_text (defaults to 0), failhandle (defaults to
# STDERR) and gli (defaults to 0).
#
# Reads the collection configuration file, works out the info database type
# and records the 'dontgdbm' fields. Returns the blessed builder object.
sub new {
    my $class = shift(@_);
    my ($site, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_mode,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # fill in defaults for the optional trailing arguments
    if (!defined $outhandle)  { $outhandle  = *STDERR; }
    if (!defined $no_text)    { $no_text    = 0; }
    if (!defined $failhandle) { $failhandle = *STDERR; }
    if (!defined $gli)        { $gli        = 0; }

    # create a builder object
    my $self = {
        'site'                         => $site,    # will be undef for Greenstone 2
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_mode'             => $incremental_mode,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},       # indexes not built
        'gli'                          => $gli,
    };
    bless $self, $class;

    # A non-empty site name means we are building for Greenstone 3.
    $gs_mode = "gs3" if (defined($site) && $site ne "");

    # Read in the collection configuration file.
    my $colcfgname = &colcfg::get_collect_cfg_name($outhandle, $gs_mode);
    $self->{'colcfgname'} = $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);

    if ($gs_mode eq "gs3") {
        # Read it in a second time to keep an untouched copy for the later
        # writing out of buildConfig.xml: $self->{'collect_cfg'}->{'classify'}
        # gets modified during the call to &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg($colcfgname, $gs_mode);
    }

    # database type for this collection, from the config file if it is set
    # there, otherwise the default
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    # load up any dontdb fields
    my $dontgdbm = $self->{'collect_cfg'}->{'dontgdbm'};
    my %dontdb = ();
    if (defined $dontgdbm) {
        %dontdb = map { ($_ => 1) } @$dontgdbm;
    }
    $self->{'dontdb'} = \%dontdb;

    $self->{'maxnumeric'} = 4;
    return $self;
}
121
# Second-stage initialisation; kept separate from new() so that subclass
# methods (generate_index_list, is_incremental_capable, default_buildproc,
# init_for_incremental_build, ...) can be used.
#
# In order, this:
#   - expands the configured indexes by subcollection and language, and
#     removes duplicates
#   - prepares the OAI info database for the building stage
#   - falls back to a non-incremental build if the builder cannot do one
#   - loads the plugins (dies if none were loaded) and the classifiers
#   - locates, loads and instantiates the buildproc class
#   - clears out the build directory unless 'debug' or 'keepold' is set
sub init {
    my $self = shift(@_);

    my $outhandle = $self->{'outhandle'};
    my $failhandle = $self->{'failhandle'};

    $self->generate_index_list();

    my $collect_cfg = $self->{'collect_cfg'};
    my $indexes = $collect_cfg->{'indexes'};
    if (defined $indexes) {
        # sort out subcollection indexes
        my $subcollections = $collect_cfg->{'indexsubcollections'};
        if (defined $subcollections) {
            my @expanded = ();
            foreach my $subcollection (@$subcollections) {
                foreach my $index (@$indexes) {
                    push(@expanded, $index . ":" . $subcollection);
                }
            }
            $collect_cfg->{'indexes'} = \@expanded;
        }

        # sort out language subindexes
        my $languages = $collect_cfg->{'languages'};
        if (defined $languages) {
            $indexes = $collect_cfg->{'indexes'};
            # without subcollections an empty subcollection field is added
            my $separator = defined($subcollections) ? ":" : "::";
            my @expanded = ();
            foreach my $language (@$languages) {
                foreach my $index (@$indexes) {
                    push(@expanded, $index . $separator . $language);
                }
            }
            $collect_cfg->{'indexes'} = \@expanded;
        }
    }

    if (defined($collect_cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %seen = ();
        $collect_cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$collect_cfg->{'indexes'}} ];
    } else {
        $collect_cfg->{'indexes'} = [];
    }

    # Prepare to work with the <collection>/etc/oai-inf.<db> that keeps track
    # of the OAI identifiers with their time stamps and deleted status.
    #
    # At this stage of working with the oai info db, we don't care whether we have a
    # manifest or are otherwise incremental, or whether we're doing removeold (full rebuild).
    # Because we've already dealt with that during the import stage. From here on, we pretend
    # we're incremental, since the oai info db should just do what archiveinfo contains.
    # This is because "building is always incremental" where oai info db is concerned.
    #
    # NOTE(review): this passes the raw config value, which may be undefined,
    # rather than $self->{'infodbtype'} (which has the default applied in
    # new()) - confirm that oaiinfo copes with an undefined type.
    my $archivedir = $self->{'source_dir'};
    my $oai_info = oaiinfo->new($self->{'colcfgname'}, $self->{'collect_cfg'}->{'infodbtype'}, $self->{'verbosity'});
    $oai_info->building_stage_before_indexing($archivedir);

    # check incremental against whether builder can cope or not.
    if ($self->{'incremental'} && !$self->is_incremental_capable()) {
        print $outhandle "WARNING: The indexer used is not capable of incremental building. Reverting to -removeold\n";
        $self->{'keepold'} = 0;
        $self->{'incremental'} = 0;
        $self->{'incremental_mode'} = "none";
    }

    # gs_version for plugins
    my $gs_version = ($gs_mode eq "gs3") ? "3" : "2";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $self->{'verbosity'}, $outhandle, $failhandle, \@global_opts, $self->{'incremental_mode'}, $gs_version, $self->{'site'});

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $self->{'build_dir'}, $outhandle);

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc for the builder we are initialising
    my $buildprocdir = undef;
    my $buildproctype;

    my $collection = $self->{'collection'};
    my $collectdir = $ENV{'GSDLCOLLECTDIR'};
    # candidate [directory, class name] pairs, in order of preference
    my @custom_buildprocs = (
        ["$collectdir/custom/${collection}/perllib", "custombuildproc"],
        ["$collectdir/perllib", "custombuildproc"],
        ["$collectdir/perllib", "${collection}buildproc"],
    );
    foreach my $candidate (@custom_buildprocs) {
        my ($dir, $type) = @$candidate;
        if (-e "$dir/$type.pm") {
            ($buildprocdir, $buildproctype) = ($dir, $type);
            last;
        }
    }
    if (defined $buildprocdir) {
        require "$buildprocdir/$buildproctype.pm";
    }
    else {
        $buildproctype = $self->default_buildproc();
        require "$buildproctype.pm";
    }

    # Instantiate the buildproc through a method call on the class name;
    # no string eval is needed, and a failing constructor simply propagates
    # its exception to our caller.
    $self->{'buildproc'} = $buildproctype->new($self->{'collection'},
                                               $self->{'source_dir'},
                                               $self->{'build_dir'},
                                               $self->{'keepold'},
                                               $self->{'verbosity'},
                                               $self->{'outhandle'});

    # We call set_infodbtype() now so the buildproc knows the infodbtype for all phases of the build
    $self->{'buildproc'}->set_infodbtype($self->{'infodbtype'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &FileUtils::removeFilesRecursive($self->{'build_dir'});
        &FileUtils::makeAllDirectories($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &FileUtils::makeAllDirectories($textdir);
    }

    if ($self->{'incremental'}) {
        # some classes may need to do some additional initialisation
        $self->init_for_incremental_build();
    }

}
283
# Says whether this builder can build incrementally.
# The base class answers 'no' (0): it is safer to assume a builder is
# non-incremental, and let the inherited classes that are capable override
# this method.
sub is_incremental_capable
{
    return 0;
}
292
# Hook called at the end of init() when an incremental build is under way.
# Does nothing here; implement it in a subclass that needs additional
# initialisation for an incremental build.
sub init_for_incremental_build {
    my $self = shift;
}
298
# Counterpart of init(): hands the loaded plugins and the buildproc to
# &plugin::deinit.
sub deinit {
    my ($self) = @_;

    &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
304
# Works out the index options from the collection configuration.
# separate_cjk is switched on when any entry of 'indexoptions' contains the
# text "separate_cjk"; the result is passed to the buildproc and also
# remembered on the builder so it can be written to build.cfg later.
sub generate_index_options {
    my ($self) = @_;

    my $indexoptions = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined($indexoptions) && scalar(grep { $_ =~ /separate_cjk/ } @$indexoptions)) {
        $separate_cjk = 1;
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
324
# Forwards the given setting to the buildproc's
# set_sections_index_document_metadata().
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
331
# Stores the 'maxnumeric' value on the builder (new() initialises it to 4).
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc as well.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}
345
# Forwards the store_metadata_coverage setting to the buildproc.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'buildproc'}->set_store_metadata_coverage($store_metadata_coverage);
}
352
# Placeholder for text compression; must be implemented in a subclass.
# The base version only prints a diagnostic to STDERR and returns nothing.
# The diagnostic ends with a newline, like those of the other unimplemented
# placeholders in this file, so it does not run into the following output.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
360
361
# Builds the indexes of the collection.
# If $indexname contains at least one word character only that index is
# built, otherwise every index listed in the collection configuration.
# Calls pre_build_indexes() first and post_build_indexes() last; indexes for
# which want_built() is false are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes();

    my $indexes = (defined($indexname) && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->post_build_indexes();

}
395
# Hook called by build_indexes() before any index is built.
# Does nothing here; implement it in a subclass that wants to do extra work
# before the indexes are built.
sub pre_build_indexes {
    my $self = shift;
    my ($indexname) = @_; # optional parameter
}
402
# Hook called by build_indexes() once all the indexes have been handled.
# Does nothing here; implement it in a subclass that wants to do extra work
# at the end of building all the indexes.
sub post_build_indexes {
    my $self = shift;
}
408
# Placeholder for building a single index; must be implemented in a subclass.
# The base version only prints a diagnostic to STDERR and returns nothing.
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
416
# Says whether the builder supports make_infodatabase().
# By default builders do, so the base class answers 1.
sub supports_make_infodatabase {
    return 1;
}
421
422
# Creates the collection's info database and processes associated files.
#
# Runs every document in the source directory through the buildproc in
# 'infodb' mode, then writes the collection metadata, the classification
# information and the "browselist" entry to the database. In debug mode the
# entries go to STDOUT instead of a database write handle. In incremental
# mode the existing database is read first and the documents reconstructed
# from it are fed back into the classify structures.
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $incremental = $self->{'incremental'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
    foreach my $dir ($textdir, $assocdir) {
        &FileUtils::makeAllDirectories($dir);
    }

    # Get info database file path
    my $infodb_type = $self->{'infodbtype'};
    my $infodb_file_path = &dbutil::get_infodb_file_path($infodb_type, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    my $reconstructed_docs = undef;
    my $database_recs = undef;

    if ($incremental) {
        $database_recs = {};

        &dbutil::read_infodb_file($infodb_type, $infodb_file_path, $database_recs);
    }

    # Important (for memory usage reasons): the write handle is obtained
    # HERE, before $reconstructed_docs is set up (assuming -incremental is
    # on), and not afterwards.
    #
    # Opening a pipe to txt2db [using open()] triggers a fork() followed by
    # exec(). $reconstructed_docs can get very large, so if the open() came
    # after it, the fork would clone the *large* process image, which is then
    # quickly replaced in the execve() with the much smaller image for
    # 'txt2db'. For the moment of the fork() the system really does need to
    # have all that memory available even though it isn't ultimately used,
    # and the result is an out of memory error.
    my $infodb_handle;
    if (!$self->{'debug'}) {
        $infodb_handle = &dbutil::open_infodb_write_handle($infodb_type, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }
    else {
        $infodb_handle = *STDOUT;
    }

    if ($incremental) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($infodb_type,
                                                                       $infodb_file_path,
                                                                       $database_recs);
    }

    # set up the document processor
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if ($incremental) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            # skip the documents the buildproc has marked as not to be processed
            next if defined $buildproc->{'dont_process_reconstructed'}->{$doc_obj->get_OID()};

            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $infodb_type, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    ####&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my $browselist_infodb = {
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    };
    &dbutil::write_infodb_entry($infodb_type, $infodb_handle, "browselist", $browselist_infodb);

    # in debug mode the handle is STDOUT, which must stay open
    if (!$self->{'debug'}) {
        &dbutil::close_infodb_write_handle($infodb_type, $infodb_handle);
    }

    if ($infodb_type eq "gdbm-txtgz") {
        # remove the plain gdbm database file if one exists
        my $gdb_infodb_file_path = &dbutil::get_infodb_file_path("gdbm", $self->{'collection'}, $textdir);
        if (-e $gdb_infodb_file_path) {
            &FileUtils::removeFiles($gdb_infodb_file_path);
        }
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
550
# Writes the build configuration file for the collection: build.cfg in the
# build directory for Greenstone 2, buildConfig.xml for Greenstone 3.
#
# The configuration records the build date and type, the index stem, the
# document/section/byte counts reported by the buildproc, the index,
# subcollection and language maps (indexes that were not built are left out
# of the index map), the list of indexes not built, maxnumeric, the info
# database type and the earliestDatestamp needed for OAI. Subclasses can
# pre-populate $self->{'build_cfg'} and add more via build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &FileUtils::makeAllDirectories ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    if ($self->{'separate_cjk'}) {
        $build_cfg->{'separate_cjk'} = "true";
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }

    # store the number of indexes built to later determine whether search serviceracks get written out to buildConfig.xml
    $build_cfg->{'num_indexes'} = scalar (@indexmap);

    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # write out the earliestDatestamp information needed for OAI
    # (looked for in the archives directory, or in export if there is no
    # archives directory)
    my $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "archives");
    if(!-d $archivedir) {
        $archivedir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "export");
    }
    my $earliestDatestampFile = &FileUtils::filenameConcatenate($archivedir, "earliestDatestamp");
    my $earliestDatestamp = 0;
    # Three-argument open on a lexical handle: the file name is never
    # interpreted as a mode, and no global bareword handle is involved.
    if (open(my $datestamp_fh, '<', $earliestDatestampFile)) {
        {
            # slurp in file as a single line
            local $/ = undef;
            $earliestDatestamp = <$datestamp_fh>;
            #&unicode::ensure_utf8(\$earliestDatestamp); # turn any high bytes that aren't valid utf-8 into utf-8.
        }
        close($datestamp_fh);
    }
    else {
        print $outhandle "Warning: unable to read collection's earliestDatestamp from $earliestDatestampFile.\n";
        print $outhandle "Setting value to 0.\n";
    }
    $build_cfg->{'earliestdatestamp'} = $earliestDatestamp;

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg(&FileUtils::filenameConcatenate($self->{'build_dir'},"build.cfg"), $build_cfg);
    }
    if ($gs_mode eq "gs3") {

        &colcfg::write_build_cfg_xml(&FileUtils::filenameConcatenate($self->{'build_dir'}, "buildConfig.xml"), $build_cfg, $self->{'collect_cfg_preserve'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
654
# Hook called by make_auxiliary_files() just before the build configuration
# is written. Does nothing here; implement it in a subclass that wants to add
# extra entries to the $build_cfg hash (and so to build.cfg).
sub build_cfg_extra {
    my $self = shift;
    my ($build_cfg) = @_;

}
661
662
# Empty in the base class; nothing in this file calls it.
sub collect_specific {
    my $self = shift;
}
666
# Decides whether the given index should be built.
# Each entry of the collection's 'dontbuild' list is used as a pattern that
# has to match the whole index name; on the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned. Returns 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
682
# Placeholder for creating the index mapping; must be implemented in a
# subclass. The base version prints a diagnostic to STDERR and returns a
# reference to a new, empty hash.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    return {};
}
691
# Returns a two-character processed version of a field name.
# Only letdig (\w) characters are used:
#   - for a comma-separated field with two or more components, the first
#     letdig of each of the first two components
#   - otherwise the first two letdigs of the field
# A missing first character becomes 'a' and a missing second one '0'. If the
# result would be two digits it is replaced by 'a' followed by the first
# character, because the Greenstone runtime doesn't like digits-only names.
# Returns "" for an undefined field or one without any non-space character.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined($field) && $field =~ /\S/);

    my ($first, $second);
    my @components = split(/,/, $field);
    if (scalar(@components) < 2) {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    else {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    my $processed = $first . $second;

    # digits only - Greenstone runtime doesn't like this.
    return ($processed =~ /^\d\d$/) ? "a$first" : $processed;
}
726
# Moves the name referred to by $nameref on to its next version, in place:
#   - a name ending in two digits has that number incremented
#   - a name ending in one digit has the digit incremented (9 becomes 10)
#   - any other name has its last character replaced by 0
sub get_next_version {
    my ($self, $nameref) = @_;

    if (${$nameref} =~ /(\d\d)$/) {
        my $version = $1;
        $version++;
        ${$nameref} =~ s/\d\d$/$version/;
    }
    elsif (${$nameref} =~ /(\d)$/) {
        my $digit = $1;
        my $version = ($digit == 9) ? 10 : ++$digit;
        ${$nameref} =~ s/\d$/$version/;
    }
    else {
        ${$nameref} =~ s/.$/0/;
    }
}
742
743
744
# Adds the metadata sets seen by the buildproc to the $collection_infodb hash
# (a hash ref of lists). For every prefix in the buildproc's
# 'mdprefix_fields' this appends:
#   "metadataset"                     - the prefix
#   "metadatalist-<prefix>"           - each field recorded for the prefix
#   "metadatafreq-<prefix>-<field>"   - the value stored for that field
sub get_collection_meta_sets
{
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    for my $prefix (keys %$mdprefix_fields)
    {
        my $value_of = $mdprefix_fields->{$prefix};
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        for my $field (keys %$value_of)
        {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $value_of->{$field});
        }
    }
}
764
765
# Writes the "collection" entry to the info database.
# By default the entry holds the metadata sets (prefixes) used in the
# collection, as gathered by get_collection_meta_sets().
sub output_collection_meta
{
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
776
# Reads in an existing build configuration - needed, for example, when each
# stage of building is done separately, or for incremental building.
# The file (build.cfg for gs2, buildConfig.xml otherwise) is looked for in
# the build directory first and then in the collection's index directory.
# Returns undef when neither location has it, otherwise whatever
# &colcfg::read_building_cfg returns for the file.
sub read_build_cfg {
    my ($self) = @_;

    my $buildconfigfilename = ($gs_mode eq "gs2") ? "build.cfg" : "buildConfig.xml";

    my $buildconfigfile = &FileUtils::filenameConcatenate($self->{'build_dir'}, $buildconfigfilename);

    unless (-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "index", $buildconfigfilename);

        # we can't find a config file - just ignore the field list
        return undef unless (-e $buildconfigfile);
    }

    return &colcfg::read_building_cfg($buildconfigfile, $gs_mode);

}
804
# Prints statistics for the pass the buildproc has just made to the output
# handle: what was done (index creation or text compression), the total
# bytes in the collection and the bytes processed.
# When fewer than 50 bytes were processed (and text was expected), an
# incremental build merely notes that nothing was added - and only if the
# count is exactly 0 - while a full build prints a prominent warning.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    print {$outhandle} ($indexing_text
                        ? "Stats (Creating index $index)\n"
                        : "Stats (Compressing text from $index)\n");
    print {$outhandle} "Total bytes in collection: $num_bytes\n";
    print {$outhandle} "Total bytes in $index: $num_processed_bytes\n";

    # text is expected when indexing, or when compressing unless no_text is set
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {

        if (!$self->{'incremental'}) {
            print {$outhandle} "***************\n";
            if ($indexing_text) {
                print {$outhandle} "WARNING: There is very little or no text to process for $index\n";
            } elsif (!$self->{'no_text'}) {
                print {$outhandle} "WARNING: There is very little or no text to compress\n";
            }
            print {$outhandle} " Was this your intention?\n";
            print {$outhandle} "***************\n";
        }
        elsif ($num_processed_bytes == 0) {
            if ($indexing_text) {
                print {$outhandle} "No additional text was added to $index\n";
            } elsif (!$self->{'no_text'}) {
                print {$outhandle} "No additional text was compressed\n";
            }
        }

    }

}
847
848
8491;
850
Note: See TracBrowser for help on using the repository browser.