root/gsdl/trunk/perllib/basebuilder.pm @ 17110

Revision 17110, 21.1 KB (checked in by kjdon, 11 years ago)

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:keywords set to Author Date Id Revision
RevLine 
[14930]1###########################################################################
2#
3# basebuilder.pm -- base class for collection builders
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package basebuilder;
27
28use strict;
29no strict 'refs'; # allow filehandles to be variables and viceversa
30
31use classify;
32use cfgread;
33use colcfg;
[15709]34use dbutil;
[14930]35use plugin;
36use util;
37
[15709]38
BEGIN {
    # autoflush() is an IO::Handle method.  Before Perl 5.14 a method call
    # on a bareword filehandle only works once IO::Handle has been loaded,
    # so load it here rather than relying on another module doing so.
    require IO::Handle;

    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    # switch autoflush back off when the interpreter shuts down
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"; assigned in new() from
# &colcfg::get_collect_cfg_name and read again in make_auxiliary_files()
my $gs_mode = "gs2";
55
# Constructor for a collection builder.
#
# Arguments (positional): $class, $collection, $source_dir, $build_dir,
# $verbosity, $maxdocs, $debug, $keepold, $incremental,
# $remove_empty_classifications, $outhandle, $no_text, $failhandle, $gli,
# $disable_OAI.
#
# $outhandle and $failhandle default to STDERR; $no_text, gli and
# disable_OAI default to 0.  The collection configuration is read, then the
# plugins and classifiers it lists are loaded.  Dies (after printing a
# message to $outhandle) if no plugins were loaded.
#
# Returns the blessed builder object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    # defaults for the optional handles / flags
    $outhandle  = *STDERR unless defined $outhandle;
    $no_text    = 0       unless defined $no_text;
    $failhandle = *STDERR unless defined $failhandle;

    # create a builder object
    my $self = bless {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # disable_OAI applies to greenstone 3 only; it is passed on to
    # &colcfg::write_build_cfg_xml when buildConfig.xml is written
    $self->{'disable_OAI'} = 0 unless defined $self->{'disable_OAI'};

    # Read in the collection configuration file.  This also records which
    # mode ("gs2" or "gs3") we are running in.
    my ($colcfgname);
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);
    if ($gs_mode eq "gs2") {
        $self->{'collect_cfg'} = &colcfg::read_collect_cfg($colcfgname);
    }
    elsif ($gs_mode eq "gs3") {
        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml($colcfgname);

        # A second, untouched copy of the configuration is kept for gs3.
        # It is handed to &colcfg::write_build_cfg_xml in
        # make_auxiliary_files, because $self->{'collect_cfg'}->{'classify'}
        # gets modified while &classify::load_classifiers runs.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg_xml($colcfgname);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined, in which case the dbutil default is used)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || &dbutil::get_default_infodb_type();

    my $cfg = $self->{'collect_cfg'};

    # get the list of plugins for this collection
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    my $cjk_setting = $cfg->{'separate_cjk'};
    if (defined $cjk_setting && $cjk_setting =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);

    if (scalar(@{ $self->{'pluginfo'} }) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers($classifiers, $build_dir, $outhandle);

    # load up any dontdb fields (configured under 'dontgdbm')
    $self->{'dontdb'} = {};
    if (defined $cfg->{'dontgdbm'}) {
        foreach my $fieldname (@{ $cfg->{'dontgdbm'} }) {
            $self->{'dontdb'}->{$fieldname} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;
    return $self;
}
144
# Second-stage initialisation.  This work lives here rather than in new()
# so that subclass methods (generate_index_list, default_buildproc,
# generate_index_options) can be used.
#
# Steps:
#   1. expand the configured indexes by subcollection and by language,
#      and remove duplicate index specifications;
#   2. locate, load and instantiate the buildproc class, which is stored
#      in $self->{'buildproc'};
#   3. unless debugging or keeping the old build, wipe and recreate the
#      build directory and its text subdirectory.
#
# Dies if the buildproc module cannot be loaded or its constructor dies.
sub init {
    my $self = shift(@_);

    $self->generate_index_list();

    # all index bookkeeping below works on the collection configuration
    my $cfg = ($self->{'collect_cfg'} ||= {});

    # sort out subcollection indexes
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{ $cfg->{'indexsubcollections'} }) {
            foreach my $index (@$indexes) {
                push(@{ $cfg->{'indexes'} }, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{ $cfg->{'languages'} }) {
            foreach my $index (@$indexes) {
                if (defined($cfg->{'indexsubcollections'})) {
                    push(@{ $cfg->{'indexes'} }, "$index:$language");
                }
                else {
                    # add in an empty subcollection field
                    push(@{ $cfg->{'indexes'} }, "$index\:\:$language");
                }
            }
        }
    }

    if (defined($cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins, order is preserved)
        my %seen = ();
        $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{ $cfg->{'indexes'} } ];
    }
    else {
        $cfg->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the default buildproc of the (sub)class
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    else {
        $buildprocdir  = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the class name.  This
    # avoids building Perl source text for a string eval and does not
    # depend on how the parser resolves indirect-object "new CLASS(...)".
    $self->{'buildproc'} = $buildproctype->new(
        $self->{'collection'},
        $self->{'source_dir'},
        $self->{'build_dir'},
        $self->{'keepold'},
        $self->{'verbosity'},
        $self->{'outhandle'});

    $self->generate_index_options();

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }
}
231
# Tear-down counterpart of init(): hands the loaded plugins and the
# buildproc to &plugin::deinit and returns whatever that returns.
sub deinit {
    my ($self) = @_;

    return &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
237
# Works out the index options for this build.  Currently the only option
# looked at is separate_cjk: it is switched on when any entry of the
# configuration's 'indexoptions' list contains the text "separate_cjk".
# The flag is passed to the buildproc and remembered on the builder.
sub generate_index_options {
    my ($self) = @_;

    my $index_options = $self->{'collect_cfg'}->{'indexoptions'};

    my $separate_cjk = 0;
    if (defined($index_options)) {
        $separate_cjk = 1 if grep { /separate_cjk/ } @{$index_options};
    }

    # set this for building
    $self->{'buildproc'}->set_separate_cjk($separate_cjk);
    # record it for build.cfg
    $self->{'separate_cjk'} = $separate_cjk;
}
255 
# Forwards the given setting to the buildproc's method of the same name.
sub set_sections_index_document_metadata {
    my ($self, $index) = @_;

    return $self->{'buildproc'}->set_sections_index_document_metadata($index);
}
262
# Stores $maxnumeric on the builder; it is later copied into the build
# configuration by make_auxiliary_files().
sub set_maxnumeric {
    my ($self, $maxnumeric) = @_;

    return $self->{'maxnumeric'} = $maxnumeric;
}
# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
    return $self->{'buildproc'}->set_strip_html($strip);
}
276
# Placeholder: subclasses are expected to override this.  The base version
# only reports on STDERR that it was called and returns nothing.
#
#   $textindex - accepted for interface compatibility, unused here
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # newline-terminated like the other "implement in subclass" messages,
    # so that later STDERR output does not run on from this line
    print STDERR "compress_text() should be implemented in subclass!!\n";
    return;
}
284
285
# Builds the collection's indexes.
#
#   $indexname - optional; when it contains a word character only that
#                index is built, otherwise every index listed in the
#                collection configuration is considered.
#
# The index-to-directory mapping from create_index_mapping() is stored in
# $self->{'index_mapping'}.  Each index accepted by want_built() is passed
# to build_index(); the rest are reported as ignored.  build_indexes_extra()
# is called once at the end.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory $self->{'index_mapping'}->{$index}\n"
            if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }

    $self->build_indexes_extra();
}
317
# Hook called at the end of build_indexes().  Does nothing in this class.
sub build_indexes_extra {
    my $self = shift @_;
}
322
# Placeholder for building a single index; this version only prints a
# reminder on STDERR and returns nothing.
#
#   $index - the index specification (unused here)
sub build_index {
    my ($self, $index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}
330
331
332
# Creates the collection's info database and processes associated files.
#
# Outline:
#   - make sure <build_dir>/text and <build_dir>/assoc exist;
#   - when keepold is set, reconstruct the metadata of the documents that
#     are already in the database;
#   - open the database write handle (STDOUT is used when debugging);
#   - put the buildproc into 'infodb' mode and run all plugins over the
#     source directory;
#   - write the collection metadata, the classification information and
#     the "browselist" entry, then close the handle.
#
# Dies if the database write handle cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($textdir);
    &util::mk_all_dir($assocdir);

    # Get info database file path
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $textdir);

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    # when keeping the old build, reconstruct doc_obj metadata from the
    # database for all docs
    my $reconstructed_docs = $self->{'keepold'}
        ? &classify::reconstruct_doc_objs_metadata($self->{'infodbtype'}, $infodb_file_path)
        : undef;

    # pick the handle the database entries are written to
    my ($infodb_handle);
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        $infodb_handle = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $infodb_file_path);
        unless (defined($infodb_handle)) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_infodbtype($self->{'infodbtype'});
    $buildproc->set_output_handle($infodb_handle);
    $buildproc->set_mode('infodb');
    $buildproc->set_assocdir($assocdir);
    $buildproc->set_dontdb($self->{'dontdb'});
    $buildproc->set_classifiers($self->{'classifiers'});
    $buildproc->set_indexing_text(0);
    $buildproc->set_store_text(1);
    $buildproc->set_store_metadata_coverage($self->{'collect_cfg'}->{'store_metadata_coverage'});

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj (@$reconstructed_docs) {
            print $outhandle "  Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    # run the plugins over the source directory
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info($self->{'classifiers'}, $self->{'infodbtype'}, $infodb_handle,
                                    $self->{'remove_empty_classifications'},
                                    $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $self->{'buildproc'}->get_doc_list();
    my $browselist_infodb = {
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    };
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "browselist", $browselist_infodb);

    # STDOUT (debug mode) is left open
    &dbutil::close_infodb_write_handle($self->{'infodbtype'}, $infodb_handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
428
# Collects the build information (build date and type, index stem,
# document/section/byte counts, index, subcollection and language maps,
# indexes that were not built, maxnumeric, infodbtype) and writes it out:
# build.cfg in gs2 mode, buildConfig.xml in gs3 mode.
#
# A subclass may pre-populate $self->{'build_cfg'} and may add further
# entries through build_cfg_extra().
sub make_auxiliary_files {
    my ($self) = @_;

    # subclasses may have already defined stuff in here
    my $build_cfg = defined $self->{'build_cfg'} ? $self->{'build_cfg'} : {};

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir($self->{'build_dir'});

    # store the build date and general build settings
    $build_cfg->{'builddate'}   = time;
    $build_cfg->{'buildtype'}   = $self->{'buildtype'};
    $build_cfg->{'indexstem'}   = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};
    $build_cfg->{'separate_cjk'} = "true" if $self->{'separate_cjk'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'}     = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'}    = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined($self->{'notbuilt'}->{$index});
        push(@indexmap, $index . '->' . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar(@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push(@subcollectionmap,
             $subcollection . '->' . $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar(@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push(@languagemap,
             $language . '->' . $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar(@languagemap);

    # indexes that want_built() rejected
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar(@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $build_cfg->{'infodbtype'} = $self->{'infodbtype'};

    # give subclasses the chance to add their own entries
    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    elsif ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "</Stage>\n" if $self->{'gli'};
}
506
# Empty hook; does nothing in this class.
sub collect_specific {
    my $self = shift @_;
}
510
# Decides whether $index should be built.
#
# Each entry of the configuration's 'dontbuild' list is used as a regular
# expression that must match the whole index name.  On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned; otherwise
# (or when there is no 'dontbuild' list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined($dontbuild);

    foreach my $pattern (@{$dontbuild}) {
        next unless $index =~ /^$pattern$/;

        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
526
# Placeholder: this version prints a reminder on STDERR and returns a
# reference to a new, empty hash.
#
#   $indexes - array reference of index specifications (unused here)
sub create_index_mapping {
    my ($self, $indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";

    my %empty_mapping = ();
    return \%empty_mapping;
}
535
# Returns a two-character abbreviation of a field specification.
#
# If the field has two or more comma-separated components, the result is
# the first letdig (\w) character of each of the first two components.
# Otherwise it is the first two letdig characters of the field.
# A missing first character is replaced by 'a', a missing second one by
# '0'.  An undefined or all-whitespace field gives the empty string.
#
# Examples: "text" -> "te", "Title,Subject" -> "TS", "x" -> "x0".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # not named $a/$b: lexicals with those names hide the sort() globals
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^\W*(\w)/;
        ($second) = $components[1] =~ /^\W*(\w)/;
    } else {
        # pick the first two letdig chars; the second is optional so that
        # a field with a single letdig still contributes its first char
        ($first, $second) = $field =~ /^\W*(\w)(?:\W*?(\w))?/;
    }

    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";
}
565
# Advances the version suffix of the name that $nameref points to,
# modifying the string in place:
#   - a name ending in two digits has those two digits incremented;
#   - a name ending in one digit has that digit incremented (9 becomes 10);
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # string increment keeps a leading zero, e.g. "09" becomes "10"
        my $next = $1;
        $next++;
        $$nameref =~ s/\d\d$/$next/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $next = $1 + 1;
        $$nameref =~ s/\d$/$next/;
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}
581
# Hook called by make_auxiliary_files() just before the build
# configuration is written.  Override it in a subclass to add extra
# entries to $build_cfg; this version adds nothing.
sub build_cfg_extra {
    my $self = shift @_;
    my ($build_cfg) = @_;
}
588
[14934]589
# Fills the hash referenced by $collection_infodb from the buildproc's
# 'mdprefix_fields' table (prefix -> field -> value):
#   "metadataset"                    gets every prefix,
#   "metadatalist-<prefix>"          gets every field of that prefix,
#   "metadatafreq-<prefix>-<field>"  gets the value stored for the field.
# Every value is appended to an array under its key.
sub get_collection_meta_sets {
    my ($self, $collection_infodb) = @_;

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};

    foreach my $prefix (keys %{$mdprefix_fields}) {
        push(@{ $collection_infodb->{"metadataset"} }, $prefix);

        foreach my $field (keys %{ $mdprefix_fields->{$prefix} }) {
            push(@{ $collection_infodb->{"metadatalist-$prefix"} }, $field);

            my $stored = $mdprefix_fields->{$prefix}->{$field};
            push(@{ $collection_infodb->{"metadatafreq-$prefix-$field"} }, $stored);
        }
    }
}
[14934]609
610
# Writes the "collection" entry to the info database.  By default the
# entry holds the metadata sets (prefixes) used in the collection, as
# gathered by get_collection_meta_sets().
#
#   $infodb_handle - the database write handle to output to
sub output_collection_meta {
    my ($self, $infodb_handle) = @_;

    my $collection_infodb = {};
    $self->get_collection_meta_sets($collection_infodb);

    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", $collection_infodb);
}
[14934]621
622
# Prints a short statistics report for the current pass to the builder's
# output handle, using the counters kept by the buildproc.
#
# When fewer than 50 bytes were processed (and text was wanted) an extra
# note is printed: for an incremental build (keepold) with 0 bytes, a
# "No additional text ..." line; for a normal build, a warning banner.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text   = $buildproc->get_indexing_text();
    my $index           = $buildproc->get_index();
    my $total_bytes     = $buildproc->get_num_bytes();
    my $processed_bytes = $buildproc->get_num_processed_bytes();

    if ($indexing_text) {
        print {$outhandle} "Stats (Creating index $index)\n";
    }
    else {
        print {$outhandle} "Stats (Compressing text from $index)\n";
    }
    print {$outhandle} "Total bytes in collection: $total_bytes\n";
    print {$outhandle} "Total bytes in $index: $processed_bytes\n";

    if ($processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if ($self->{'keepold'}) {
            # incremental build: nothing new is only worth a note
            if ($processed_bytes == 0) {
                if ($indexing_text) {
                    print {$outhandle} "No additional text was added to $index\n";
                }
                elsif (!$self->{'no_text'}) {
                    print {$outhandle} "No additional text was compressed\n";
                }
            }
        }
        else {
            # full build with (almost) no text: warn loudly
            print {$outhandle} "***************\n";
            if ($indexing_text) {
                print {$outhandle} "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print {$outhandle} "WARNING: There is very little or no text to compress\n";
            }
            print {$outhandle} "         Was this your intention?\n";
            print {$outhandle} "***************\n";
        }

    }

}
665
666 
6671;
668
Note: See TracBrowser for help on using the browser.