root/gsdl/trunk/perllib/basebuildproc.pm @ 17110

Revision 17110, 18.3 KB (checked in by kjdon, 11 years ago)

Changed the way CJK separation is done: it is no longer done in the plugins, but is now an index option. cnseg is called from the filter_text method, and generate_index_options sets up the field in buildproc.

  • Property svn:keywords set to Author Date Id Revision
Line 
###########################################################################
#
# basebuildproc.pm --
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# Base document processor.  Each document handed to process() is
# dispatched to the method named by the current mode: text(), which
# outputs the document for indexing and has to be implemented by a
# subclass, or infodb(), which stores the document in the database.

package basebuildproc;

# Load the bytes module when the running perl provides it; the eval
# swallows the failure when it does not.
eval {require bytes};

# strict stays on for vars and refs; 'subs' is switched off so that
# barewords are accepted as plain strings.
use strict; no strict 'subs';

use classify;
use dbutil;
use doc;
use docproc;
use util;

BEGIN {
    # basebuildproc is a docproc
    @basebuildproc::ISA = ('docproc');
}

# Constructor.
#
# Arguments (after the class name):
#   $collection, $source_dir, $build_dir
#               - stored on the object as given; $build_dir is also the
#                 initial 'assocdir'
#   $keepold    - true for an incremental build: the document, section
#                 and byte counters are then seeded from build.cfg, looked
#                 for in $build_dir first and in
#                 $ENV{GSDLCOLLECTDIR}/index second
#   $verbosity  - stored on the object as given
#   $outhandle  - where all the debugging info goes; defaults to STDERR
#
# Returns the new object, blessed into $class.
sub new
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;

    # Explicit class-method call rather than the indirect-object form
    # "new docproc ()", whose parse depends on which subs are already known.
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    # The handle is kept by name, exactly as the bareword STDERR was.
    $outhandle = 'STDERR' unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'}  = $build_dir;
    $self->{'keepold'}    = $keepold;
    $self->{'verbosity'}  = $verbosity;
    $self->{'outhandle'}  = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    # For incremental building the counters are seeded from the values
    # stored in build.cfg (if present): first the one in the build
    # directory, then the one in the collection's index directory.
    my $buildconfigfile = undef;
    my $found_num_data = 0;
    if ($keepold) {
        $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
        if (!-e $buildconfigfile) {
            $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                                   "index", "build.cfg");
        }
        $found_num_data = 1 if (-e $buildconfigfile);
    }

    # Counters start from zero unless build.cfg supplies a value; a
    # build.cfg that lacks one of the fields no longer leaves that
    # counter undefined.
    my %starting = ('numdocs' => 0, 'numsections' => 0, 'numbytes' => 0);
    if ($found_num_data) {
        # NOTE(review): colcfg is not among this file's "use" lines;
        # presumably the calling script has loaded it - confirm.
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        foreach my $key (keys %starting) {
            $starting{$key} = $buildcfg->{$key} if defined $buildcfg->{$key};
        }
    }
    $self->{'starting_num_docs'}     = $starting{'numdocs'};
    $self->{'starting_num_sections'} = $starting{'numsections'};
    $self->{'starting_num_bytes'}    = $starting{'numbytes'};

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'}      = $self->{'starting_num_docs'};
    $self->{'num_sections'}  = $self->{'starting_num_sections'};
    $self->{'num_bytes'}     = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}

# Put the running counters back to the values the object started with
# and clear the processed-bytes counter.
sub reset {
    my ($self) = @_;

    @{$self}{'num_docs', 'num_sections', 'num_bytes'} =
        @{$self}{'starting_num_docs', 'starting_num_sections', 'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
}

# Set every counter, including the processed-bytes counter, to zero
# regardless of the starting values.
sub zero_reset {
    my ($self) = @_;

    @{$self}{'num_docs', 'num_sections', 'num_bytes'} = (0, 0, 0);

    $self->{'num_processed_bytes'} = 0;
}

# Says whether this processor can build incrementally.
# The base class answers 'no' (0): non-incremental is the safe
# assumption, and inherited classes that can do it override this.
sub is_incremental_capable
{
    my $capable = 0;

    return $capable;
}

# Returns the current document counter.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}

# Returns the current section counter.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}

# Returns num_bytes, the actual number of bytes in the collection.
# Normally this equals what is processed during text compression.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}

# Returns num_processed_bytes, the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;

    return $self->{'num_processed_bytes'};
}

# Sets the handle that the processor's output is sent to
# (read back by infodb() when it writes database entries).
sub set_output_handle {
    my ($self, $handle) = @_;

    $self->{'output_handle'} = $handle;
}


# Sets the mode; process() calls the method of this name.
sub set_mode {
    my ($self, $mode) = @_;

    $self->{'mode'} = $mode;
}

# Returns the current mode.
sub get_mode {
    my ($self) = @_;

    return $self->{'mode'};
}

# Sets the directory under which assoc_files() places the files
# associated with a document.
sub set_assocdir {
    my ($self, $assocdir) = @_;

    $self->{'assocdir'} = $assocdir;
}

# Sets the hash of metadata field names that infodb() must leave out
# of the database record (a field is skipped when its entry is defined).
sub set_dontdb {
    my ($self, $dontdb) = @_;

    $self->{'dontdb'} = $dontdb;
}

# Sets the database type that is handed to dbutil when entries are written.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;

    $self->{'infodbtype'} = $infodbtype;
}

# Sets the current index and, when one is supplied, the array of
# subcollection expressions consulted by is_subcollection_doc().
# An undefined $indexexparr leaves the stored array as it was.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    if (defined $indexexparr) {
        $self->{'indexexparr'} = $indexexparr;
    }
}

# Sets the name of the metadata field holding a document's language and
# the array of language expressions; both are read by
# is_subcollection_doc().
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'}   = $langarr;
}

# Returns the current index.
sub get_index {
    my ($self) = @_;

    return $self->{'index'};
}

# Sets the classifiers that infodb() passes to classify::classify_doc.
sub set_classifiers {
    my ($self, $classifiers) = @_;

    $self->{'classifiers'} = $classifiers;
}

# Sets the indexing_text flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;

    $self->{'indexing_text'} = $indexing_text;
}

# Returns the indexing_text flag.
sub get_indexing_text {
    my ($self) = @_;

    return $self->{'indexing_text'};
}

# Sets the store_text flag.
sub set_store_text {
    my ($self, $store_text) = @_;

    $self->{'store_text'} = $store_text;
}

# Sets the store_metadata_coverage setting; infodb() collects metadata
# statistics only when it matches "true" (in any letter case).
# A false or undefined argument is stored as the empty string.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'store_metadata_coverage'} =
        $store_metadata_coverage ? $store_metadata_coverage : "";
}

# Returns the OIDs collected by infodb() as a list (not a reference).
sub get_doc_list {
    my ($self) = @_;

    return @{$self->{'doclist'}};
}

# Sets the database level.  The standard level is "section"; with
# "document", infodb() writes one record per document instead of one
# per section.
sub set_db_level {
    my ($self, $db_level) = @_;

    $self->{'db_level'} = $db_level;
}

# Stores the sections_index_document_metadata setting on the object.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_index_document_metadata'} = $index_type;
}

# Switches CJK separation on or off; filter_text() segments the text
# only when this is true.
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;

    $self->{'separate_cjk'} = $sep_cjk;
}

# Hands the arguments on to the method named by the current mode and
# returns whatever that method returns.
sub process {
    my $self = shift (@_);

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}

# Post-processes text before it is used.  $field is accepted but not
# looked at; the only filtering done is CJK separation, which passes
# the text through cnseg::segment when separate_cjk is set.
# Returns the (possibly segmented) text.
sub filter_text {
    my ($self, $field, $text) = @_;

    return $text unless ($self->{'separate_cjk'});

    my $segmented = &cnseg::segment($text);
    return $segmented;
}


# Keeps statistics on the metadata sets in use and on how often each
# field of a set occurs, both for the current document
# ('doc_mdprefix_fields') and across the whole collection
# ('mdprefix_fields').
#
# A field of the form "prefix.name" counts under its prefix (everything
# before the last dot); a field without a dot that starts with an upper
# case letter counts under the implicit 'ex' set; anything else is
# ignored.  The return value is not meaningful.
sub infodb_metadata_stats
{
    my ($self, $field) = @_;

    my ($set, $name);
    if ($field =~ m/^(.+)\.(.*)$/) {
        ($set, $name) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($set, $name) = ('ex', $field);
    }
    else {
        return;
    }

    $self->{'doc_mdprefix_fields'}->{$set}->{$name}++;
    $self->{'mdprefix_fields'}->{$set}->{$name}++;
}


# Writes the database entries for one document.
#
#   $doc_obj  - the document; only "indexed_doc" and "info_doc"
#               (database only) documents produce any output
#   $filename - defined when the document was derived directly from a
#               file, undefined when it was reconstructed from the
#               database (metadata and structure, but no text)
#
# The document is added to the doclist, classified and counted; then one
# record per section (or a single record when db_level is "document") is
# written through dbutil::write_infodb_entry, followed - except for
# lucenebuildproc - by an entry mapping the document number back to the
# OID.
sub infodb {
    my ($self, $doc_obj, $filename) = @_;

    my $doctype = $doc_obj->get_doc_type();
    return unless ($doctype eq "indexed_doc" || $doctype eq "info_doc");

    my $from_file = defined $filename;

    my $archivedir = "";
    if ($from_file) {
        # archivedir is the directory part of $filename, written with
        # forward slashes and without leading or trailing slashes
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;
        $dir =~ s/^\/+//;
        $dir =~ s/\/+$//;

        $archivedir = $dir;

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else {
        # archivedir was preserved as metadata of the top section
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section, "archivedir");
    }

    my $countable = ($doctype ne "classification");

    # add this document to the browse structure
    push(@{$self->{'doclist'}}, $doc_obj->get_OID()) if $countable;

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # this is another document
    $self->{'num_docs'} += 1 if $countable;

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $infodb_handle = $self->{'output_handle'};
    my $is_first_section = 1;

    # per-document metadata statistics start afresh
    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section) {
        my $at_top = ($section eq "");
        my $section_OID = $at_top ? $doc_OID : $doc_OID . "." . $section;
        my %record = ();

        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1 if $countable;

        # mark the section as a document, unless a plugin has already
        # given it a doctype of its own
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        unless (defined $dtype && $dtype =~ /\w/) {
            $record{"doctype"} = [ "doc" ];
        }

        # say whether this node contains text; a document reconstructed
        # from the database already carries hastxt as metadata
        if ($from_file) {
            my $has_text = ($doc_obj->get_text_length($section) > 0) ? "1" : "0";
            $record{"hastxt"} = [ $has_text ];
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            next if ($field eq "Identifier" || $field =~ /^gsdl/);
            next unless (defined $value && $value ne "");

            # escape problematic stuff
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;

            # special case for URL metadata: an extra entry keyed by the
            # URL points back at this section
            if ($field =~ /^URL$/i) {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
            }

            next if (defined $self->{'dontdb'}->{$field});

            push(@{$record{$field}}, $value);

            if ($at_top && $self->{'store_metadata_coverage'} =~ /^true$/i) {
                $self->infodb_metadata_stats($field);
            }
        }

        # the top section also lists the metadata sets seen in this
        # document, their fields and how often each field occurred
        if ($at_top) {
            my $seen = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$seen) {
                push(@{$record{"metadataset"}}, $prefix);

                my $freq_of = $seen->{$prefix};
                foreach my $field (keys %$freq_of) {
                    push(@{$record{"metadatalist-$prefix"}}, $field);
                    push(@{$record{"metadatafreq-$prefix-$field"}}, $freq_of->{$field});
                }
            }
        }

        # output archivedir at the top level; a document reconstructed
        # from the database already carries it as metadata
        if ($from_file && $section eq $doc_obj->get_top_section()) {
            $record{"archivedir"} = [ $archivedir ];
        }

        # output document display type
        $record{"thistype"} = [ $thistype ] if $is_first_section;

        my $by_document = ($self->{'db_level'} eq "document");
        if ($by_document) {
            # doc num is num_docs not num_sections
            $record{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children: each is written as ". followed by
            # the number after the child's last dot, or by the whole child
            # name when it does not end in ".digits"
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $record{"childtype"} = [ $childtype ];
                my @links = map { ($_ =~ /^.*?\.(\d+)$/) ? "\".$1" : "\".$_" } @$children;
                $record{"contains"} = [ join(";", @links) ];
            }
            # output the matching doc number
            $record{"docnum"} = [ $self->{'num_sections'} ];
        }

        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%record);

        # output a database entry for the document number, except for
        # Lucene (which no longer needs this information)
        unless (ref($self) eq "lucenebuildproc") {
            my ($number, $target) = $by_document
                ? ($self->{'num_docs'}, $doc_OID)
                : ($self->{'num_sections'}, $section_OID);
            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $number, { 'section' => [ $target ] });
        }

        $is_first_section = 0;
        $section = $doc_obj->get_next_section($section);
        last if $by_document; # if no sections wanted, only add the docs
    }
}


# Placeholder for the indexing step: the base class cannot output a
# document for indexing, so it reports that on the debugging handle and
# dies.  Sub classes must provide their own text().
sub text {
    my ($self, $doc_obj) = @_;

    my $log = $self->{'outhandle'};
    print $log "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}

# Should the document be indexed - according to the subcollection and
# language specification?
#
# Each entry of 'indexexparr' has the form "field/expression/options":
#   field      - a metadata field of the top section, or "filename" for
#                the document's source filename; a leading "!" negates
#                the test
#   expression - a regular expression applied to each value of the field
#   options    - "i" makes the match case-insensitive (optional)
# The document passes as soon as one entry accepts one of its values; it
# passes unconditionally when there are no entries.
#
# When 'lang_meta' is set, a document that passed must also have that
# metadata element and satisfy at least one entry of 'langarr' (again a
# regular expression, negated by a leading "!").
#
# Returns 1 when the document belongs in the index, 0 otherwise.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        # $field is a copy produced by split, so stripping the "!" here
        # does not touch the stored expression
        my $negate = ($field =~ s/^!//) ? 1 : 0;

        my @metadata_values;
        if ($field =~ /^filename$/i) {
            push(@metadata_values, $doc_obj->get_source_filename());
        }
        else {
            @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
        }
        next unless @metadata_values;

        # options are absent when the expression has only two parts
        my $ignore_case = (defined $options && $options =~ /^i$/i);
        my $pattern = $ignore_case ? qr/$exp/i : qr/$exp/;

        foreach my $metadata_value (@metadata_values) {
            my $matched = ($metadata_value =~ $pattern) ? 1 : 0;
            if ($matched != $negate) {
                $indexed_doc = 1;
                last;
            }
        }

        last if ($indexed_doc == 1);
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable is an alias for the
                # array element, so stripping the "!" from it directly
                # would permanently turn "!en" into "en" and reverse the
                # filter for every later document.
                my $lang = $lang_spec;
                my $negate = ($lang =~ s/^!//) ? 1 : 0;
                my $matched = ($field =~ qr/$lang/) ? 1 : 0;
                if ($matched != $negate) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }

    return $indexed_doc;
}

# Works out how a document and its children are displayed.
#
# Returns the list ($thistype, $childtype):
#   - ("VList", "VList") for a hierarchical document: gsdlthistype
#     metadata is "Hierarchy", or some section lies deeper than the
#     second level, or a section below the top has a Title that is not
#     a number;
#   - otherwise the children are "Paged" (also when gsdlthistype is set
#     to "Paged"), and the document itself is "Paged" when its top
#     section has text and "Invisible" when it has none.
sub get_document_type {
    my ($self, $doc_obj) = @_;

    my ($thistype, $childtype) = ("VList", "VList");

    my $section = $doc_obj->get_top_section ();

    # an explicit gsdlthistype of Paged or Hierarchy settles the matter
    # without looking at the sections
    my $declared = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    my $declared_paged = 0;
    if (defined $declared) {
        return ($thistype, $childtype) if ($declared eq "Hierarchy");
        $declared_paged = ($declared eq "Paged");
    }

    unless ($declared_paged) {
        my $at_top = 1;
        while (defined $section) {
            # a dot in the section name means a third level
            my @levels = split /\./, $section;
            return ($thistype, $childtype) if (scalar(@levels) > 1);

            # every section below the top needs a purely numeric Title
            unless ($at_top) {
                my $title = $doc_obj->get_metadata_element ($section, "Title");
                return ($thistype, $childtype)
                    unless (defined $title && $title =~ /^\d+$/);
            }

            $at_top = 0;
            $section = $doc_obj->get_next_section($section);
        }
    }

    $thistype = $doc_obj->get_text_length ($doc_obj->get_top_section())
        ? "Paged"
        : "Invisible";
    $childtype = "Paged";
    return ($thistype, $childtype);
}

# Hard-links every file associated with the document into the
# association directory.
#
#   $doc_obj    - the document whose associated files are linked
#   $archivedir - the document's directory relative to 'assocdir'
#
# Each associated file is a pair: element 0 is the existing file and
# element 1 the name it gets.  A name starting with a slash (or
# backslash) is placed relative to 'assocdir' itself; any other name is
# placed inside $archivedir below it.
#
# Declared without a prototype: the sub takes arguments, and a
# prototype has no effect on method calls anyway.
sub assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($existing, $wanted) = ($assoc_file->[0], $assoc_file->[1]);

        my $afile;
        if ($wanted =~ m@^[/\\]@) {
            $afile = &util::filename_cat($self->{'assocdir'}, $wanted);
        }
        else {
            $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $wanted);
        }
        &util::hard_link ($existing, $afile);
    }
}

Note: See TracBrowser for help on using the browser.