source: gsdl/trunk/perllib/basebuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago

Changed the way CJK separation is done: it is no longer done in plugins, but is now an index option. cnseg is called from the filter_text method, and generate_index_options sets up the field in buildproc.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
RevLine 
[9919]1###########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
[15688]27# implemented by subclass) and storing in the database
[9919]28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
[15699]34use dbutil;
[9919]35use doc;
36use docproc;
[15696]37use strict; no strict 'subs';
[9919]38use util;
39
# Make basebuildproc a subclass of docproc.  Done inside BEGIN so the
# inheritance is set up at compile time.
BEGIN {
    @basebuildproc::ISA = qw(docproc);
}
43
# Construct a build processor on top of a docproc instance.
#
# Arguments (after the class name): collection, source directory, build
# directory, keepold flag, verbosity, and an optional handle for debugging
# output (STDERR is used when none is given).
#
# When $keepold is true (incremental building) the document, section and
# byte counters are seeded from an existing build.cfg, which is looked for
# first in the build directory and then in the collection's index
# directory.  Otherwise, or when no build.cfg is found, they start at zero.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    $outhandle = STDERR unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'}  = $build_dir;
    $self->{'keepold'}    = $keepold;
    $self->{'verbosity'}  = $verbosity;
    $self->{'outhandle'}  = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'}        = "text";
    $self->{'assocdir'}    = $build_dir;
    $self->{'dontdb'}      = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'}       = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    my $found_num_data  = 0;
    my $buildconfigfile = undef;

    if ($keepold) {
        # For incremental building need to seed num_docs etc from values
        # stored in build.cfg (if present): try the build dir, then the
        # index dir.
        my @candidates = (
            [ $build_dir, "build.cfg" ],
            [ $ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg" ],
        );
        foreach my $path_parts (@candidates) {
            $buildconfigfile = &util::filename_cat(@$path_parts);
            if (-e $buildconfigfile) {
                $found_num_data = 1;
                last;
            }
        }
    }

    my ($num_docs, $num_sections, $num_bytes) = (0, 0, 0);
    if ($found_num_data) {
        # colcfg is not loaded by this file's own use lines, so load it on
        # demand; require does nothing if it is already loaded.
        require colcfg;
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

        # Keep the zero default for any counter build.cfg does not supply.
        $num_docs     = $buildcfg->{'numdocs'}     if defined $buildcfg->{'numdocs'};
        $num_sections = $buildcfg->{'numsections'} if defined $buildcfg->{'numsections'};
        $num_bytes    = $buildcfg->{'numbytes'}    if defined $buildcfg->{'numbytes'};
    }
    $self->{'starting_num_docs'}     = $num_docs;
    $self->{'starting_num_sections'} = $num_sections;
    $self->{'starting_num_bytes'}    = $num_bytes;

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'}      = $self->{'starting_num_docs'};
    $self->{'num_sections'}  = $self->{'starting_num_sections'};
    $self->{'num_bytes'}     = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
130
# Put the document, section and byte counters back to their starting
# values and clear the processed-bytes counter.
sub reset {
    my ($self) = @_;

    foreach my $counter (qw(num_docs num_sections num_bytes)) {
        $self->{$counter} = $self->{"starting_$counter"};
    }

    $self->{'num_processed_bytes'} = 0;
}
140
# Set all counters to zero, ignoring the starting values.
sub zero_reset {
    my ($self) = @_;

    @{$self}{qw(num_docs num_sections num_bytes)} = (0, 0, 0);

    $self->{'num_processed_bytes'} = 0;
}
150
# Whether this build processor supports incremental building.
# The base class answers 'no' (0): it is safer to assume non-incremental
# and let inherited classes that are capable override this.
sub is_incremental_capable
{
    return 0;
}
159
# Return the current document count.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
165
# Return the current section count.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
171
# num_bytes is the actual number of bytes in the collection;
# this is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
179
# num_processed_bytes is the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;

    return $self->{'num_processed_bytes'};
}
187
# Set the handle stored as 'output_handle' (read by infodb as the
# destination for database entries).
sub set_output_handle {
    my ($self, $handle) = @_;

    $self->{'output_handle'} = $handle;
}
194
195
# Set the processing mode; process() calls the method of this name.
sub set_mode {
    my ($self, $mode) = @_;

    $self->{'mode'} = $mode;
}
202
# Return the current processing mode.
sub get_mode {
    my ($self) = @_;

    return $self->{'mode'};
}
208
# Set the directory under which assoc_files places associated files.
sub set_assocdir {
    my ($self, $assocdir) = @_;

    $self->{'assocdir'} = $assocdir;
}
215
# Set the hash of metadata field names that infodb must not write to the
# database (any field with a defined entry is skipped).
sub set_dontdb {
    my ($self, $dontdb) = @_;

    $self->{'dontdb'} = $dontdb;
}
222
# Set the database type that infodb passes to dbutil::write_infodb_entry.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;

    $self->{'infodbtype'} = $infodbtype;
}
229
# Set the current index specification.  The array of subcollection
# expressions is only replaced when one is supplied.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
}
237
# Set the language metadata field name and the array of language
# expressions checked by is_subcollection_doc.
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'}   = $langarr;
}
244
# Return the current index specification.
sub get_index {
    my ($self) = @_;

    return $self->{'index'};
}
250
# Set the classifiers that infodb passes to classify::classify_doc.
sub set_classifiers {
    my ($self, $classifiers) = @_;

    $self->{'classifiers'} = $classifiers;
}
257
# Set the 'indexing_text' flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;

    $self->{'indexing_text'} = $indexing_text;
}
264
# Return the 'indexing_text' flag.
sub get_indexing_text {
    my ($self) = @_;

    return $self->{'indexing_text'};
}
270
# Set the 'store_text' flag.
sub set_store_text {
    my ($self, $store_text) = @_;

    $self->{'store_text'} = $store_text;
}
[16222]277
# Set the 'store_metadata_coverage' option; any false value (including
# undef) is stored as the empty string.  infodb collects metadata
# statistics when the option matches "true" case-insensitively.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'store_metadata_coverage'} =
        $store_metadata_coverage ? $store_metadata_coverage : "";
}
284
# Return the list (not a reference) of document OIDs that infodb has
# collected for the browse structure.
sub get_doc_list {
    my ($self) = @_;

    return @{ $self->{'doclist'} };
}
290
# The standard database level is "section", but it can be changed to
# "document", in which case infodb writes one entry per document only.
sub set_db_level {
    my ($self, $db_level) = @_;

    $self->{'db_level'} = $db_level;
}
298
# Set the 'sections_index_document_metadata' option.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_index_document_metadata'} = $index_type;
}
[17110]305
# Turn CJK segmentation in filter_text on (true) or off (false).
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;

    $self->{'separate_cjk'} = $sep_cjk;
}
312
# Dispatch to the method named by the current mode, passing all remaining
# arguments through unchanged.
sub process {
    my $self = shift @_;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
319
# Post-process text depending on field.  Currently the only processing is
# CJK separation: when the 'separate_cjk' option is set the text is passed
# through cnseg::segment, otherwise it is returned unchanged.  The field
# name is accepted but not used yet.
sub filter_text {
    my $self = shift (@_);
    my ($field, $text) = @_;

    return $text unless $self->{'separate_cjk'};

    # Nothing in this file loads cnseg, so load it on demand; require does
    # nothing if another module has already loaded it.
    require cnseg;

    my $new_text = &cnseg::segment($text);
    return $new_text;
}
[14934]333
[17110]334
# Keep statistics on the metadata sets in use and how often each field
# occurs within a set, both for the current document
# ('doc_mdprefix_fields') and across the whole collection
# ('mdprefix_fields').
#
# A field of the form "prefix.name" is counted under that prefix (split at
# the last dot); a field with no dot that starts with an upper-case letter
# is counted under the implicit 'ex' set.  Anything else is ignored.
sub infodb_metadata_stats
{
    my ($self, $field) = @_;

    my @set_and_field;
    if ($field =~ m/^(.+)\.(.*)$/) {
        @set_and_field = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        @set_and_field = ('ex', $field);
    }

    if (@set_and_field) {
        my ($prefix, $core_field) = @set_and_field;
        foreach my $scope ('doc_mdprefix_fields', 'mdprefix_fields') {
            $self->{$scope}->{$prefix}->{$core_field}++;
        }
    }
}
361
362
# Write the database entries for one document.
#
# $doc_obj is the document object.  $filename is defined when the document
# was derived directly from a file, and undefined when it was
# reconstructed from the database (metadata and structure but no text).
#
# Only "indexed_doc" and "info_doc" documents are output.  For each
# section (or for the top section only when db_level is "document") an
# entry keyed on the section OID is written through
# dbutil::write_infodb_entry, followed by an entry mapping the document
# number back to the OID, except when the object's class is
# lucenebuildproc.  The document is also added to the doclist, passed to
# the classifiers, and counted in num_docs, num_sections and num_bytes.
sub infodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;

    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    # These do not change while the sections are processed.
    my $from_file      = defined $filename;
    my $document_level = ($self->{'db_level'} eq "document");
    my $want_coverage  = ($self->{'store_metadata_coverage'} =~ /^true$/i);
    # Lucene no longer needs the document number entries
    my $write_docnum_entry = (ref($self) ne "lucenebuildproc");

    my $archivedir = "";
    if ($from_file)
    {
        # doc_obj derived directly from file: the archive directory is the
        # directory part of the filename, using forward slashes and
        # without leading or trailing slashes
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;
        $dir =~ s/^\/+//;
        $dir =~ s/\/+$//;

        $archivedir = $dir;

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
    }

    # The doctype can only be "indexed_doc" or "info_doc" from here on, so
    # the old "unless classification" guards were always true and are gone.

    # add this document to the browse structure
    push(@{$self->{'doclist'}}, $doc_obj->get_OID());

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    my $infodb_handle = $self->{'output_handle'};

    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = $doc_OID;
        if ($section ne "")
        {
            $section_OID = $doc_OID . "." . $section;
        }
        my %section_infodb = ();

        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1;

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        # Output whether this node contains text.
        #
        # If doc_obj reconstructed from database file then no need to
        # explicitly add <hastxt> as this is preserved as metadata when
        # the database file is loaded in
        if ($from_file)
        {
            my $has_text = ($doc_obj->get_text_length($section) > 0) ? "1" : "0";
            $section_infodb{"hastxt"} = [ $has_text ];
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            next if ($field eq "Identifier" || $field =~ /^gsdl/);
            next unless (defined $value && $value ne "");

            # escape problematic stuff
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;

            # special case for URL metadata: an extra entry keyed on the
            # URL itself that points back at this section
            if ($field =~ /^URL$/i) {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
            }

            if (!defined $self->{'dontdb'}->{$field}) {
                push(@{$section_infodb{$field}}, $value);

                # coverage statistics are kept for top-level metadata only
                if ($section eq "" && $want_coverage)
                {
                    $self->infodb_metadata_stats($field);
                }
            }
        }

        if ($section eq "")
        {
            # record the metadata sets seen at the top level, with the
            # fields in each set and how often each field occurred
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in
        if ($from_file && $section eq $doc_obj->get_top_section()) {
            # output archivedir if at top level
            $section_infodb{"archivedir"} = [ $archivedir ];
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        if ($document_level) {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];

                # each child is written as a double quote, a dot and the
                # child's last numeric component (or the whole child name
                # if it has none); children are separated by ';'
                my $contains = "";
                foreach my $child (@$children)
                {
                    $contains .= ";" unless ($contains eq "");
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        $contains .= "\".$1";
                    }
                    else
                    {
                        $contains .= "\".$child";
                    }
                }
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);

        # output a database entry for the document number, except for Lucene (which no longer needs this information)
        if ($write_docnum_entry)
        {
            if ($document_level) {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
            }
            else {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($document_level); # if no sections wanted, only add the docs
    }
}
[9919]558
[15696]559
# Placeholder for the "text" mode: sub classes must provide their own
# implementation.  This version reports that on the debugging handle and
# then dies.
sub text {
    my ($self, $doc_obj) = @_;

    print { $self->{'outhandle'} } "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
568
# Should the document be indexed - according to the subcollection and
# language specification.  Returns 1 if it should, 0 otherwise.
#
# Each entry of 'indexexparr' has the form "field/expression/options".  A
# leading '!' on the field negates the test, the field "filename"
# (case-insensitive) tests the source filename instead of metadata, and an
# option of "i" makes the match case-insensitive.  The document passes the
# subcollection test as soon as one value of one entry satisfies its test;
# with no entries every document passes.
#
# If the document passes and 'lang_meta' is set, the top section's value
# for that metadata field must also satisfy at least one entry of
# 'langarr' (again, a leading '!' negates the entry).
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        # the options part is optional
        $options = "" unless defined $options;

        my $negate = ($field =~ s/^!//) ? 1 : 0;

        my @metadata_values;
        if ($field =~ /^filename$/i) {
            push(@metadata_values, $doc_obj->get_source_filename());
        }
        else {
            @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
        }
        next unless @metadata_values;

        # Compile once per expression.  Using a qr// object also means an
        # empty expression is an ordinary empty pattern, not Perl's "reuse
        # the last successful pattern" special case.
        my $pattern = ($options =~ /^i$/i) ? qr/$exp/i : qr/$exp/;

        foreach my $metadata_value (@metadata_values) {
            my $matched = ($metadata_value =~ $pattern) ? 1 : 0;
            if ($matched != $negate) {
                $indexed_doc = 1;
                last;
            }
        }

        last if ($indexed_doc == 1);
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $doc_lang = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $doc_lang) {
            foreach my $lang_entry (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable is an alias for the
                # element of the shared langarr, so stripping the '!' in
                # place would turn a negated language into a positive
                # match for every later document.
                my $lang = $lang_entry;
                my $negate = ($lang =~ s/^!//) ? 1 : 0;

                my $pattern = qr/$lang/;
                my $matched = ($doc_lang =~ $pattern) ? 1 : 0;
                if ($matched != $negate) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }
    return $indexed_doc;
}
634
# Work out the display types for a document, returned as the list
# (thistype, childtype).
#
# 'Paged' is used for the children if the document has no more than 2
# levels and each section at the second level has a number for its Title
# metadata, or if the top section's gsdlthistype metadata is "Paged".
# In both cases the document itself is "Paged" when its top section has
# text and "Invisible" when it does not.  Everything else, including
# gsdlthistype "Hierarchy", is ("VList", "VList").
sub get_document_type {
    my ($self, $doc_obj) = @_;

    my $top = $doc_obj->get_top_section ();

    my $declared = $doc_obj->get_metadata_element ($top, "gsdlthistype");
    if (defined $declared) {
        if ($declared eq "Hierarchy") {
            return ("VList", "VList");
        }
        if ($declared eq "Paged") {
            my $top_type = $doc_obj->get_text_length ($doc_obj->get_top_section())
                ? "Paged" : "Invisible";
            return ($top_type, "Paged");
        }
    }

    my $at_top = 1;
    for (my $section = $top; defined $section; $section = $doc_obj->get_next_section($section)) {
        # a section name with a dot in it is below the second level
        my @levels = split /\./, $section;
        return ("VList", "VList") if (scalar(@levels) > 1);

        # every section other than the top one needs a numeric Title
        unless ($at_top) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            return ("VList", "VList") unless (defined $title && $title =~ /^\d+$/);
        }
        $at_top = 0;
    }

    my $top_type = $doc_obj->get_text_length ($doc_obj->get_top_section())
        ? "Paged" : "Invisible";
    return ($top_type, "Paged");
}
688
# Hard-link every file associated with the document into its final
# location under the assoc directory.
#
# $doc_obj supplies the associated files; each has the real file as its
# first element and the target name as its second.  $archivedir is the
# document's archive directory.  If the target name starts with a slash it
# is placed relative to the assoc directory itself, otherwise relative to
# the document's directory inside it.
sub assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($real_file, $target_name) = @$assoc_file;

        my @dir_parts = ($self->{'assocdir'});
        push(@dir_parts, $archivedir) unless ($target_name =~ m@^[/\\]@);

        my $afile = &util::filename_cat(@dir_parts, $target_name);
        &util::hard_link ($real_file, $afile);
    }
}
706
Note: See TracBrowser for help on using the repository browser.