source: gsdl/trunk/perllib/basebuildproc.pm@ 17564

Last change on this file since 17564 was 17564, checked in by kjdon, 16 years ago

fixed up some stuff to do with indexfieldmap. still working on it, but want to commit what I've done

  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1###########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use dbutil;
35use doc;
36use docproc;
37use strict; no strict 'subs';
38use util;
39
40BEGIN {
41 @basebuildproc::ISA = ('docproc');
42}
43
# Constructor.
#
# Positional arguments:
#   $class      - class to bless the new object into
#   $collection - stored as 'collection'
#   $source_dir - stored as 'source_dir'
#   $build_dir  - stored as 'build_dir'; also the initial 'assocdir'
#   $keepold    - when true, the starting document/section/byte counts
#                 are seeded from an existing build.cfg (if one is found)
#   $verbosity  - stored as 'verbosity'
#   $outhandle  - where debugging info goes; defaults to STDERR
#
# Returns the new object, built on top of a docproc object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    # (the handle is kept as a handle *name*, exactly as the bareword
    # STDERR used to evaluate under "no strict 'subs'")
    $outhandle = 'STDERR' unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'} = $build_dir;
    $self->{'keepold'} = $keepold;
    $self->{'verbosity'} = $verbosity;
    $self->{'outhandle'} = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    # For incremental building need to seed num_docs etc from values
    # stored in build.cfg (if present). Look in the build dir first and
    # then in the collection's index dir; the second location is only
    # examined when the first has no build.cfg.
    my $buildconfigfile = undef;
    if ($keepold) {
        foreach my $location ([$build_dir], [$ENV{'GSDLCOLLECTDIR'}, "index"]) {
            my $candidate = &util::filename_cat(@$location, "build.cfg");
            if (-e $candidate) {
                $buildconfigfile = $candidate;
                last;
            }
        }
    }

    # Counts default to zero; any value present in build.cfg overrides.
    my %starting = ('numdocs' => 0, 'numsections' => 0, 'numbytes' => 0);
    if (defined $buildconfigfile) {
        # colcfg is not loaded at the top of this file, so make sure it
        # is available before calling into it (a no-op if already loaded).
        require colcfg;
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        if (defined $buildcfg) {
            foreach my $key (keys %starting) {
                $starting{$key} = $buildcfg->{$key} if defined $buildcfg->{$key};
            }
        }
    }
    $self->{'starting_num_docs'} = $starting{'numdocs'};
    $self->{'starting_num_sections'} = $starting{'numsections'};
    $self->{'starting_num_bytes'} = $starting{'numbytes'};

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'} = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
130
# Put the running counters back to the values the build started with
# and clear the processed-bytes counter.
sub reset {
    my ($self) = @_;

    @{$self}{qw(num_docs num_sections num_bytes)} =
        @{$self}{qw(starting_num_docs starting_num_sections starting_num_bytes)};

    return $self->{'num_processed_bytes'} = 0;
}
140
# Zero the document and section counters and the processed-bytes counter.
# The byte count is restored to its starting value instead of zero:
# reconstructed docs have no text, just metadata, so we need to
# remember how many bytes we had initially.
sub zero_reset {
    my ($self) = @_;

    @{$self}{'num_docs', 'num_sections'} = (0, 0);
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    return $self->{'num_processed_bytes'} = 0;
}
152
# Reports whether this build processor can build incrementally.
# The base class answers 'no' (0): it is safer to assume non-incremental
# here and let inherited classes that are capable override this.
sub is_incremental_capable {
    my $incremental = 0;
    return $incremental;
}
161
# Returns the current 'num_docs' counter.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
167
# Returns the current 'num_sections' counter.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
173
# Returns 'num_bytes', the actual number of bytes in the collection.
# This is normally the same as what is processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
181
# Returns 'num_processed_bytes', the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
189
# Stores $handle as 'output_handle', the handle the infodb entries
# are written to.
sub set_output_handle {
    my ($self, $handle) = @_;
    return $self->{'output_handle'} = $handle;
}
196
197
# Stores $mode as 'mode'; process() uses it as the name of the method
# to dispatch to.
sub set_mode {
    my ($self, $mode) = @_;
    return $self->{'mode'} = $mode;
}
204
# Returns the current 'mode'.
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
210
# Stores $assocdir as 'assocdir', the base directory that assoc_files()
# links associated files into.
sub set_assocdir {
    my ($self, $assocdir) = @_;
    return $self->{'assocdir'} = $assocdir;
}
217
# Stores $dontdb as 'dontdb'. infodb() treats it as a hash keyed by
# metadata field name: fields with a defined entry are left out of the
# section's database record.
sub set_dontdb {
    my ($self, $dontdb) = @_;
    return $self->{'dontdb'} = $dontdb;
}
224
# Stores $infodbtype as 'infodbtype'; infodb() passes it as the first
# argument to dbutil::write_infodb_entry.
sub set_infodbtype {
    my ($self, $infodbtype) = @_;
    return $self->{'infodbtype'} = $infodbtype;
}
231
# Stores $index as 'index'. 'indexexparr' is replaced only when an
# $indexexparr argument is actually supplied (defined); otherwise the
# existing value is left alone.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    defined($indexexparr) and $self->{'indexexparr'} = $indexexparr;
}
239
# Stores the language restriction used by is_subcollection_doc():
# $lang_meta as 'lang_meta' (the metadata field holding a document's
# language) and $langarr as 'langarr' (the language patterns).
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    $self->{'lang_meta'} = $lang_meta;
    return $self->{'langarr'} = $langarr;
}
246
# Returns the current 'index' specification.
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
252
# Stores $classifiers as 'classifiers'; infodb() hands this value to
# classify::classify_doc for every document.
sub set_classifiers {
    my ($self, $classifiers) = @_;
    return $self->{'classifiers'} = $classifiers;
}
259
# Stores $indexing_text as the 'indexing_text' flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    return $self->{'indexing_text'} = $indexing_text;
}
266
# Returns the 'indexing_text' flag.
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
272
# Stores $store_text as the 'store_text' flag.
sub set_store_text {
    my ($self, $store_text) = @_;
    return $self->{'store_text'} = $store_text;
}
279
# Stores $store_metadata_coverage as 'store_metadata_coverage'.
# Any false value (undef, 0, "") is stored as the empty string, so the
# pattern match infodb() applies to it always sees a defined string.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    my $setting = $store_metadata_coverage ? $store_metadata_coverage : "";
    return $self->{'store_metadata_coverage'} = $setting;
}
286
# Returns the contents of 'doclist' (the OIDs that infodb() has pushed
# so far) as a list.
sub get_doc_list {
    my ($self) = @_;

    my $doclist = $self->{'doclist'};
    return @$doclist;
}
292
# Stores $db_level as 'db_level'. The standard database level is
# "section", but you may want to change it to "document".
sub set_db_level {
    my ($self, $db_level) = @_;
    return $self->{'db_level'} = $db_level;
}
300
# Stores $index_type as 'sections_index_document_metadata'.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;
    return $self->{'sections_index_document_metadata'} = $index_type;
}
307
# Stores $sep_cjk as the 'separate_cjk' flag; when true, filter_text()
# runs text through cnseg::segment.
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;
    return $self->{'separate_cjk'} = $sep_cjk;
}
314
# Dispatches to the method whose name is held in 'mode', handing over
# all remaining arguments unchanged and returning whatever it returns.
sub process {
    my $self = shift;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
321
# Post-process text depending on field. Currently nothing is done here
# except CJK separation; $field is not consulted.
# This should really only be done for indexed text (i.e. when
# $self->{'indexing_text'} is set), but search term highlighting
# currently doesn't work if you do that. Once that's fixed up, fix this.
sub filter_text {
    my ($self, $field, $text) = @_;

    # nothing to do unless cjk segmentation has been switched on
    return $text unless $self->{'separate_cjk'};

    my $segmented = &cnseg::segment($text);
    return $segmented;
}
338
339
# Keep some statistics relating to metadata sets used and the
# frequency of particular metadata fields within each set.
#
# Two tallies are bumped together: 'doc_mdprefix_fields' (which infodb()
# clears at the start of each document) and 'mdprefix_fields' (which is
# not cleared here, so it accumulates).
#
# A field of the form "prefix.name" is counted under its prefix; a field
# without a dot that starts with an upper-case letter is counted under
# the implicit 'ex' metadata set. Any other field is ignored.
sub infodb_metadata_stats {
    my ($self, $field) = @_;

    my ($prefix, $core_field);
    if ($field =~ m/^(.+)\.(.*)$/) {
        ($prefix, $core_field) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($prefix, $core_field) = ('ex', $field);
    }

    if (defined $prefix) {
        foreach my $tally ('doc_mdprefix_fields', 'mdprefix_fields') {
            $self->{$tally}->{$prefix}->{$core_field}++;
        }
    }
}
366
367
# Write the info database entries for one document.
#
# Arguments:
#   $doc_obj  - the document object to output
#   $filename - defined when the doc_obj was derived directly from a
#               file; undefined when it was reconstructed from the
#               database (has metadata and doc structure but no text)
#
# Only documents whose doc type is "indexed_doc" or "info_doc" (database
# only) are output; anything else returns immediately.
#
# For each section (or just the top section when 'db_level' is
# "document") one entry keyed by the section OID is written through
# dbutil::write_infodb_entry. Unless this object is a lucenebuildproc, a
# second entry mapping the document/section number back to the OID is
# written as well. 'doclist', 'num_docs', 'num_sections' and 'num_bytes'
# are updated along the way.
sub infodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;

    # From here on the doc type is known to be one of the two accepted
    # values, so no further doc type checks are needed.
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    my $from_file = defined $filename;

    my $archivedir = "";
    if ($from_file) {
        # the archive directory is the directory part of $filename, with
        # forward slashes and no leading or trailing slash
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;
        $dir =~ s/^\/+//;
        $dir =~ s/\/+$//;
        $archivedir = $dir;

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);
    }
    # (for a reconstructed doc_obj the archivedir is already held as
    # metadata and gets written out with the rest of the metadata)

    my $doc_OID = $doc_obj->get_OID();

    # add this document to the browse structure
    push(@{$self->{'doclist'}}, $doc_OID);

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $top_section = $doc_obj->get_top_section ();
    my $section = $top_section;
    my $first = 1;
    my $infodbtype = $self->{'infodbtype'};
    my $infodb_handle = $self->{'output_handle'};
    my $document_level = ($self->{'db_level'} eq "document");

    # Lucene no longer needs the number -> OID entries
    my $write_docnum_entry = (ref($self) ne "lucenebuildproc");

    # per-document metadata statistics start afresh
    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section) {
        my $section_OID = ($section ne "") ? $doc_OID . "." . $section : $doc_OID;
        my %section_infodb = ();

        # update a few statistics
        my $text_length = $doc_obj->get_text_length ($section);
        $self->{'num_bytes'} += $text_length;
        $self->{'num_sections'} += 1;

        # byte-count tracing is debugging output: only emit it at high
        # verbosity, and on the debugging handle rather than raw STDERR
        if (defined $self->{'verbosity'} && $self->{'verbosity'} > 3) {
            my $outhandle = defined $self->{'outhandle'} ? $self->{'outhandle'} : 'STDERR';
            print {$outhandle} "num bytes added = " . $text_length . "\n";
        }

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        # Output whether this node contains text.
        # If doc_obj was reconstructed from the database then there is no
        # need to explicitly add <hastxt> as this is preserved as metadata
        # when the database file is loaded in.
        if ($from_file) {
            $section_infodb{"hastxt"} = [ ($text_length > 0) ? "1" : "0" ];
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            next if ($field eq "Identifier" || $field =~ /^gsdl/);
            next unless (defined $value && $value ne "");

            # escape problematic stuff
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;

            # special case for URL metadata: an extra entry keyed by the
            # URL itself points back at this section
            if ($field =~ /^URL$/i) {
                &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $value, { 'section' => [ $section_OID ] });
            }

            next if defined $self->{'dontdb'}->{$field};

            push(@{$section_infodb{$field}}, $value);

            if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i) {
                $self->infodb_metadata_stats($field);
            }
        }

        # the top section also records which metadata sets and fields
        # were seen, and how often
        if ($section eq "") {
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields) {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}}) {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj was reconstructed from the database then there is no
        # need to explicitly add <archivedir> as this is preserved as
        # metadata when the database file is loaded in.
        # Otherwise output archivedir if at top level.
        if ($from_file && $section eq $top_section) {
            $section_infodb{"archivedir"} = [ $archivedir ];
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        if ($document_level) {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children; each child is written as a
            # double quote followed by "." and the child's last numeric
            # component (or the whole child name if it has none)
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                my $contains = join(";",
                    map { ($_ =~ /^.*?\.(\d+)$/) ? "\".$1" : "\".$_" } @$children);
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $section_OID, \%section_infodb);

        # output a database entry for the document number, except for
        # Lucene (which no longer needs this information)
        if ($write_docnum_entry) {
            if ($document_level) {
                &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
            }
            else {
                &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if $document_level; # if no sections wanted, only add the docs
    }
}
564
565
# Placeholder for the method that outputs a document for indexing.
# The base class has no implementation: it reports that on the
# debugging handle ('outhandle') and then dies.
sub text {
    my ($self, $doc_obj) = @_;

    my $out = $self->{'outhandle'};
    print {$out} "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
574
# Should the document be indexed - according to the subcollection and
# language specification.
#
# Subcollection filters ('indexexparr') have the form
# "field/expression/options":
#   - a leading "!" on the field negates the test;
#   - the field "filename" (any case) tests the document's source
#     filename instead of a metadata field;
#   - the option "i" makes the expression case-insensitive.
# With no filters every document passes; otherwise the document passes
# as soon as one filter accepts one of its values. A filter whose field
# has no values for this document never accepts.
#
# If the document has passed so far and 'lang_meta' is set, the value of
# that metadata field must also satisfy one of the 'langarr' patterns
# (again, a leading "!" negates).
#
# Returns 1 if the document should be indexed, 0 otherwise.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        my $negate = ($field =~ s/^!//) ? 1 : 0;

        my @metadata_values;
        if ($field =~ /^filename$/i) {
            push(@metadata_values, $doc_obj->get_source_filename());
        }
        else {
            @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
        }
        next unless @metadata_values;

        # Compile once per filter. Using a compiled pattern also means an
        # empty expression matches everything, rather than silently
        # re-using whatever pattern last matched.
        my $ignore_case = (defined $options && $options =~ /^i$/i);
        my $pattern = $ignore_case ? qr/$exp/i : qr/$exp/;

        foreach my $metadata_value (@metadata_values) {
            my $matched = ($metadata_value =~ $pattern) ? 1 : 0;
            if ($matched != $negate) {
                $indexed_doc = 1;
                last;
            }
        }

        last if ($indexed_doc == 1);
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $doc_language = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $doc_language) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable aliases the element of
                # 'langarr', so stripping the "!" in place would change
                # the filter for every later document.
                my $lang = $lang_spec;
                my $negate = ($lang =~ s/^!//) ? 1 : 0;

                my $matched = ($doc_language =~ qr/$lang/) ? 1 : 0;
                if ($matched != $negate) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }

    return $indexed_doc;
}
640
# Work out the display types for a document and for its children.
#
# 'Paged' is used if the document has no more than 2 levels and each
# section at the second level has a number for its Title metadata.
# 'Paged' is also used if the gsdlthistype metadata of the top section
# is set to "Paged"; if it is set to "Hierarchy" the VList defaults are
# used without looking at the sections.
#
# For a paged document the document's own type is "Paged" when its top
# section has text and "Invisible" when it has none.
#
# Returns the list ($thistype, $childtype).
sub get_document_type {
    my ($self, $doc_obj) = @_;

    my $top = $doc_obj->get_top_section ();

    # an explicit request in the metadata wins
    my $requested = $doc_obj->get_metadata_element ($top, "gsdlthistype");
    if (defined $requested) {
        if ($requested eq "Paged") {
            return ($doc_obj->get_text_length ($doc_obj->get_top_section()) ? "Paged" : "Invisible",
                    "Paged");
        }
        return ("VList", "VList") if ($requested eq "Hierarchy");
    }

    # otherwise inspect every section in turn
    my $at_top = 1;
    for (my $section = $top; defined $section; $section = $doc_obj->get_next_section($section)) {
        # a dotted section name means there is a third level
        my @levels = split /\./, $section;
        return ("VList", "VList") if (scalar(@levels) > 1);

        # every section below the top one needs an all-digit Title
        unless ($at_top) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            return ("VList", "VList") unless (defined $title && $title =~ /^\d+$/);
        }
        $at_top = 0;
    }

    return ($doc_obj->get_text_length ($doc_obj->get_top_section()) ? "Paged" : "Invisible",
            "Paged");
}
694
# Hard-link every file associated with $doc_obj into place under
# 'assocdir'.
#
# Arguments:
#   $doc_obj    - document whose get_assoc_files() list is processed;
#                 each entry is used as [ source file, destination name ]
#   $archivedir - the document's directory, relative to 'assocdir'
#
# If the destination name starts with a slash (or backslash) it is put
# relative to the assoc dir itself, otherwise it is relative to the
# document's own (HASH...) directory inside the assoc dir.
sub assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source, $destination) = @$assoc_file[0, 1];

        my @path = ($self->{'assocdir'});
        push(@path, $archivedir) unless ($destination =~ m@^[/\\]@);

        my $afile = &util::filename_cat(@path, $destination);
        &util::hard_link ($source, $afile);
    }
}
712
Note: See TracBrowser for help on using the repository browser.