source: gsdl/trunk/perllib/basebuildproc.pm@ 15695

Last change on this file since 15695 was 15695, checked in by mdewsnip, 16 years ago

(Adding new DB support) Starting to tidy up the infodb() function in preparation for separating out the GDBM stuff.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
RevLine 
[9919]1###########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
[15688]27# implemented by subclass) and storing in the database
[9919]28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use doc;
35use docproc;
36use util;
37
38BEGIN {
39 @basebuildproc::ISA = ('docproc');
40}
41
[12844]42sub new()
43 {
44 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
[9919]45 my $self = new docproc ();
46
47 # outhandle is where all the debugging info goes
48 # output_handle is where the output of the plugins is piped
[15688]49 # to (i.e. mg, database etc.)
[9919]50 $outhandle = STDERR unless defined $outhandle;
51
52 $self->{'collection'} = $collection;
53 $self->{'source_dir'} = $source_dir;
[10159]54 $self->{'build_dir'} = $build_dir;
55 $self->{'keepold'} = $keepold;
56 $self->{'verbosity'} = $verbosity;
57 $self->{'outhandle'} = $outhandle;
[9919]58
59 $self->{'classifiers'} = [];
60 $self->{'mode'} = "text";
61 $self->{'assocdir'} = $build_dir;
[15688]62 $self->{'dontdb'} = {};
[9919]63
64 $self->{'index'} = "section:text";
65 $self->{'indexexparr'} = [];
66
[10159]67 my $found_num_data = 0;
68 my $buildconfigfile = undef;
69
70 if ($keepold) {
71 # For incremental building need to seed num_docs etc from values
72 # stored in build.cfg (if present)
[12844]73 print STDERR "Keepold!\n";
[10159]74 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
[12844]75 print STDERR "Build cfg: $buildconfigfile\n";
[10159]76 if (-e $buildconfigfile) {
77 $found_num_data = 1;
78 }
79 else {
80 # try the index dir
81 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
82 "index", "build.cfg");
[12844]83 print STDERR "Index cfg: $buildconfigfile\n";
[10159]84 if (-e $buildconfigfile) {
85 $found_num_data = 1;
86 }
87 }
88
[12844]89 }
90 #else
91 # {
92 # print STDERR "Removeold!\n";
93 # }
[10159]94
[12844]95 if ($found_num_data)
96 {
97 #print STDERR "Found_Num_Data!\n";
[10159]98 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
[12844]100 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
[10159]101 $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
[12844]102 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
[10159]103 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
[12844]104 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
[10159]105 }
[12844]106 else
107 {
108 #print STDERR "NOT Found_Num_Data!\n";
109 $self->{'starting_num_docs'} = 0;
[10159]110 $self->{'starting_num_sections'} = 0;
111 $self->{'starting_num_bytes'} = 0;
[12844]112 }
[10159]113
[9919]114 $self->{'output_handle'} = "STDOUT";
[10159]115 $self->{'num_docs'} = $self->{'starting_num_docs'};
116 $self->{'num_sections'} = $self->{'starting_num_sections'};
117 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
[9919]119 $self->{'num_processed_bytes'} = 0;
120 $self->{'store_text'} = 1;
121
[15685]122 # what level (section/document) the database - indexer intersection is
123 $self->{'db_level'} = "section";
[9919]124 #used by browse interface
125 $self->{'doclist'} = [];
126
127 $self->{'indexing_text'} = 0;
128
129 return bless $self, $class;
130
131}
132
133sub reset {
134 my $self = shift (@_);
[10159]135
136 $self->{'num_docs'} = $self->{'starting_num_docs'};
137 $self->{'num_sections'} = $self->{'starting_num_sections'};
138 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
[9919]139
140 $self->{'num_processed_bytes'} = 0;
141}
142
[10159]143sub zero_reset {
144 my $self = shift (@_);
145
146 $self->{'num_docs'} = 0;
147 $self->{'num_sections'} = 0;
148 $self->{'num_bytes'} = 0;
149
150 $self->{'num_processed_bytes'} = 0;
151}
152
[10419]153sub is_incremental_capable
[10304]154{
155 # By default we return 'no' as the answer
156 # Safer to assume non-incremental to start with, and then override in
157 # inherited classes that are.
158
159 return 0;
160}
161
[9919]162sub get_num_docs {
163 my $self = shift (@_);
164
165 return $self->{'num_docs'};
166}
167
168sub get_num_sections {
169 my $self = shift (@_);
170
171 return $self->{'num_sections'};
172}
173
174# num_bytes is the actual number of bytes in the collection
175# this is normally the same as what's processed during text compression
176sub get_num_bytes {
177 my $self = shift (@_);
178
179 return $self->{'num_bytes'};
180}
181
182# num_processed_bytes is the number of bytes actually passed
183# to mg for the current index
184sub get_num_processed_bytes {
185 my $self = shift (@_);
186
187 return $self->{'num_processed_bytes'};
188}
189
190sub set_output_handle {
191 my $self = shift (@_);
192 my ($handle) = @_;
193
194 $self->{'output_handle'} = $handle;
195}
196
197
198sub set_mode {
199 my $self = shift (@_);
200 my ($mode) = @_;
201
202 $self->{'mode'} = $mode;
203}
204
[10159]205sub get_mode {
206 my $self = shift (@_);
207
208 return $self->{'mode'};
209}
210
[9919]211sub set_assocdir {
212 my $self = shift (@_);
213 my ($assocdir) = @_;
214
215 $self->{'assocdir'} = $assocdir;
216}
217
[15688]218sub set_dontdb {
[9919]219 my $self = shift (@_);
[15688]220 my ($dontdb) = @_;
[9919]221
[15688]222 $self->{'dontdb'} = $dontdb;
[9919]223}
224
225sub set_index {
226 my $self = shift (@_);
227 my ($index, $indexexparr) = @_;
228
229 $self->{'index'} = $index;
230 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
231}
232
233sub set_index_languages {
234 my $self = shift (@_);
235 my ($lang_meta, $langarr) = @_;
236 $self->{'lang_meta'} = $lang_meta;
237 $self->{'langarr'} = $langarr;
238}
239
240sub get_index {
241 my $self = shift (@_);
242
243 return $self->{'index'};
244}
245
246sub set_classifiers {
247 my $self = shift (@_);
248 my ($classifiers) = @_;
249
250 $self->{'classifiers'} = $classifiers;
251}
252
253sub set_indexing_text {
254 my $self = shift (@_);
255 my ($indexing_text) = @_;
256
257 $self->{'indexing_text'} = $indexing_text;
258}
259
260sub get_indexing_text {
261 my $self = shift (@_);
262
263 return $self->{'indexing_text'};
264}
265
266sub set_store_text {
267 my $self = shift (@_);
268 my ($store_text) = @_;
269
270 $self->{'store_text'} = $store_text;
271}
272sub get_doc_list {
273 my $self = shift(@_);
274
275 return @{$self->{'doclist'}};
276}
277
[15685]278# the standard database level is section, but you may want to change it to document
279sub set_db_level {
[9919]280 my $self= shift (@_);
[15685]281 my ($db_level) = @_;
[9919]282
[15685]283 $self->{'db_level'} = $db_level;
[9919]284}
285
[10469]286sub set_sections_index_document_metadata {
287 my $self= shift (@_);
288 my ($index_type) = @_;
289
290 $self->{'sections_index_document_metadata'} = $index_type;
291}
[9919]292sub process {
293 my $self = shift (@_);
294 my $method = $self->{'mode'};
295
296 $self->$method(@_);
297}
298
[14934]299
300
301sub infodb_metadata_stats
302{
303 my $self = shift (@_);
304 my ($field) = @_;
305
306 # Keep some statistics relating to metadata sets used and
307 # frequency of particular metadata fields within each set
308
309 # Union of metadata prefixes and frequency of fields
310 # (both scoped for this document alone, and across whole collection)
311
312 if ($field =~ m/^(.+)\.(.*)$/) {
313 my $prefix = $1;
314 my $core_field = $2;
315
316 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
317 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
318 }
319 elsif ($field =~ m/^[[:upper:]]/) {
320 # implicit 'ex' metadata set
321
322 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
323 $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
324 }
325
326}
327
328
[9919]329sub infodb {
330 my $self = shift (@_);
331 my ($doc_obj, $filename) = @_;
332 my $handle = $self->{'output_handle'};
333
334 my $doctype = $doc_obj->get_doc_type();
335
[15688]336 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
[11793]337 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
[9919]338
[11994]339 my $archivedir = "";
[9919]340
[11994]341 if (defined $filename)
342 {
343 # doc_obj derived directly from file
[9919]344
[11994]345 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
346 $dir = "" unless defined $dir;
347 $dir =~ s/\\/\//g;
348 $dir =~ s/^\/+//;
349 $dir =~ s/\/+$//;
350
351 $archivedir = $dir;
352
353 # resolve the final filenames of the files associated with this document
354 $self->assoc_files ($doc_obj, $archivedir);
355 }
356 else
357 {
[15688]358 # doc_obj reconstructed from database (has metadata, doc structure but no text)
[11994]359 my $top_section = $doc_obj->get_top_section();
360 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
361 }
362
363
[9919]364 #GRB: moved 1/06/2004 from GRB01062004
365 #add this document to the browse structure
366 push(@{$self->{'doclist'}},$doc_obj->get_OID())
367 unless ($doctype eq "classification");
368
369 # classify this document
370 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
371 #GRB: end of moved block
372
373 # this is another document
374 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
375
376 # is this a paged or a hierarchical document
377 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
378
379 my $section = $doc_obj->get_top_section ();
380 my $doc_OID = $doc_obj->get_OID();
381 my $first = 1;
382 my $url = "";
[14934]383
384 $self->{'doc_mdprefix_fields'} = {};
385
[15695]386 while (defined $section)
387 {
388 my $section_OID = $doc_OID;
389 if ($section ne "")
390 {
391 $section_OID = $doc_OID . "." . $section;
392 }
393
[9919]394 # update a few statistics
395 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
396 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
397
398 # output the section name
[15695]399 print $handle "[$section_OID]\n";
[9919]400
401 # output the fact that this document is a document (unless doctype
402 # has been set to something else from within a plugin
403 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
404 if (!defined $dtype || $dtype !~ /\w/) {
405 print $handle "<doctype>doc\n";
406 }
407
[11994]408 # Output whether this node contains text
409 #
[15688]410 # If doc_obj reconstructed from database file then no need to
[11994]411 # explicitly add <hastxt> as this is preserved as metadata when
[15688]412 # the database file is loaded in
[11994]413
414 if (defined $filename)
415 {
416 # doc_obj derived directly from file
417 if ($doc_obj->get_text_length($section) > 0) {
418 print $handle "<hastxt>1\n";
419 } else {
420 print $handle "<hastxt>0\n";
421 }
[9919]422 }
423
424 # output all the section metadata
425 my $metadata = $doc_obj->get_all_metadata ($section);
426 foreach my $pair (@$metadata) {
427 my ($field, $value) = (@$pair);
428
429 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
430 defined $value && $value ne "") {
431
432 # escape problematic stuff
433 $value =~ s/\\/\\\\/g;
434 $value =~ s/\n/\\n/g;
435 $value =~ s/\r/\\r/g;
436 if ($value =~ /-{70,}/) {
437 # if value contains 70 or more hyphens in a row we need
438 # to escape them to prevent txt2db from treating them
439 # as a separator
440 $value =~ s/-/&\#045;/gi;
441 }
442
443 # special case for URL metadata
444 if ($field =~ /^URL$/i) {
445 $url .= "[$value]\n";
[15695]446 $url .= "<section>$section_OID\n";
[9919]447 $url .= '-' x 70 . "\n";
448 }
449
[15688]450 if (!defined $self->{'dontdb'}->{$field}) {
[9919]451 print $handle "<$field>$value\n";
[14934]452
453 if ($section eq "")
454 {
455 $self->infodb_metadata_stats($field);
456 }
[9919]457 }
458 }
459 }
460
[14934]461 if ($section eq "")
462 {
463 my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
[11994]464
[14934]465 foreach my $prefix (keys %$doc_mdprefix_fields)
466 {
467 print $handle "<metadataset>$prefix\n";
468
469 foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
470 {
471 my $val = $doc_mdprefix_fields->{$prefix}->{$field};
472
473 print $handle "<metadatalist-$prefix>$field\n";
474 print $handle "<metadatafreq-$prefix-$field>$val\n";
475 }
476
477 }
478 }
479
[15688]480 # If doc_obj reconstructed from database file then no need to
[11994]481 # explicitly add <archivedir> as this is preserved as metadata when
[15688]482 # the database file is loaded in
[11994]483
484 if (defined $filename)
485 {
486 # output archivedir if at top level
487 if ($section eq $doc_obj->get_top_section()) {
488 print $handle "<archivedir>$archivedir\n";
489 }
[9919]490 }
491
492 # output document display type
493 if ($first) {
494 print $handle "<thistype>$thistype\n";
495 }
496
[11994]497
[15685]498 if ($self->{'db_level'} eq "document") {
[9919]499 # doc num is num_docs not num_sections
500 # output the matching document number
501 print $handle "<docnum>$self->{'num_docs'}\n";
[12844]502
[9919]503 } else {
504 # output a list of children
505 my $children = $doc_obj->get_children ($section);
506 if (scalar(@$children) > 0) {
507 print $handle "<childtype>$childtype\n";
508 print $handle "<contains>";
509 my $firstchild = 1;
510 foreach my $child (@$children) {
511 print $handle ";" unless $firstchild;
512 $firstchild = 0;
513 if ($child =~ /^.*?\.(\d+)$/) {
514 print $handle "\".$1";
515 } else {
516 print $handle "\".$child";
517 }
518 }
519 print $handle "\n";
520 }
521 #output the matching doc number
522 print $handle "<docnum>$self->{'num_sections'}\n";
523
524 }
525
526 print $handle '-' x 70, "\n";
527
528
529 # output a database entry for the document number
[15685]530 if ($self->{'db_level'} eq "document") {
[9919]531 print $handle "[$self->{'num_docs'}]\n";
532 print $handle "<section>$doc_OID\n";
533 }
534 else {
535 print $handle "[$self->{'num_sections'}]\n";
[15695]536 print $handle "<section>$section_OID\n";
[9919]537 }
538 print $handle '-' x 70, "\n";
539
540 # output entry for url
541 if ($url ne "") {
542 print $handle $url;
543 }
544
545 $first = 0;
546 $section = $doc_obj->get_next_section($section);
[15685]547 last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
[9919]548 }
549
550 #GRB01062004: see code above moved from here
551}
552
553
554sub text {
555 my $self = shift (@_);
556 my ($doc_obj) = @_;
557
558 my $handle = $self->{'outhandle'};
559 print $handle "basebuildproc::text function must be implemented in sub classes\n";
560 die "\n";
561}
562
563# should the document be indexed - according to the subcollection and language
564# specification.
565sub is_subcollection_doc {
566 my $self = shift (@_);
567 my ($doc_obj) = @_;
568
569 my $indexed_doc = 1;
570 foreach my $indexexp (@{$self->{'indexexparr'}}) {
571 $indexed_doc = 0;
572 my ($field, $exp, $options) = split /\//, $indexexp;
573 if (defined ($field) && defined ($exp)) {
574 my ($bool) = $field =~ /^(.)/;
575 $field =~ s/^.// if $bool eq '!';
[10028]576 my @metadata_values;
[9919]577 if ($field =~ /^filename$/i) {
[10028]578 push(@metadata_values, $doc_obj->get_source_filename());
[9919]579 }
[10028]580 else {
581 @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
582 }
583 next unless @metadata_values;
584 foreach my $metadata_value (@metadata_values) {
585 if ($bool eq '!') {
586 if ($options =~ /^i$/i) {
587 if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
588 } else {
589 if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
590 }
[9919]591 } else {
[10028]592 if ($options =~ /^i$/i) {
593 if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
594 } else {
595 if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
596 }
[9919]597 }
598 }
[10028]599
600 last if ($indexed_doc == 1);
[9919]601 }
602 }
603
604 # if this doc is so far in the sub collection, and we have lang info,
605 # now we check the languages to see if it matches
606 if($indexed_doc && defined $self->{'lang_meta'}) {
607 $indexed_doc = 0;
608 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
609 if (defined $field) {
610 foreach my $lang (@{$self->{'langarr'}}) {
611 my ($bool) = $lang =~ /^(.)/;
612 if ($bool eq '!') {
613 $lang =~ s/^.//;
614 if ($field !~ /$lang/) {
615 $indexed_doc = 1; last;
616 }
617 } else {
618 if ($field =~ /$lang/) {
619 $indexed_doc = 1; last;
620 }
621 }
622 }
623 }
624 }
625 return $indexed_doc;
626
627}
628
629# use 'Paged' if document has no more than 2 levels
630# and each section at second level has a number for
631# Title metadata
632# also use Paged if gsdlthistype metadata is set to Paged
633sub get_document_type {
634 my $self = shift (@_);
635 my ($doc_obj) = @_;
636
637 my $thistype = "VList";
638 my $childtype = "VList";
639 my $title;
640 my @tmp = ();
641
642 my $section = $doc_obj->get_top_section ();
643
644 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
645 if (defined $gsdlthistype) {
646 if ($gsdlthistype eq "Paged") {
647 $childtype = "Paged";
648 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
649 $thistype = "Paged";
650 } else {
651 $thistype = "Invisible";
652 }
653
654 return ($thistype, $childtype);
655 } elsif ($gsdlthistype eq "Hierarchy") {
656 return ($thistype, $childtype); # use VList, VList
657 }
658 }
659 my $first = 1;
660 while (defined $section) {
661 @tmp = split /\./, $section;
662 if (scalar(@tmp) > 1) {
663 return ($thistype, $childtype);
664 }
665 if (!$first) {
666 $title = $doc_obj->get_metadata_element ($section, "Title");
667 if (!defined $title || $title !~ /^\d+$/) {
668 return ($thistype, $childtype);
669 }
670 }
671 $first = 0;
672 $section = $doc_obj->get_next_section($section);
673 }
674 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
675 $thistype = "Paged";
676 } else {
677 $thistype = "Invisible";
678 }
679 $childtype = "Paged";
680 return ($thistype, $childtype);
681}
682
[12844]683sub assoc_files() {
[9919]684 my $self = shift (@_);
685 my ($doc_obj, $archivedir) = @_;
686 my ($afile);
687
688 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
[12844]689 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
[9919]690 # if assoc file starts with a slash, we put it relative to the assoc
691 # dir, otherwise it is relative to the HASH... directory
692 if ($assoc_file->[1] =~ m@^[/\\]@) {
[12844]693 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
[9919]694 } else {
695 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
696 }
697 &util::hard_link ($assoc_file->[0], $afile);
698 }
699}
700
Note: See TracBrowser for help on using the repository browser.