source: gsdl/trunk/perllib/basebuildproc.pm@ 14934

Last change on this file since 14934 was 14934, checked in by davidb, 16 years ago

Changes to allow statistic calculations for metadata coverage, i.e. for this docment which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then agregated over the all documents and the summary stored as collection level metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1###########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in gdbm database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use doc;
35use docproc;
36use util;
37
38BEGIN {
39 @basebuildproc::ISA = ('docproc');
40}
41
42sub new()
43 {
44 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
45 my $self = new docproc ();
46
47 # outhandle is where all the debugging info goes
48 # output_handle is where the output of the plugins is piped
49 # to (i.e. mg, gdbm etc.)
50 $outhandle = STDERR unless defined $outhandle;
51
52 $self->{'collection'} = $collection;
53 $self->{'source_dir'} = $source_dir;
54 $self->{'build_dir'} = $build_dir;
55 $self->{'keepold'} = $keepold;
56 $self->{'verbosity'} = $verbosity;
57 $self->{'outhandle'} = $outhandle;
58
59 $self->{'classifiers'} = [];
60 $self->{'mode'} = "text";
61 $self->{'assocdir'} = $build_dir;
62 $self->{'dontgdbm'} = {};
63
64 $self->{'index'} = "section:text";
65 $self->{'indexexparr'} = [];
66
67 my $found_num_data = 0;
68 my $buildconfigfile = undef;
69
70 if ($keepold) {
71 # For incremental building need to seed num_docs etc from values
72 # stored in build.cfg (if present)
73 print STDERR "Keepold!\n";
74 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
75 print STDERR "Build cfg: $buildconfigfile\n";
76 if (-e $buildconfigfile) {
77 $found_num_data = 1;
78 }
79 else {
80 # try the index dir
81 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
82 "index", "build.cfg");
83 print STDERR "Index cfg: $buildconfigfile\n";
84 if (-e $buildconfigfile) {
85 $found_num_data = 1;
86 }
87 }
88
89 }
90 #else
91 # {
92 # print STDERR "Removeold!\n";
93 # }
94
95 if ($found_num_data)
96 {
97 #print STDERR "Found_Num_Data!\n";
98 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101 $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105 }
106 else
107 {
108 #print STDERR "NOT Found_Num_Data!\n";
109 $self->{'starting_num_docs'} = 0;
110 $self->{'starting_num_sections'} = 0;
111 $self->{'starting_num_bytes'} = 0;
112 }
113
114 $self->{'output_handle'} = "STDOUT";
115 $self->{'num_docs'} = $self->{'starting_num_docs'};
116 $self->{'num_sections'} = $self->{'starting_num_sections'};
117 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119 $self->{'num_processed_bytes'} = 0;
120 $self->{'store_text'} = 1;
121
122 # what level (section/document) the gdbm database - indexer intersection is
123 $self->{'gdbm_level'} = "section";
124 #used by browse interface
125 $self->{'doclist'} = [];
126
127 $self->{'indexing_text'} = 0;
128
129 return bless $self, $class;
130
131}
132
133sub reset {
134 my $self = shift (@_);
135
136 $self->{'num_docs'} = $self->{'starting_num_docs'};
137 $self->{'num_sections'} = $self->{'starting_num_sections'};
138 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140 $self->{'num_processed_bytes'} = 0;
141}
142
143sub zero_reset {
144 my $self = shift (@_);
145
146 $self->{'num_docs'} = 0;
147 $self->{'num_sections'} = 0;
148 $self->{'num_bytes'} = 0;
149
150 $self->{'num_processed_bytes'} = 0;
151}
152
153sub is_incremental_capable
154{
155 # By default we return 'no' as the answer
156 # Safer to assume non-incremental to start with, and then override in
157 # inherited classes that are.
158
159 return 0;
160}
161
162sub get_num_docs {
163 my $self = shift (@_);
164
165 return $self->{'num_docs'};
166}
167
168sub get_num_sections {
169 my $self = shift (@_);
170
171 return $self->{'num_sections'};
172}
173
174# num_bytes is the actual number of bytes in the collection
175# this is normally the same as what's processed during text compression
176sub get_num_bytes {
177 my $self = shift (@_);
178
179 return $self->{'num_bytes'};
180}
181
182# num_processed_bytes is the number of bytes actually passed
183# to mg for the current index
184sub get_num_processed_bytes {
185 my $self = shift (@_);
186
187 return $self->{'num_processed_bytes'};
188}
189
190sub set_output_handle {
191 my $self = shift (@_);
192 my ($handle) = @_;
193
194 $self->{'output_handle'} = $handle;
195}
196
197
198sub set_mode {
199 my $self = shift (@_);
200 my ($mode) = @_;
201
202 $self->{'mode'} = $mode;
203}
204
205sub get_mode {
206 my $self = shift (@_);
207
208 return $self->{'mode'};
209}
210
211sub set_assocdir {
212 my $self = shift (@_);
213 my ($assocdir) = @_;
214
215 $self->{'assocdir'} = $assocdir;
216}
217
218sub set_dontgdbm {
219 my $self = shift (@_);
220 my ($dontgdbm) = @_;
221
222 $self->{'dontgdbm'} = $dontgdbm;
223}
224
225sub set_index {
226 my $self = shift (@_);
227 my ($index, $indexexparr) = @_;
228
229 $self->{'index'} = $index;
230 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
231}
232
233sub set_index_languages {
234 my $self = shift (@_);
235 my ($lang_meta, $langarr) = @_;
236 $self->{'lang_meta'} = $lang_meta;
237 $self->{'langarr'} = $langarr;
238}
239
240sub get_index {
241 my $self = shift (@_);
242
243 return $self->{'index'};
244}
245
246sub set_classifiers {
247 my $self = shift (@_);
248 my ($classifiers) = @_;
249
250 $self->{'classifiers'} = $classifiers;
251}
252
253sub set_indexing_text {
254 my $self = shift (@_);
255 my ($indexing_text) = @_;
256
257 $self->{'indexing_text'} = $indexing_text;
258}
259
260sub get_indexing_text {
261 my $self = shift (@_);
262
263 return $self->{'indexing_text'};
264}
265
266sub set_store_text {
267 my $self = shift (@_);
268 my ($store_text) = @_;
269
270 $self->{'store_text'} = $store_text;
271}
272sub get_doc_list {
273 my $self = shift(@_);
274
275 return @{$self->{'doclist'}};
276}
277
278# the standard gdbm level is section, but you may want to change it to document
279sub set_gdbm_level {
280 my $self= shift (@_);
281 my ($gdbm_level) = @_;
282
283 $self->{'gdbm_level'} = $gdbm_level;
284}
285
286sub set_sections_index_document_metadata {
287 my $self= shift (@_);
288 my ($index_type) = @_;
289
290 $self->{'sections_index_document_metadata'} = $index_type;
291}
292sub process {
293 my $self = shift (@_);
294 my $method = $self->{'mode'};
295
296 $self->$method(@_);
297}
298
299
300
301sub infodb_metadata_stats
302{
303 my $self = shift (@_);
304 my ($field) = @_;
305
306 # Keep some statistics relating to metadata sets used and
307 # frequency of particular metadata fields within each set
308
309 # Union of metadata prefixes and frequency of fields
310 # (both scoped for this document alone, and across whole collection)
311
312 if ($field =~ m/^(.+)\.(.*)$/) {
313 my $prefix = $1;
314 my $core_field = $2;
315
316 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
317 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
318 }
319 elsif ($field =~ m/^[[:upper:]]/) {
320 # implicit 'ex' metadata set
321
322 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
323 $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
324 }
325
326}
327
328
329sub infodb {
330 my $self = shift (@_);
331 my ($doc_obj, $filename) = @_;
332 my $handle = $self->{'output_handle'};
333
334 my $doctype = $doc_obj->get_doc_type();
335
336 # only output this document if it is a "indexed_doc" or "info_doc" (GDBM database only) document
337 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
338
339 my $archivedir = "";
340
341 if (defined $filename)
342 {
343 # doc_obj derived directly from file
344
345 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
346 $dir = "" unless defined $dir;
347 $dir =~ s/\\/\//g;
348 $dir =~ s/^\/+//;
349 $dir =~ s/\/+$//;
350
351 $archivedir = $dir;
352
353 # resolve the final filenames of the files associated with this document
354 $self->assoc_files ($doc_obj, $archivedir);
355 }
356 else
357 {
358 # doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
359 my $top_section = $doc_obj->get_top_section();
360 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
361 }
362
363
364 #GRB: moved 1/06/2004 from GRB01062004
365 #add this document to the browse structure
366 push(@{$self->{'doclist'}},$doc_obj->get_OID())
367 unless ($doctype eq "classification");
368
369 # classify this document
370 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
371 #GRB: end of moved block
372
373 # this is another document
374 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
375
376 # is this a paged or a hierarchical document
377 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
378
379 my $section = $doc_obj->get_top_section ();
380 my $doc_OID = $doc_obj->get_OID();
381 my $first = 1;
382 my $url = "";
383
384 $self->{'doc_mdprefix_fields'} = {};
385
386 while (defined $section) {
387 # update a few statistics
388 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
389 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
390
391 # output the section name
392 if ($section eq "") { print $handle "[$doc_OID]\n"; }
393 else { print $handle "[$doc_OID.$section]\n"; }
394
395 # output the fact that this document is a document (unless doctype
396 # has been set to something else from within a plugin
397 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
398 if (!defined $dtype || $dtype !~ /\w/) {
399 print $handle "<doctype>doc\n";
400 }
401
402 # Output whether this node contains text
403 #
404 # If doc_obj reconstructed from GDBM file then no need to
405 # explicitly add <hastxt> as this is preserved as metadata when
406 # the GDBM file is loaded in
407
408 if (defined $filename)
409 {
410 # doc_obj derived directly from file
411 if ($doc_obj->get_text_length($section) > 0) {
412 print $handle "<hastxt>1\n";
413 } else {
414 print $handle "<hastxt>0\n";
415 }
416 }
417
418 # output all the section metadata
419 my $metadata = $doc_obj->get_all_metadata ($section);
420 foreach my $pair (@$metadata) {
421 my ($field, $value) = (@$pair);
422
423 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
424 defined $value && $value ne "") {
425
426 # escape problematic stuff
427 $value =~ s/\\/\\\\/g;
428 $value =~ s/\n/\\n/g;
429 $value =~ s/\r/\\r/g;
430 if ($value =~ /-{70,}/) {
431 # if value contains 70 or more hyphens in a row we need
432 # to escape them to prevent txt2db from treating them
433 # as a separator
434 $value =~ s/-/&\#045;/gi;
435 }
436
437 # special case for URL metadata
438 if ($field =~ /^URL$/i) {
439 $url .= "[$value]\n";
440 if ($section eq "") {$url .= "<section>$doc_OID\n";}
441 else {$url .= "<section>$doc_OID.$section\n";}
442 $url .= '-' x 70 . "\n";
443 }
444
445 if (!defined $self->{'dontgdbm'}->{$field}) {
446 print $handle "<$field>$value\n";
447
448 if ($section eq "")
449 {
450 $self->infodb_metadata_stats($field);
451 }
452 }
453 }
454 }
455
456 if ($section eq "")
457 {
458 my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
459
460 foreach my $prefix (keys %$doc_mdprefix_fields)
461 {
462 print $handle "<metadataset>$prefix\n";
463
464 foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
465 {
466 my $val = $doc_mdprefix_fields->{$prefix}->{$field};
467
468 print $handle "<metadatalist-$prefix>$field\n";
469 print $handle "<metadatafreq-$prefix-$field>$val\n";
470 }
471
472 }
473 }
474
475 # If doc_obj reconstructed from GDBM file then no need to
476 # explicitly add <archivedir> as this is preserved as metadata when
477 # the GDBM file is loaded in
478
479 if (defined $filename)
480 {
481 # output archivedir if at top level
482 if ($section eq $doc_obj->get_top_section()) {
483 print $handle "<archivedir>$archivedir\n";
484 }
485 }
486
487 # output document display type
488 if ($first) {
489 print $handle "<thistype>$thistype\n";
490 }
491
492
493 if ($self->{'gdbm_level'} eq "document") {
494 # doc num is num_docs not num_sections
495 # output the matching document number
496 print $handle "<docnum>$self->{'num_docs'}\n";
497
498 } else {
499 # output a list of children
500 my $children = $doc_obj->get_children ($section);
501 if (scalar(@$children) > 0) {
502 print $handle "<childtype>$childtype\n";
503 print $handle "<contains>";
504 my $firstchild = 1;
505 foreach my $child (@$children) {
506 print $handle ";" unless $firstchild;
507 $firstchild = 0;
508 if ($child =~ /^.*?\.(\d+)$/) {
509 print $handle "\".$1";
510 } else {
511 print $handle "\".$child";
512 }
513# if ($child eq "") { print $handle "$doc_OID"; }
514# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
515# else { print $handle "$doc_OID.$section.$child"; }
516 }
517 print $handle "\n";
518 }
519 #output the matching doc number
520 print $handle "<docnum>$self->{'num_sections'}\n";
521
522 }
523
524 print $handle '-' x 70, "\n";
525
526
527 # output a database entry for the document number
528 if ($self->{'gdbm_level'} eq "document") {
529 print $handle "[$self->{'num_docs'}]\n";
530 print $handle "<section>$doc_OID\n";
531 }
532 else {
533 print $handle "[$self->{'num_sections'}]\n";
534 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
535 else { print $handle "<section>$doc_OID.$section\n"; }
536 }
537 print $handle '-' x 70, "\n";
538
539 # output entry for url
540 if ($url ne "") {
541 print $handle $url;
542 }
543
544 $first = 0;
545 $section = $doc_obj->get_next_section($section);
546 last if ($self->{'gdbm_level'} eq "document"); # if no sections wanted, only gdbm the docs
547 }
548
549 #GRB01062004: see code above moved from here
550}
551
552
553sub text {
554 my $self = shift (@_);
555 my ($doc_obj) = @_;
556
557 my $handle = $self->{'outhandle'};
558 print $handle "basebuildproc::text function must be implemented in sub classes\n";
559 die "\n";
560}
561
562# should the document be indexed - according to the subcollection and language
563# specification.
564sub is_subcollection_doc {
565 my $self = shift (@_);
566 my ($doc_obj) = @_;
567
568 my $indexed_doc = 1;
569 foreach my $indexexp (@{$self->{'indexexparr'}}) {
570 $indexed_doc = 0;
571 my ($field, $exp, $options) = split /\//, $indexexp;
572 if (defined ($field) && defined ($exp)) {
573 my ($bool) = $field =~ /^(.)/;
574 $field =~ s/^.// if $bool eq '!';
575 my @metadata_values;
576 if ($field =~ /^filename$/i) {
577 push(@metadata_values, $doc_obj->get_source_filename());
578 }
579 else {
580 @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
581 }
582 next unless @metadata_values;
583 foreach my $metadata_value (@metadata_values) {
584 if ($bool eq '!') {
585 if ($options =~ /^i$/i) {
586 if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
587 } else {
588 if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
589 }
590 } else {
591 if ($options =~ /^i$/i) {
592 if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
593 } else {
594 if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
595 }
596 }
597 }
598
599 last if ($indexed_doc == 1);
600 }
601 }
602
603 # if this doc is so far in the sub collection, and we have lang info,
604 # now we check the languages to see if it matches
605 if($indexed_doc && defined $self->{'lang_meta'}) {
606 $indexed_doc = 0;
607 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
608 if (defined $field) {
609 foreach my $lang (@{$self->{'langarr'}}) {
610 my ($bool) = $lang =~ /^(.)/;
611 if ($bool eq '!') {
612 $lang =~ s/^.//;
613 if ($field !~ /$lang/) {
614 $indexed_doc = 1; last;
615 }
616 } else {
617 if ($field =~ /$lang/) {
618 $indexed_doc = 1; last;
619 }
620 }
621 }
622 }
623 }
624 return $indexed_doc;
625
626}
627
628# use 'Paged' if document has no more than 2 levels
629# and each section at second level has a number for
630# Title metadata
631# also use Paged if gsdlthistype metadata is set to Paged
632sub get_document_type {
633 my $self = shift (@_);
634 my ($doc_obj) = @_;
635
636 my $thistype = "VList";
637 my $childtype = "VList";
638 my $title;
639 my @tmp = ();
640
641 my $section = $doc_obj->get_top_section ();
642
643 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
644 if (defined $gsdlthistype) {
645 if ($gsdlthistype eq "Paged") {
646 $childtype = "Paged";
647 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
648 $thistype = "Paged";
649 } else {
650 $thistype = "Invisible";
651 }
652
653 return ($thistype, $childtype);
654 } elsif ($gsdlthistype eq "Hierarchy") {
655 return ($thistype, $childtype); # use VList, VList
656 }
657 }
658 my $first = 1;
659 while (defined $section) {
660 @tmp = split /\./, $section;
661 if (scalar(@tmp) > 1) {
662 return ($thistype, $childtype);
663 }
664 if (!$first) {
665 $title = $doc_obj->get_metadata_element ($section, "Title");
666 if (!defined $title || $title !~ /^\d+$/) {
667 return ($thistype, $childtype);
668 }
669 }
670 $first = 0;
671 $section = $doc_obj->get_next_section($section);
672 }
673 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
674 $thistype = "Paged";
675 } else {
676 $thistype = "Invisible";
677 }
678 $childtype = "Paged";
679 return ($thistype, $childtype);
680}
681
682sub assoc_files() {
683 my $self = shift (@_);
684 my ($doc_obj, $archivedir) = @_;
685 my ($afile);
686
687 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
688 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
689 # if assoc file starts with a slash, we put it relative to the assoc
690 # dir, otherwise it is relative to the HASH... directory
691 if ($assoc_file->[1] =~ m@^[/\\]@) {
692 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
693 } else {
694 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
695 }
696 &util::hard_link ($assoc_file->[0], $afile);
697 }
698}
699
Note: See TracBrowser for help on using the repository browser.