source: trunk/gsdl/perllib/basebuildproc.pm@ 12844

Last change on this file since 12844 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.9 KB
Line 
1###########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in gdbm database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use doc;
35use docproc;
36use util;
37
38BEGIN {
39 @basebuildproc::ISA = ('docproc');
40}
41
42sub new()
43 {
44 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
45 my $self = new docproc ();
46
47 # outhandle is where all the debugging info goes
48 # output_handle is where the output of the plugins is piped
49 # to (i.e. mg, gdbm etc.)
50 $outhandle = STDERR unless defined $outhandle;
51
52 $self->{'collection'} = $collection;
53 $self->{'source_dir'} = $source_dir;
54 $self->{'build_dir'} = $build_dir;
55 $self->{'keepold'} = $keepold;
56 $self->{'verbosity'} = $verbosity;
57 $self->{'outhandle'} = $outhandle;
58
59 $self->{'classifiers'} = [];
60 $self->{'mode'} = "text";
61 $self->{'assocdir'} = $build_dir;
62 $self->{'dontgdbm'} = {};
63
64 $self->{'index'} = "section:text";
65 $self->{'indexexparr'} = [];
66
67 my $found_num_data = 0;
68 my $buildconfigfile = undef;
69
70 if ($keepold) {
71 # For incremental building need to seed num_docs etc from values
72 # stored in build.cfg (if present)
73 print STDERR "Keepold!\n";
74 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
75 print STDERR "Build cfg: $buildconfigfile\n";
76 if (-e $buildconfigfile) {
77 $found_num_data = 1;
78 }
79 else {
80 # try the index dir
81 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
82 "index", "build.cfg");
83 print STDERR "Index cfg: $buildconfigfile\n";
84 if (-e $buildconfigfile) {
85 $found_num_data = 1;
86 }
87 }
88
89 }
90 #else
91 # {
92 # print STDERR "Removeold!\n";
93 # }
94
95 if ($found_num_data)
96 {
97 #print STDERR "Found_Num_Data!\n";
98 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101 $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105 }
106 else
107 {
108 #print STDERR "NOT Found_Num_Data!\n";
109 $self->{'starting_num_docs'} = 0;
110 $self->{'starting_num_sections'} = 0;
111 $self->{'starting_num_bytes'} = 0;
112 }
113
114 $self->{'output_handle'} = "STDOUT";
115 $self->{'num_docs'} = $self->{'starting_num_docs'};
116 $self->{'num_sections'} = $self->{'starting_num_sections'};
117 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119 $self->{'num_processed_bytes'} = 0;
120 $self->{'store_text'} = 1;
121
122 # what level (section/document) the gdbm database - indexer intersection is
123 $self->{'gdbm_level'} = "section";
124 #used by browse interface
125 $self->{'doclist'} = [];
126
127 $self->{'indexing_text'} = 0;
128
129 return bless $self, $class;
130
131}
132
133sub reset {
134 my $self = shift (@_);
135
136 $self->{'num_docs'} = $self->{'starting_num_docs'};
137 $self->{'num_sections'} = $self->{'starting_num_sections'};
138 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140 $self->{'num_processed_bytes'} = 0;
141}
142
143sub zero_reset {
144 my $self = shift (@_);
145
146 $self->{'num_docs'} = 0;
147 $self->{'num_sections'} = 0;
148 $self->{'num_bytes'} = 0;
149
150 $self->{'num_processed_bytes'} = 0;
151}
152
153sub is_incremental_capable
154{
155 # By default we return 'no' as the answer
156 # Safer to assume non-incremental to start with, and then override in
157 # inherited classes that are.
158
159 return 0;
160}
161
162sub get_num_docs {
163 my $self = shift (@_);
164
165 return $self->{'num_docs'};
166}
167
168sub get_num_sections {
169 my $self = shift (@_);
170
171 return $self->{'num_sections'};
172}
173
174# num_bytes is the actual number of bytes in the collection
175# this is normally the same as what's processed during text compression
176sub get_num_bytes {
177 my $self = shift (@_);
178
179 return $self->{'num_bytes'};
180}
181
182# num_processed_bytes is the number of bytes actually passed
183# to mg for the current index
184sub get_num_processed_bytes {
185 my $self = shift (@_);
186
187 return $self->{'num_processed_bytes'};
188}
189
190sub set_output_handle {
191 my $self = shift (@_);
192 my ($handle) = @_;
193
194 $self->{'output_handle'} = $handle;
195}
196
197
198sub set_mode {
199 my $self = shift (@_);
200 my ($mode) = @_;
201
202 $self->{'mode'} = $mode;
203}
204
205sub get_mode {
206 my $self = shift (@_);
207
208 return $self->{'mode'};
209}
210
211sub set_assocdir {
212 my $self = shift (@_);
213 my ($assocdir) = @_;
214
215 $self->{'assocdir'} = $assocdir;
216}
217
218sub set_dontgdbm {
219 my $self = shift (@_);
220 my ($dontgdbm) = @_;
221
222 $self->{'dontgdbm'} = $dontgdbm;
223}
224
225sub set_index {
226 my $self = shift (@_);
227 my ($index, $indexexparr) = @_;
228
229 $self->{'index'} = $index;
230 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
231}
232
233sub set_index_languages {
234 my $self = shift (@_);
235 my ($lang_meta, $langarr) = @_;
236 $self->{'lang_meta'} = $lang_meta;
237 $self->{'langarr'} = $langarr;
238}
239
240sub get_index {
241 my $self = shift (@_);
242
243 return $self->{'index'};
244}
245
246sub set_classifiers {
247 my $self = shift (@_);
248 my ($classifiers) = @_;
249
250 $self->{'classifiers'} = $classifiers;
251}
252
253sub set_indexing_text {
254 my $self = shift (@_);
255 my ($indexing_text) = @_;
256
257 $self->{'indexing_text'} = $indexing_text;
258}
259
260sub get_indexing_text {
261 my $self = shift (@_);
262
263 return $self->{'indexing_text'};
264}
265
266sub set_store_text {
267 my $self = shift (@_);
268 my ($store_text) = @_;
269
270 $self->{'store_text'} = $store_text;
271}
272sub get_doc_list {
273 my $self = shift(@_);
274
275 return @{$self->{'doclist'}};
276}
277
278# the standard gdbm level is section, but you may want to change it to document
279sub set_gdbm_level {
280 my $self= shift (@_);
281 my ($gdbm_level) = @_;
282
283 $self->{'gdbm_level'} = $gdbm_level;
284}
285
286sub set_sections_index_document_metadata {
287 my $self= shift (@_);
288 my ($index_type) = @_;
289
290 $self->{'sections_index_document_metadata'} = $index_type;
291}
292sub process {
293 my $self = shift (@_);
294 my $method = $self->{'mode'};
295
296 $self->$method(@_);
297}
298
299sub infodb {
300 my $self = shift (@_);
301 my ($doc_obj, $filename) = @_;
302 my $handle = $self->{'output_handle'};
303
304 my $doctype = $doc_obj->get_doc_type();
305
306 # only output this document if it is a "indexed_doc" or "info_doc" (GDBM database only) document
307 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
308
309 my $archivedir = "";
310
311 if (defined $filename)
312 {
313 # doc_obj derived directly from file
314
315 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
316 $dir = "" unless defined $dir;
317 $dir =~ s/\\/\//g;
318 $dir =~ s/^\/+//;
319 $dir =~ s/\/+$//;
320
321 $archivedir = $dir;
322
323 # resolve the final filenames of the files associated with this document
324 $self->assoc_files ($doc_obj, $archivedir);
325 }
326 else
327 {
328 # doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
329 my $top_section = $doc_obj->get_top_section();
330 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
331 }
332
333
334 #GRB: moved 1/06/2004 from GRB01062004
335 #add this document to the browse structure
336 push(@{$self->{'doclist'}},$doc_obj->get_OID())
337 unless ($doctype eq "classification");
338
339 # classify this document
340 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
341 #GRB: end of moved block
342
343 # this is another document
344 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
345
346 # is this a paged or a hierarchical document
347 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
348
349 my $section = $doc_obj->get_top_section ();
350 my $doc_OID = $doc_obj->get_OID();
351 my $first = 1;
352 my $url = "";
353 while (defined $section) {
354 # update a few statistics
355 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
356 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
357
358 # output the section name
359 if ($section eq "") { print $handle "[$doc_OID]\n"; }
360 else { print $handle "[$doc_OID.$section]\n"; }
361
362 # output the fact that this document is a document (unless doctype
363 # has been set to something else from within a plugin
364 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
365 if (!defined $dtype || $dtype !~ /\w/) {
366 print $handle "<doctype>doc\n";
367 }
368
369 # Output whether this node contains text
370 #
371 # If doc_obj reconstructed from GDBM file then no need to
372 # explicitly add <hastxt> as this is preserved as metadata when
373 # the GDBM file is loaded in
374
375 if (defined $filename)
376 {
377 # doc_obj derived directly from file
378 if ($doc_obj->get_text_length($section) > 0) {
379 print $handle "<hastxt>1\n";
380 } else {
381 print $handle "<hastxt>0\n";
382 }
383 }
384
385 # output all the section metadata
386 my $metadata = $doc_obj->get_all_metadata ($section);
387 foreach my $pair (@$metadata) {
388 my ($field, $value) = (@$pair);
389
390 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
391 defined $value && $value ne "") {
392
393 # escape problematic stuff
394 $value =~ s/\\/\\\\/g;
395 $value =~ s/\n/\\n/g;
396 $value =~ s/\r/\\r/g;
397 if ($value =~ /-{70,}/) {
398 # if value contains 70 or more hyphens in a row we need
399 # to escape them to prevent txt2db from treating them
400 # as a separator
401 $value =~ s/-/&\#045;/gi;
402 }
403
404 # special case for URL metadata
405 if ($field =~ /^URL$/i) {
406 $url .= "[$value]\n";
407 if ($section eq "") {$url .= "<section>$doc_OID\n";}
408 else {$url .= "<section>$doc_OID.$section\n";}
409 $url .= '-' x 70 . "\n";
410 }
411
412 if (!defined $self->{'dontgdbm'}->{$field}) {
413 print $handle "<$field>$value\n";
414 }
415 }
416 }
417
418
419 # If doc_obj reconstructed from GDBM file then no need to
420 # explicitly add <archivedir> as this is preserved as metadata when
421 # the GDBM file is loaded in
422
423 if (defined $filename)
424 {
425 # output archivedir if at top level
426 if ($section eq $doc_obj->get_top_section()) {
427 print $handle "<archivedir>$archivedir\n";
428 }
429 }
430
431 # output document display type
432 if ($first) {
433 print $handle "<thistype>$thistype\n";
434 }
435
436
437 if ($self->{'gdbm_level'} eq "document") {
438 # doc num is num_docs not num_sections
439 # output the matching document number
440 print $handle "<docnum>$self->{'num_docs'}\n";
441
442 } else {
443 # output a list of children
444 my $children = $doc_obj->get_children ($section);
445 if (scalar(@$children) > 0) {
446 print $handle "<childtype>$childtype\n";
447 print $handle "<contains>";
448 my $firstchild = 1;
449 foreach my $child (@$children) {
450 print $handle ";" unless $firstchild;
451 $firstchild = 0;
452 if ($child =~ /^.*?\.(\d+)$/) {
453 print $handle "\".$1";
454 } else {
455 print $handle "\".$child";
456 }
457# if ($child eq "") { print $handle "$doc_OID"; }
458# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
459# else { print $handle "$doc_OID.$section.$child"; }
460 }
461 print $handle "\n";
462 }
463 #output the matching doc number
464 print $handle "<docnum>$self->{'num_sections'}\n";
465
466 }
467
468 print $handle '-' x 70, "\n";
469
470
471 # output a database entry for the document number
472 if ($self->{'gdbm_level'} eq "document") {
473 print $handle "[$self->{'num_docs'}]\n";
474 print $handle "<section>$doc_OID\n";
475 }
476 else {
477 print $handle "[$self->{'num_sections'}]\n";
478 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
479 else { print $handle "<section>$doc_OID.$section\n"; }
480 }
481 print $handle '-' x 70, "\n";
482
483 # output entry for url
484 if ($url ne "") {
485 print $handle $url;
486 }
487
488 $first = 0;
489 $section = $doc_obj->get_next_section($section);
490 last if ($self->{'gdbm_level'} eq "document"); # if no sections wanted, only gdbm the docs
491 }
492
493 #GRB01062004: see code above moved from here
494}
495
496
497sub text {
498 my $self = shift (@_);
499 my ($doc_obj) = @_;
500
501 my $handle = $self->{'outhandle'};
502 print $handle "basebuildproc::text function must be implemented in sub classes\n";
503 die "\n";
504}
505
506# should the document be indexed - according to the subcollection and language
507# specification.
508sub is_subcollection_doc {
509 my $self = shift (@_);
510 my ($doc_obj) = @_;
511
512 my $indexed_doc = 1;
513 foreach my $indexexp (@{$self->{'indexexparr'}}) {
514 $indexed_doc = 0;
515 my ($field, $exp, $options) = split /\//, $indexexp;
516 if (defined ($field) && defined ($exp)) {
517 my ($bool) = $field =~ /^(.)/;
518 $field =~ s/^.// if $bool eq '!';
519 my @metadata_values;
520 if ($field =~ /^filename$/i) {
521 push(@metadata_values, $doc_obj->get_source_filename());
522 }
523 else {
524 @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
525 }
526 next unless @metadata_values;
527 foreach my $metadata_value (@metadata_values) {
528 if ($bool eq '!') {
529 if ($options =~ /^i$/i) {
530 if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
531 } else {
532 if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
533 }
534 } else {
535 if ($options =~ /^i$/i) {
536 if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
537 } else {
538 if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
539 }
540 }
541 }
542
543 last if ($indexed_doc == 1);
544 }
545 }
546
547 # if this doc is so far in the sub collection, and we have lang info,
548 # now we check the languages to see if it matches
549 if($indexed_doc && defined $self->{'lang_meta'}) {
550 $indexed_doc = 0;
551 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
552 if (defined $field) {
553 foreach my $lang (@{$self->{'langarr'}}) {
554 my ($bool) = $lang =~ /^(.)/;
555 if ($bool eq '!') {
556 $lang =~ s/^.//;
557 if ($field !~ /$lang/) {
558 $indexed_doc = 1; last;
559 }
560 } else {
561 if ($field =~ /$lang/) {
562 $indexed_doc = 1; last;
563 }
564 }
565 }
566 }
567 }
568 return $indexed_doc;
569
570}
571
572# use 'Paged' if document has no more than 2 levels
573# and each section at second level has a number for
574# Title metadata
575# also use Paged if gsdlthistype metadata is set to Paged
576sub get_document_type {
577 my $self = shift (@_);
578 my ($doc_obj) = @_;
579
580 my $thistype = "VList";
581 my $childtype = "VList";
582 my $title;
583 my @tmp = ();
584
585 my $section = $doc_obj->get_top_section ();
586
587 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
588 if (defined $gsdlthistype) {
589 if ($gsdlthistype eq "Paged") {
590 $childtype = "Paged";
591 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
592 $thistype = "Paged";
593 } else {
594 $thistype = "Invisible";
595 }
596
597 return ($thistype, $childtype);
598 } elsif ($gsdlthistype eq "Hierarchy") {
599 return ($thistype, $childtype); # use VList, VList
600 }
601 }
602 my $first = 1;
603 while (defined $section) {
604 @tmp = split /\./, $section;
605 if (scalar(@tmp) > 1) {
606 return ($thistype, $childtype);
607 }
608 if (!$first) {
609 $title = $doc_obj->get_metadata_element ($section, "Title");
610 if (!defined $title || $title !~ /^\d+$/) {
611 return ($thistype, $childtype);
612 }
613 }
614 $first = 0;
615 $section = $doc_obj->get_next_section($section);
616 }
617 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
618 $thistype = "Paged";
619 } else {
620 $thistype = "Invisible";
621 }
622 $childtype = "Paged";
623 return ($thistype, $childtype);
624}
625
626sub assoc_files() {
627 my $self = shift (@_);
628 my ($doc_obj, $archivedir) = @_;
629 my ($afile);
630
631 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
632 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
633 # if assoc file starts with a slash, we put it relative to the assoc
634 # dir, otherwise it is relative to the HASH... directory
635 if ($assoc_file->[1] =~ m@^[/\\]@) {
636 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
637 } else {
638 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
639 }
640 &util::hard_link ($assoc_file->[0], $afile);
641 }
642}
643
Note: See TracBrowser for help on using the repository browser.