source: trunk/gsdl/perllib/mgppbuildproc.pm@ 5608

Last change on this file since 5608 was 4811, checked in by kjdon, 21 years ago

levels are now specified using upper or lower case, eg Section or section. if levels aren't specified, use document, otherwise use only what is specified eg levels section will only give section level. we use Doc, Sec, Para when passing the stuff to mgpp. the build.cfg file now contains indexlevels and textlevel entries - these give the actual names used by mgpp, and mean that the c++ code no longer has to assume them. collection meta can be specified for the levels, otherwise _textdocument_, _textsection_ and _textparagraph_ will be used.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.0 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32eval {require bytes};
33
34use classify;
35use doc;
36use docproc;
37use util;
38
39
40BEGIN {
41 @ISA = ('docproc');
42}
43
44#this must be the same as in mgppbuilder
45%level_map = ('document'=>'Doc',
46 'section'=>'Sec',
47 'paragraph'=>'Para');
48
49sub new {
50 my ($class, $collection, $source_dir, $build_dir,
51 $verbosity, $outhandle) = @_;
52 my $self = new docproc ();
53
54 # outhandle is where all the debugging info goes
55 # output_handle is where the output of the plugins is piped
56 # to (i.e. mg, gdbm etc.)
57 $outhandle = STDERR unless defined $outhandle;
58
59 $self->{'collection'} = $collection;
60 $self->{'source_dir'} = $source_dir;
61 $self->{'build_dir'} = $build_dir;
62 $self->{'verbosity'} = $verbosity;
63 $self->{'classifiers'} = [];
64 $self->{'mode'} = "text";
65 $self->{'assocdir'} = $build_dir;
66 $self->{'dontgdbm'} = {};
67 $self->{'index'} = "text";
68 $self->{'indexexparr'} = [];
69 $self->{'output_handle'} = "STDOUT";
70 $self->{'num_docs'} = 0;
71 $self->{'num_sections'} = 0;
72 $self->{'num_bytes'} = 0;
73 $self->{'num_processed_bytes'} = 0;
74 $self->{'store_text'} = 1;
75 $self->{'outhandle'} = $outhandle;
76
77 #used by browse interface
78 $self->{'doclist'} = [];
79
80 $self->{'indexing_text'} = 0;
81
82 #new ones for mgpp
83 $self->{'dontindex'} = {};
84 $self->{'indexfieldmap'} = {};
85 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
86 $self->{'strip_html'}=1;
87
88
89 return bless $self, $class;
90}
91
92sub reset {
93 my $self = shift (@_);
94
95 $self->{'num_docs'} = 0;
96 $self->{'num_sections'} = 0;
97 $self->{'num_processed_bytes'} = 0;
98 $self->{'num_bytes'} = 0;
99}
100
101sub get_num_docs {
102 my $self = shift (@_);
103
104 return $self->{'num_docs'};
105}
106
107sub get_num_sections {
108 my $self = shift (@_);
109
110 return $self->{'num_sections'};
111}
112
113# num_bytes is the actual number of bytes in the collection
114# this is normally the same as what's processed during text compression
115sub get_num_bytes {
116 my $self = shift (@_);
117
118 return $self->{'num_bytes'};
119}
120
121# num_processed_bytes is the number of bytes actually passed
122# to mgpp for the current index
123sub get_num_processed_bytes {
124 my $self = shift (@_);
125
126 return $self->{'num_processed_bytes'};
127}
128
129sub set_output_handle {
130 my $self = shift (@_);
131 my ($handle) = @_;
132
133 $self->{'output_handle'} = $handle;
134}
135
136sub set_mode {
137 my $self = shift (@_);
138 my ($mode) = @_;
139
140 $self->{'mode'} = $mode;
141}
142
143sub set_assocdir {
144 my $self = shift (@_);
145 my ($assocdir) = @_;
146
147 $self->{'assocdir'} = $assocdir;
148}
149
150sub set_dontgdbm {
151 my $self = shift (@_);
152 my ($dontgdbm) = @_;
153
154 $self->{'dontgdbm'} = $dontgdbm;
155}
156
157sub set_index {
158 my $self = shift (@_);
159 my ($index, $indexexparr) = @_;
160
161 $self->{'index'} = $index;
162 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
163}
164
165sub get_index {
166 my $self = shift (@_);
167
168 return $self->{'index'};
169}
170
171sub set_classifiers {
172 my $self = shift (@_);
173 my ($classifiers) = @_;
174
175 $self->{'classifiers'} = $classifiers;
176}
177
178sub set_indexing_text {
179 my $self = shift (@_);
180 my ($indexing_text) = @_;
181
182 $self->{'indexing_text'} = $indexing_text;
183}
184
185sub get_indexing_text {
186 my $self = shift (@_);
187
188 return $self->{'indexing_text'};
189}
190
191sub set_store_text {
192 my $self = shift (@_);
193 my ($store_text) = @_;
194
195 $self->{'store_text'} = $store_text;
196}
197
198sub get_doc_list {
199 my $self = shift(@_);
200
201 return @{$self->{'doclist'}};
202}
203
204sub set_indexfieldmap {
205 my $self = shift (@_);
206 my ($indexmap) = @_;
207
208 $self->{'indexfieldmap'} = $indexmap;
209}
210
211sub get_indexfieldmap {
212 my $self = shift (@_);
213
214 return $self->{'indexfieldmap'};
215}
216
217sub set_levels {
218 my $self = shift (@_);
219 my ($levels) = @_;
220
221 $self->{'levels'} = $levels;
222}
223
224sub set_strip_html {
225 my $self = shift (@_);
226 my ($strip) = @_;
227 $self->{'strip_html'}=$strip;
228}
229
230sub process {
231 my $self = shift (@_);
232 my $method = $self->{'mode'};
233
234 $self->$method(@_);
235}
236
237# use 'Paged' if document has no more than 2 levels
238# and each section at second level has a number for
239# Title metadata
240sub get_document_type {
241 my $self = shift (@_);
242 my ($doc_obj) = @_;
243
244 my $thistype = "VList";
245 my $childtype = "VList";
246 my $title;
247 my @tmp = ();
248
249 my $section = $doc_obj->get_top_section ();
250 my $first = 1;
251 while (defined $section) {
252 @tmp = split /\./, $section;
253 if (scalar(@tmp) > 1) {
254 return ($thistype, $childtype);
255 }
256 if (!$first) {
257 $title = $doc_obj->get_metadata_element ($section, "Title");
258 if (!defined $title || $title !~ /^\d+$/) {
259 return ($thistype, $childtype);
260 }
261 }
262 $first = 0;
263 $section = $doc_obj->get_next_section($section);
264 }
265 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
266 $thistype = "Paged";
267 } else {
268 $thistype = "Invisible";
269 }
270 $childtype = "Paged";
271 return ($thistype, $childtype);
272}
273
274sub assoc_files {
275 my $self = shift (@_);
276 my ($doc_obj, $archivedir) = @_;
277 my ($afile);
278
279 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
280 # if assoc file starts with a slash, we put it relative to the assoc
281 # dir, otherwise it is relative to the HASH... directory
282 if ($assoc_file->[1] =~ m@^[/\\]@) {
283 $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
284 } else {
285 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
286 }
287 &util::hard_link ($assoc_file->[0], $afile);
288 }
289}
290
291sub infodb {
292 my $self = shift (@_);
293 my ($doc_obj, $filename) = @_;
294 my $handle = $self->{'output_handle'};
295
296 my $doctype = $doc_obj->get_doc_type();
297
298 # only output this document if it is one to be indexed
299 return if ($doctype ne "indexed_doc");
300
301 #if a Section level index is not built, the gdbm file should be at doc
302 #level not Section
303 my $docs_only = 1;
304 if ($self->{'levels'}->{'section'}) {
305 $docs_only = 0;
306 }
307
308 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
309 $archivedir = "" unless defined $archivedir;
310 $archivedir =~ s/\\/\//g;
311 $archivedir =~ s/^\/+//;
312 $archivedir =~ s/\/+$//;
313
314 $self->assoc_files ($doc_obj, $archivedir);
315
316 # this is another document
317 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
318
319 # is this a paged or a hierarchical document
320 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
321
322 my $section = $doc_obj->get_top_section ();
323 my $doc_OID = $doc_obj->get_OID();
324 my $first = 1;
325 my $url = "";
326 while (defined $section) {
327 # update a few statistics
328 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
329 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
330
331 # output the section name
332 if ($section eq "") { print $handle "[$doc_OID]\n"; }
333 else { print $handle "[$doc_OID.$section]\n"; }
334
335 # output the fact that this document is a document (unless doctype
336 # has been set to something else from within a plugin
337 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
338 if (!defined $dtype || $dtype !~ /\w/) {
339 print $handle "<doctype>doc\n";
340 }
341
342 # output whether this node contains text
343 if ($doc_obj->get_text_length($section) > 0) {
344 print $handle "<hastxt>1\n";
345 } else {
346 print $handle "<hastxt>0\n";
347 }
348
349 # output all the section metadata
350 my $metadata = $doc_obj->get_all_metadata ($section);
351 foreach my $pair (@$metadata) {
352 my ($field, $value) = (@$pair);
353
354 if ($field !~ /^(?:Identifier|Encoding|Language|gsdl)/
355 && defined $value && $value ne "") {
356
357 # escape problematic stuff
358 $value =~ s/\\/\\\\/g;
359 $value =~ s/\n/\\n/g;
360 $value =~ s/\r/\\r/g;
361
362 # special case for URL metadata
363 if ($field =~ /^URL$/i) {
364 $url .= "[$value]\n";
365 if ($section eq "") {$url .= "<section>$doc_OID\n";}
366 else {$url .= "<section>$doc_OID.$section\n";}
367 $url .= '-' x 70 . "\n";
368 }
369
370 if (!defined $self->{'dontgdbm'}->{$field}) {
371 print $handle "<$field>$value\n";
372 }
373 }
374 }
375
376 # output archivedir if at top level
377 if ($section eq $doc_obj->get_top_section()) {
378 print $handle "<archivedir>$archivedir\n";
379 }
380
381 # output document display type
382 if ($first) {
383 print $handle "<thistype>$thistype\n";
384 }
385
386 if (!$docs_only) {
387 # output a list of children
388 my $children = $doc_obj->get_children ($section);
389 if (scalar(@$children) > 0) {
390 print $handle "<childtype>$childtype\n";
391 print $handle "<contains>";
392 my $firstchild = 1;
393 foreach $child (@$children) {
394 print $handle ";" unless $firstchild;
395 $firstchild = 0;
396 if ($child =~ /^.*?\.(\d+)$/) {
397 print $handle "\".$1";
398 } else {
399 print $handle "\".$child";
400 }
401# if ($child eq "") { print $handle "$doc_OID"; }
402# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
403# else { print $handle "$doc_OID.$section.$child"; }
404 }
405 print $handle "\n";
406 }
407 #output the matching doc number
408 print $handle "<docnum>$self->{'num_sections'}\n";
409
410 } # if (!$docs_only)
411 else { #docs only, doc num is num_docs not num_sections
412 # output the matching document number
413 print $handle "<docnum>$self->{'num_docs'}\n";
414 }
415
416 print $handle '-' x 70, "\n";
417
418
419 # output a database entry for the document number
420 if ($docs_only) {
421 print $handle "[$self->{'num_docs'}]\n";
422 print $handle "<section>$doc_OID\n";
423 }
424 else {
425 print $handle "[$self->{'num_sections'}]\n";
426 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
427 else { print $handle "<section>$doc_OID.$section\n"; }
428 }
429 print $handle '-' x 70, "\n";
430
431 # output entry for url
432 if ($url ne "") {
433 print $handle $url;
434 }
435
436 $first = 0;
437 $section = $doc_obj->get_next_section($section);
438 last if ($docs_only); # if no sections wanted, only gdbm the docs
439 }
440
441 #add this document to the browse structure
442 push(@{$self->{'doclist'}},$doc_obj->get_OID())
443 unless ($doctype eq "classification");
444
445 # classify this document
446 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
447
448}
449
450#sub find_paragraphs {
451# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
452#}
453
454#this function strips the html tags from the doc if ($strip_html) and
455# if ($para) replaces <p> with <Paragraph> tags.
456# if both are false, the original text is returned
457#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
458#these tags
459sub preprocess_text {
460 my $self = shift (@_);
461 my ($text, $strip_html, $para) = @_;
462 my ($outtext) = "";
463 if ($strip_html) {
464 while ($text =~ /<([^>]*)>/ && $text ne "") {
465
466 $tag = $1;
467 $outtext .= $`." "; #add everything before the matched tag
468 $text = $'; #everything after the matched tag
469 if ($para && $tag =~ /^\s*p\s/i) {
470 $outtext .= $para;
471 }
472 elsif ($tag =~ /^pre$/) { # a pre tag
473 $text =~ /<\/pre>/; # find the closing pre tag
474 my $tmp_text = $`; #everything before the closing pre tag
475 $text = $'; #everything after the </pre>
476 $tmp_text =~ s/[<>]//g; # remove all < and >
477 $outtext.= $tmp_text . " ";
478 }
479 }
480
481 $outtext .= $text; # add any remaining text
482 return $outtext;
483 } #if strip_html
484
485 #if ($para) {
486 #$text =~ s/(<p\b)/$para$1/gi;
487 #return $text;
488 # }
489 return $text;
490}
491
492
493
494sub filter_text {
495 # $self->filter_text ($field, $new_text);
496 # don't want to do anything for this version, however,
497 # in a particular collection you might want to override
498 # this method to post-process certain fields depending on
499 # the field, or whether we are outputting it for indexing
500}
501
502sub text {
503 my $self = shift (@_);
504 my ($doc_obj) = @_;
505 my $handle = $self->{'output_handle'};
506 my $outhandle = $self->{'outhandle'};
507 my $indexed_doc = 1;
508
509 # only output this document if it is one to be indexed
510 return if ($doc_obj->get_doc_type() ne "indexed_doc");
511
512 # see if this document belongs to this subcollection
513 foreach my $indexexp (@{$self->{'indexexparr'}}) {
514 $indexed_doc = 0;
515 my ($field, $exp, $options) = split /\//, $indexexp;
516 if (defined ($field) && defined ($exp)) {
517 my ($bool) = $field =~ /^(.)/;
518 $field =~ s/^.// if $bool eq '!';
519 if ($field =~ /^filename$/i) {
520 $field = $doc_obj->get_source_filename();
521 } else {
522 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
523 }
524 next unless defined $field;
525 if ($bool eq '!') {
526 if ($options =~ /^i$/i) {
527 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
528 } else {
529 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
530 }
531 } else {
532 if ($options =~ /^i$/i) {
533 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
534 } else {
535 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
536 }
537 }
538 }
539 }
540
541 # this is another document
542 $self->{'num_docs'} += 1;
543
544 # get the parameters for the output
545 my ($fields) = $self->{'index'};
546
547 my ($documenttag) = "";
548 my($documentendtag) = "";
549 if ($self->{'levels'}->{'document'}) {
550 $documenttag = "\n<". %level_map->{'document'} . ">\n";
551 $documentendtag = "\n</". %level_map->{'document'} . ">\n";
552 }
553 my ($sectiontag) = "";
554 if ($self->{'levels'}->{'section'}) {
555 $sectiontag = "\n<". %level_map->{'section'} . ">\n";
556 }
557 my ($paratag) = "";
558 if ($self->{'levels'}->{'paragraph'}) {
559 if ($self->{'strip_html'}) {
560 $paratag = "<". %level_map->{'paragraph'} . ">";
561 } else {
562 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
563 }
564 }
565
566 my $doc_section = 0; # just for this document
567
568 my $text = $documenttag;
569
570 # get the text for this document
571 my $section = $doc_obj->get_top_section();
572 while (defined $section) {
573 # update a few statistics
574 $doc_section++;
575 $self->{'num_sections'} += 1;
576 $text .= "$sectiontag";
577
578 if ($indexed_doc) {
579 if ($self->{'indexing_text'}) {
580 $text .= "$paratag"; # only add para tags for indexing
581 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
582 }
583 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
584 foreach my $field (split (/,/, $fields)) {
585 # only deal with this field if it doesn't start with top or
586 # this is the first section
587 my $real_field = $field;
588 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
589 my $new_text = "";
590 my $tmp_text = "";
591 if ($real_field eq "text") {
592 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
593 $new_text .= "$paratag<TX>\n";
594 $tmp_text .= $doc_obj->get_text ($section);
595 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
596
597 $new_text .= "$tmp_text</TX>\n";
598 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
599 #$self->{'indexfields'}->{'TextOnly'} = 1;
600 #}
601 }
602 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
603 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
604 }
605 } else { # metadata field
606 if ($real_field eq "allfields") { #ignore
607 }
608 elsif ($real_field eq "metadata") { # insert all metadata
609 #except gsdl stuff
610 my $shortname = "";
611 my $metadata = $doc_obj->get_all_metadata ($section);
612 foreach $pair (@$metadata) {
613 my ($mfield, $mvalue) = (@$pair);
614 # check fields here, maybe others dont want - change to use dontindex!!
615 if ($mfield !~ /^(?:Identifier|Encoding|Language|gsdl)/
616 && $mfield ne "classifytype"
617 && $mfield ne "assocfilepath"
618 && defined $mvalue && $mvalue ne "") {
619
620 if (defined $self->{'indexfieldmap'}->{$mfield}) {
621 $shortname = $self->{'indexfieldmap'}->{$mfield};
622 }
623 else {
624 $shortname = $self->create_shortname($mfield);
625 $self->{'indexfieldmap'}->{$mfield} = $shortname;
626 $self->{'indexfieldmap'}->{$shortname} = 1;
627 }
628 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
629 if (!defined $self->{'indexfields'}->{$mfield}) {
630 $self->{'indexfields'}->{$mfield} = 1;
631 }
632 }
633 }
634
635 }
636 else { #individual metadata specified
637 my $shortname="";
638 #if (!defined $self->{'indexfields'}->{$real_field}) {
639 #$self->{'indexfields'}->{$real_field} = 1;
640 #}
641 if (defined $self->{'indexfieldmap'}->{$real_field}) {
642 $shortname = $self->{'indexfieldmap'}->{$real_field};
643 }
644 else {
645 $shortname = $self->create_shortname($real_field);
646 $self->{'indexfieldmap'}->{$real_field} = $shortname;
647 $self->{'indexfieldmap'}->{$shortname} = 1;
648 }
649 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
650 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
651 }
652 }
653
654 }
655
656 # filter the text
657 $self->filter_text ($field, $new_text);
658
659 $self->{'num_processed_bytes'} += length ($new_text);
660 $text .= "$new_text";
661 }
662 }
663 } # if (indexed_doc)
664
665 $section = $doc_obj->get_next_section($section);
666 } #while defined section
667 print $handle "$text\n$documentendtag";
668
669}
670
671#chooses the first two letters or digits for the shortname
672#now ignores non-letdig characters
673sub create_shortname {
674 $self = shift(@_);
675
676 my ($realname) = @_;
677 #take the first two chars
678 my $shortname;
679 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
680 $shortname = "$1$2";
681 } else {
682 # there aren't two letdig's in the field - try arbitrary combinations
683 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
684 $shortname = "AB";
685 }
686 $shortname =~ tr/a-z/A-Z/;
687
688 #if already used, take the first and third letdigs and so on
689 $count = 1;
690 while (defined $self->{'indexfieldmap'}->{$shortname}) {
691 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
692 $shortname = "$1$3";
693 $count++;
694 $shortname =~ tr/a-z/A-Z/;
695
696 }
697 else {
698 #remove up to and incl the first letdig
699 $realname =~ s/^[^\w]*\w//;
700 $count = 0;
701 }
702 }
703
704 return $shortname;
705}
706
7071;
708
Note: See TracBrowser for help on using the repository browser.