source: trunk/gsdl/perllib/mgppbuildproc.pm@ 7645

Last change on this file since 7645 was 7090, checked in by sjboddie, 20 years ago

Fixed some minor perl syntax problems that were causing warning messages
when building mgpp collections.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.4 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32eval {require bytes};
33
34use classify;
35use doc;
36use docproc;
37use util;
38
39
40BEGIN {
41 @ISA = ('docproc');
42}
43
44#this must be the same as in mgppbuilder
45%level_map = ('document'=>'Doc',
46 'section'=>'Sec',
47 'paragraph'=>'Para');
48
49sub new {
50 my ($class, $collection, $source_dir, $build_dir,
51 $verbosity, $outhandle) = @_;
52 my $self = new docproc ();
53
54 # outhandle is where all the debugging info goes
55 # output_handle is where the output of the plugins is piped
56 # to (i.e. mg, gdbm etc.)
57 $outhandle = STDERR unless defined $outhandle;
58
59 $self->{'collection'} = $collection;
60 $self->{'source_dir'} = $source_dir;
61 $self->{'build_dir'} = $build_dir;
62 $self->{'verbosity'} = $verbosity;
63 $self->{'classifiers'} = [];
64 $self->{'mode'} = "text";
65 $self->{'assocdir'} = $build_dir;
66 $self->{'dontgdbm'} = {};
67 $self->{'index'} = "text";
68 $self->{'indexexparr'} = [];
69 $self->{'output_handle'} = "STDOUT";
70 $self->{'num_docs'} = 0;
71 $self->{'num_sections'} = 0;
72 $self->{'num_bytes'} = 0;
73 $self->{'num_processed_bytes'} = 0;
74 $self->{'store_text'} = 1;
75 $self->{'outhandle'} = $outhandle;
76
77 #used by browse interface
78 $self->{'doclist'} = [];
79
80 $self->{'indexing_text'} = 0;
81
82 #new ones for mgpp
83 $self->{'dontindex'} = {};
84 $self->{'indexfieldmap'} = {};
85 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
86 $self->{'strip_html'}=1;
87
88
89 return bless $self, $class;
90}
91
92sub reset {
93 my $self = shift (@_);
94
95 $self->{'num_docs'} = 0;
96 $self->{'num_sections'} = 0;
97 $self->{'num_processed_bytes'} = 0;
98 $self->{'num_bytes'} = 0;
99}
100
101sub get_num_docs {
102 my $self = shift (@_);
103
104 return $self->{'num_docs'};
105}
106
107sub get_num_sections {
108 my $self = shift (@_);
109
110 return $self->{'num_sections'};
111}
112
113# num_bytes is the actual number of bytes in the collection
114# this is normally the same as what's processed during text compression
115sub get_num_bytes {
116 my $self = shift (@_);
117
118 return $self->{'num_bytes'};
119}
120
121# num_processed_bytes is the number of bytes actually passed
122# to mgpp for the current index
123sub get_num_processed_bytes {
124 my $self = shift (@_);
125
126 return $self->{'num_processed_bytes'};
127}
128
129sub set_output_handle {
130 my $self = shift (@_);
131 my ($handle) = @_;
132
133 $self->{'output_handle'} = $handle;
134}
135
136sub set_mode {
137 my $self = shift (@_);
138 my ($mode) = @_;
139
140 $self->{'mode'} = $mode;
141}
142
143sub set_assocdir {
144 my $self = shift (@_);
145 my ($assocdir) = @_;
146
147 $self->{'assocdir'} = $assocdir;
148}
149
150sub set_dontgdbm {
151 my $self = shift (@_);
152 my ($dontgdbm) = @_;
153
154 $self->{'dontgdbm'} = $dontgdbm;
155}
156
157sub set_index {
158 my $self = shift (@_);
159 my ($index, $indexexparr) = @_;
160
161 $self->{'index'} = $index;
162 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
163}
164
165sub get_index {
166 my $self = shift (@_);
167
168 return $self->{'index'};
169}
170
171sub set_classifiers {
172 my $self = shift (@_);
173 my ($classifiers) = @_;
174
175 $self->{'classifiers'} = $classifiers;
176}
177
178sub set_indexing_text {
179 my $self = shift (@_);
180 my ($indexing_text) = @_;
181
182 $self->{'indexing_text'} = $indexing_text;
183}
184
185sub get_indexing_text {
186 my $self = shift (@_);
187
188 return $self->{'indexing_text'};
189}
190
191sub set_store_text {
192 my $self = shift (@_);
193 my ($store_text) = @_;
194
195 $self->{'store_text'} = $store_text;
196}
197
198sub get_doc_list {
199 my $self = shift(@_);
200
201 return @{$self->{'doclist'}};
202}
203
204sub set_indexfieldmap {
205 my $self = shift (@_);
206 my ($indexmap) = @_;
207
208 $self->{'indexfieldmap'} = $indexmap;
209}
210
211sub get_indexfieldmap {
212 my $self = shift (@_);
213
214 return $self->{'indexfieldmap'};
215}
216
217sub set_levels {
218 my $self = shift (@_);
219 my ($levels) = @_;
220
221 $self->{'levels'} = $levels;
222}
223
224sub set_strip_html {
225 my $self = shift (@_);
226 my ($strip) = @_;
227 $self->{'strip_html'}=$strip;
228}
229
230sub process {
231 my $self = shift (@_);
232 my $method = $self->{'mode'};
233
234 $self->$method(@_);
235}
236
237# use 'Paged' if document has no more than 2 levels
238# and each section at second level has a number for
239# Title metadata
240# also use Paged if gsdlthistype metadata is set to Paged
241sub get_document_type {
242 my $self = shift (@_);
243 my ($doc_obj) = @_;
244
245 my $thistype = "VList";
246 my $childtype = "VList";
247 my $title;
248 my @tmp = ();
249
250 my $section = $doc_obj->get_top_section ();
251
252 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
253 if (defined $gsdlthistype) {
254 if ($gsdlthistype eq "Paged") {
255 $thistype = "Paged";
256 $childtype = "Paged";
257 return ($thistype, $childtype);
258 } elsif ($gsdlthistype eq "Hierarchy") {
259 return ($thistype, $childtype); # use VList, VList
260 }
261 }
262 my $first = 1;
263 while (defined $section) {
264 @tmp = split /\./, $section;
265 if (scalar(@tmp) > 1) {
266 return ($thistype, $childtype);
267 }
268 if (!$first) {
269 $title = $doc_obj->get_metadata_element ($section, "Title");
270 if (!defined $title || $title !~ /^\d+$/) {
271 return ($thistype, $childtype);
272 }
273 }
274 $first = 0;
275 $section = $doc_obj->get_next_section($section);
276 }
277 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
278 $thistype = "Paged";
279 } else {
280 $thistype = "Invisible";
281 }
282 $childtype = "Paged";
283 return ($thistype, $childtype);
284}
285
286sub assoc_files {
287 my $self = shift (@_);
288 my ($doc_obj, $archivedir) = @_;
289 my ($afile);
290
291 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
292 # if assoc file starts with a slash, we put it relative to the assoc
293 # dir, otherwise it is relative to the HASH... directory
294 if ($assoc_file->[1] =~ m@^[/\\]@) {
295 $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
296 } else {
297 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
298 }
299 &util::hard_link ($assoc_file->[0], $afile);
300 }
301}
302
303sub infodb {
304 my $self = shift (@_);
305 my ($doc_obj, $filename) = @_;
306 my $handle = $self->{'output_handle'};
307
308 my $doctype = $doc_obj->get_doc_type();
309
310 # only output this document if it is one to be indexed
311 return if ($doctype ne "indexed_doc");
312
313 #if a Section level index is not built, the gdbm file should be at doc
314 #level not Section
315 my $docs_only = 1;
316 if ($self->{'levels'}->{'section'}) {
317 $docs_only = 0;
318 }
319
320 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
321 $archivedir = "" unless defined $archivedir;
322 $archivedir =~ s/\\/\//g;
323 $archivedir =~ s/^\/+//;
324 $archivedir =~ s/\/+$//;
325
326 $self->assoc_files ($doc_obj, $archivedir);
327
328 # this is another document
329 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
330
331 # is this a paged or a hierarchical document
332 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
333
334 my $section = $doc_obj->get_top_section ();
335 my $doc_OID = $doc_obj->get_OID();
336 my $first = 1;
337 my $url = "";
338 while (defined $section) {
339 # update a few statistics
340 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
341 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
342
343 # output the section name
344 if ($section eq "") { print $handle "[$doc_OID]\n"; }
345 else { print $handle "[$doc_OID.$section]\n"; }
346
347 # output the fact that this document is a document (unless doctype
348 # has been set to something else from within a plugin
349 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
350 if (!defined $dtype || $dtype !~ /\w/) {
351 print $handle "<doctype>doc\n";
352 }
353
354 # output whether this node contains text
355 if ($doc_obj->get_text_length($section) > 0) {
356 print $handle "<hastxt>1\n";
357 } else {
358 print $handle "<hastxt>0\n";
359 }
360
361 # output all the section metadata
362 my $metadata = $doc_obj->get_all_metadata ($section);
363 foreach my $pair (@$metadata) {
364 my ($field, $value) = (@$pair);
365
366 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
367 defined $value && $value ne "") {
368
369 # escape problematic stuff
370 $value =~ s/\\/\\\\/g;
371 $value =~ s/\n/\\n/g;
372 $value =~ s/\r/\\r/g;
373
374 # special case for URL metadata
375 if ($field =~ /^URL$/i) {
376 $url .= "[$value]\n";
377 if ($section eq "") {$url .= "<section>$doc_OID\n";}
378 else {$url .= "<section>$doc_OID.$section\n";}
379 $url .= '-' x 70 . "\n";
380 }
381
382 if (!defined $self->{'dontgdbm'}->{$field}) {
383 print $handle "<$field>$value\n";
384 }
385 }
386 }
387
388 # output archivedir if at top level
389 if ($section eq $doc_obj->get_top_section()) {
390 print $handle "<archivedir>$archivedir\n";
391 }
392
393 # output document display type
394 if ($first) {
395 print $handle "<thistype>$thistype\n";
396 }
397
398 if (!$docs_only) {
399 # output a list of children
400 my $children = $doc_obj->get_children ($section);
401 if (scalar(@$children) > 0) {
402 print $handle "<childtype>$childtype\n";
403 print $handle "<contains>";
404 my $firstchild = 1;
405 foreach $child (@$children) {
406 print $handle ";" unless $firstchild;
407 $firstchild = 0;
408 if ($child =~ /^.*?\.(\d+)$/) {
409 print $handle "\".$1";
410 } else {
411 print $handle "\".$child";
412 }
413# if ($child eq "") { print $handle "$doc_OID"; }
414# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
415# else { print $handle "$doc_OID.$section.$child"; }
416 }
417 print $handle "\n";
418 }
419 #output the matching doc number
420 print $handle "<docnum>$self->{'num_sections'}\n";
421
422 } # if (!$docs_only)
423 else { #docs only, doc num is num_docs not num_sections
424 # output the matching document number
425 print $handle "<docnum>$self->{'num_docs'}\n";
426 }
427
428 print $handle '-' x 70, "\n";
429
430
431 # output a database entry for the document number
432 if ($docs_only) {
433 print $handle "[$self->{'num_docs'}]\n";
434 print $handle "<section>$doc_OID\n";
435 }
436 else {
437 print $handle "[$self->{'num_sections'}]\n";
438 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
439 else { print $handle "<section>$doc_OID.$section\n"; }
440 }
441 print $handle '-' x 70, "\n";
442
443 # output entry for url
444 if ($url ne "") {
445 print $handle $url;
446 }
447
448 $first = 0;
449 $section = $doc_obj->get_next_section($section);
450 last if ($docs_only); # if no sections wanted, only gdbm the docs
451 }
452
453 #add this document to the browse structure
454 push(@{$self->{'doclist'}},$doc_obj->get_OID())
455 unless ($doctype eq "classification");
456
457 # classify this document
458 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
459
460}
461
462#sub find_paragraphs {
463# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
464#}
465
466#this function strips the html tags from the doc if ($strip_html) and
467# if ($para) replaces <p> with <Paragraph> tags.
468# if both are false, the original text is returned
469#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
470#these tags
471sub preprocess_text {
472 my $self = shift (@_);
473 my ($text, $strip_html, $para) = @_;
474 my ($outtext) = "";
475 if ($strip_html) {
476 while ($text =~ /<([^>]*)>/ && $text ne "") {
477
478 $tag = $1;
479 $outtext .= $`." "; #add everything before the matched tag
480 $text = $'; #everything after the matched tag
481 if ($para && $tag =~ /^\s*p\s/i) {
482 $outtext .= $para;
483 }
484 elsif ($tag =~ /^pre$/) { # a pre tag
485 $text =~ /<\/pre>/; # find the closing pre tag
486 my $tmp_text = $`; #everything before the closing pre tag
487 $text = $'; #everything after the </pre>
488 $tmp_text =~ s/[<>]//g; # remove all < and >
489 $outtext.= $tmp_text . " ";
490 }
491 }
492
493 $outtext .= $text; # add any remaining text
494 return $outtext;
495 } #if strip_html
496
497 #if ($para) {
498 #$text =~ s/(<p\b)/$para$1/gi;
499 #return $text;
500 # }
501 return $text;
502}
503
504
505
506sub filter_text {
507 # $self->filter_text ($field, $new_text);
508 # don't want to do anything for this version, however,
509 # in a particular collection you might want to override
510 # this method to post-process certain fields depending on
511 # the field, or whether we are outputting it for indexing
512}
513
514sub text {
515 my $self = shift (@_);
516 my ($doc_obj) = @_;
517 my $handle = $self->{'output_handle'};
518 my $outhandle = $self->{'outhandle'};
519 my $indexed_doc = 1;
520
521 # only output this document if it is one to be indexed
522 return if ($doc_obj->get_doc_type() ne "indexed_doc");
523
524 # see if this document belongs to this subcollection
525 foreach my $indexexp (@{$self->{'indexexparr'}}) {
526 $indexed_doc = 0;
527 my ($field, $exp, $options) = split /\//, $indexexp;
528 if (defined ($field) && defined ($exp)) {
529 my ($bool) = $field =~ /^(.)/;
530 $field =~ s/^.// if $bool eq '!';
531 if ($field =~ /^filename$/i) {
532 $field = $doc_obj->get_source_filename();
533 } else {
534 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
535 }
536 next unless defined $field;
537 if ($bool eq '!') {
538 if ($options =~ /^i$/i) {
539 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
540 } else {
541 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
542 }
543 } else {
544 if ($options =~ /^i$/i) {
545 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
546 } else {
547 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
548 }
549 }
550 }
551 }
552
553 # this is another document
554 $self->{'num_docs'} += 1;
555
556 # get the parameters for the output
557 # split on : just in case there is subcoll and lang stuff
558 my ($fields) = split (/:/, $self->{'index'});
559
560 my ($documenttag) = "";
561 my($documentendtag) = "";
562 if ($self->{'levels'}->{'document'}) {
563 $documenttag = "\n<". $level_map{'document'} . ">\n";
564 $documentendtag = "\n</". $level_map{'document'} . ">\n";
565 }
566 my ($sectiontag) = "";
567 if ($self->{'levels'}->{'section'}) {
568 $sectiontag = "\n<". $level_map{'section'} . ">\n";
569 }
570 my ($paratag) = "";
571 if ($self->{'levels'}->{'paragraph'}) {
572 if ($self->{'strip_html'}) {
573 $paratag = "<". $level_map{'paragraph'} . ">";
574 } else {
575 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
576 }
577 }
578
579 my $doc_section = 0; # just for this document
580
581 my $text = $documenttag;
582
583 # get the text for this document
584 my $section = $doc_obj->get_top_section();
585 while (defined $section) {
586 # update a few statistics
587 $doc_section++;
588 $self->{'num_sections'} += 1;
589 $text .= "$sectiontag";
590
591 if ($indexed_doc) {
592 if ($self->{'indexing_text'}) {
593 $text .= "$paratag"; # only add para tags for indexing
594 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
595 }
596 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
597 foreach my $field (split (/,/, $fields)) {
598 # only deal with this field if it doesn't start with top or
599 # this is the first section
600 my $real_field = $field;
601 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
602 my $new_text = "";
603 my $tmp_text = "";
604 if ($real_field eq "text") {
605 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
606 $new_text .= "$paratag<TX>\n";
607 $tmp_text .= $doc_obj->get_text ($section);
608 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
609
610 $new_text .= "$tmp_text</TX>\n";
611 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
612 #$self->{'indexfields'}->{'TextOnly'} = 1;
613 #}
614 }
615 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
616 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
617 }
618 } else { # metadata field
619 if ($real_field eq "allfields") { #ignore
620 }
621 elsif ($real_field eq "metadata") { # insert all metadata
622 #except gsdl stuff
623 my $shortname = "";
624 my $metadata = $doc_obj->get_all_metadata ($section);
625 foreach $pair (@$metadata) {
626 my ($mfield, $mvalue) = (@$pair);
627 # check fields here, maybe others dont want - change to use dontindex!!
628 if ($mfield ne "Identifier"
629 && $mfield !~ /^gsdl/
630 && $mfield ne "classifytype"
631 && $mfield ne "assocfilepath"
632 && defined $mvalue && $mvalue ne "") {
633
634 if (defined $self->{'indexfieldmap'}->{$mfield}) {
635 $shortname = $self->{'indexfieldmap'}->{$mfield};
636 }
637 else {
638 $shortname = $self->create_shortname($mfield);
639 $self->{'indexfieldmap'}->{$mfield} = $shortname;
640 $self->{'indexfieldmap'}->{$shortname} = 1;
641 }
642 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
643 if (!defined $self->{'indexfields'}->{$mfield}) {
644 $self->{'indexfields'}->{$mfield} = 1;
645 }
646 }
647 }
648
649 }
650 else { #individual metadata specified
651 my $shortname="";
652 #if (!defined $self->{'indexfields'}->{$real_field}) {
653 #$self->{'indexfields'}->{$real_field} = 1;
654 #}
655 if (defined $self->{'indexfieldmap'}->{$real_field}) {
656 $shortname = $self->{'indexfieldmap'}->{$real_field};
657 }
658 else {
659 $shortname = $self->create_shortname($real_field);
660 $self->{'indexfieldmap'}->{$real_field} = $shortname;
661 $self->{'indexfieldmap'}->{$shortname} = 1;
662 }
663 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
664 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
665 }
666 }
667
668 }
669
670 # filter the text
671 $self->filter_text ($field, $new_text);
672
673 $self->{'num_processed_bytes'} += length ($new_text);
674 $text .= "$new_text";
675 }
676 }
677 } # if (indexed_doc)
678
679 $section = $doc_obj->get_next_section($section);
680 } #while defined section
681 print $handle "$text\n$documentendtag";
682
683}
684
685#chooses the first two letters or digits for the shortname
686#now ignores non-letdig characters
687sub create_shortname {
688 $self = shift(@_);
689
690 my ($realname) = @_;
691 #take the first two chars
692 my $shortname;
693 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
694 $shortname = "$1$2";
695 } else {
696 # there aren't two letdig's in the field - try arbitrary combinations
697 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
698 $shortname = "AB";
699 }
700 $shortname =~ tr/a-z/A-Z/;
701
702 #if already used, take the first and third letdigs and so on
703 $count = 1;
704 while (defined $self->{'indexfieldmap'}->{$shortname}) {
705 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
706 $shortname = "$1$3";
707 $count++;
708 $shortname =~ tr/a-z/A-Z/;
709
710 }
711 else {
712 #remove up to and incl the first letdig
713 $realname =~ s/^[^\w]*\w//;
714 $count = 0;
715 }
716 }
717
718 return $shortname;
719}
720
7211;
722
Note: See TracBrowser for help on using the repository browser.