source: gsdl/trunk/perllib/mgppbuildproc.pm@ 18379

Last change on this file since 18379 was 17797, checked in by kjdon, 15 years ago

my previous changes to the text method meant that field tags were being output for text as well as for index. Don't want this - they end up in the HTML and give errors. So now tags are only output if ->'indexing_text'}

  • Property svn:keywords set to Author Date Id Revision
File size: 14.0 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33use cnseg;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39BEGIN {
40 @mgppbuildproc::ISA = ('basebuildproc');
41}
42
43#this must be the same as in mgppbuilder
44our %level_map = ('document'=>'Doc',
45 'section'=>'Sec',
46 'paragraph'=>'Para');
47
48# change this so a user can add their own ones in via a file or cfg
49#add AND, OR, NOT NEAR to this list - these cannot be used as field names
50#also add the level names (Doc, Sec, Para)
51our %static_indexfield_map = ('Title'=>'TI',
52 'TI'=>1,
53 'Subject'=>'SU',
54 'SU'=>1,
55 'Creator'=>'CR',
56 'CR'=>1,
57 'Organization'=>'ORG',
58 'ORG'=>1,
59 'Source'=>'SO',
60 'SO'=>1,
61 'Howto'=>'HT',
62 'HT'=>1,
63 'ItemTitle'=>'IT',
64 'IT'=>1,
65 'ProgNumber'=>'PN',
66 'PN'=>1,
67 'People'=>'PE',
68 'PE'=>1,
69 'Coverage'=>'CO',
70 'CO'=>1,
71 'allfields'=>'ZZ',
72 'ZZ'=>1,
73 'text'=>'TX',
74 'TX'=>1,
75 'AND'=>1,
76 'OR'=>1,
77 'NOT'=>1,
78 'NEAR'=>1,
79 'Doc'=>1,
80 'Sec'=>1,
81 'Para'=>1);
82
83
84sub new {
85 my $class = shift @_;
86 my $self = new basebuildproc (@_);
87
88 # use a different index specification to the default
89 $self->{'index'} = "text";
90
91 $self->{'dontindex'} = {};
92 $self->{'indexfieldmap'} = {};
93 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
94 $self->{'strip_html'}=1;
95
96 return bless $self, $class;
97}
98
99
100#sub set_indexfieldmap {
101# my $self = shift (@_);
102# my ($indexmap) = @_;
103
104# $self->{'default_index_field_mapping'} = $indexmap;
105 #$self->{'indexfieldmap'} = $indexmap;
106#}
107
108sub get_indexfieldmap {
109 my $self = shift (@_);
110
111 return $self->{'indexfieldmap'};
112}
113
114sub set_levels {
115 my $self = shift (@_);
116 my ($levels) = @_;
117
118 $self->{'levels'} = $levels;
119}
120
121sub set_strip_html {
122 my $self = shift (@_);
123 my ($strip) = @_;
124 $self->{'strip_html'}=$strip;
125}
126
127#sub find_paragraphs {
128# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
129#}
130
131sub remove_gtlt {
132 my $self =shift(@_);
133 my ($text, $para) = @_;
134 $text =~s/[<>]//g;
135 return "$para$text$para";
136}
137
138sub process_tags {
139 my $self = shift(@_);
140 my ($text, $para) = @_;
141 if ($text =~ /^p\b/i) {
142 return $para;
143 }
144 return "";
145}
146
147sub preprocess_text {
148 my $self = shift (@_);
149 my ($text, $strip_html, $para) = @_;
150 # at this stage, we do not do paragraph tags unless have strip_html -
151 # it will result in a huge mess of non-xml
152 return unless $strip_html;
153
154 my $new_text = $text;
155
156 # if we have <pre> tags, we can have < > inside them, need to delete
157 # the <> before stripping tags
158 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
159
160 if ($para eq "") {
161 # just remove all tags
162 $new_text =~ s/<[^>]*>//gs;
163 } else {
164 # strip all tags except <p> tags which get turned into $para
165 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
166
167 }
168 return $new_text;
169}
170#this function strips the html tags from the doc if ($strip_html) and
171# if ($para) replaces <p> with <Paragraph> tags.
172# if both are false, the original text is returned
173#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
174#these tags
175sub preprocess_text_old_and_slow {
176 my $self = shift (@_);
177 my ($text, $strip_html, $para) = @_;
178 my ($outtext) = "";
179 if ($strip_html) {
180 while ($text =~ /<([^>]*)>/ && $text ne "") {
181
182 my $tag = $1;
183 $outtext .= $`." "; #add everything before the matched tag
184 $text = $'; #'everything after the matched tag
185 if ($para && $tag =~ /^\s*p\s/i) {
186 $outtext .= $para;
187 }
188 elsif ($tag =~ /^pre$/) { # a pre tag
189 $text =~ /<\/pre>/; # find the closing pre tag
190 my $tmp_text = $`; #everything before the closing pre tag
191 $text = $'; #'everything after the </pre>
192 $tmp_text =~ s/[<>]//g; # remove all < and >
193 $outtext.= $tmp_text . " ";
194 }
195 }
196
197 $outtext .= $text; # add any remaining text
198 return $outtext;
199 } #if strip_html
200
201 #if ($para) {
202 #$text =~ s/(<p\b)/$para$1/gi;
203 #return $text;
204 # }
205 return $text;
206}
207
208sub text {
209 my $self = shift (@_);
210 my ($doc_obj) = @_;
211 my $handle = $self->{'output_handle'};
212 my $outhandle = $self->{'outhandle'};
213
214 # only output this document if it is one to be indexed
215 return if ($doc_obj->get_doc_type() ne "indexed_doc");
216
217 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
218
219 # this is another document
220 $self->{'num_docs'} += 1;
221
222 # get the parameters for the output
223 # split on : just in case there is subcoll and lang stuff
224 my ($fields) = split (/:/, $self->{'index'});
225
226 # we always do text and index on Doc and Sec levels
227 my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
228 my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
229 my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
230 my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
231
232 my ($paratag) = "";
233
234 # paragraph tags will only be used for indexing (can't retrieve
235 # paragraphs), and can ony be used if we are stripping HTML tags
236 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
237 if ($self->{'strip_html'}) {
238 $paratag = "<". $level_map{'paragraph'} . ">";
239 } else {
240 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
241 }
242 }
243
244 my $doc_section = 0; # just for this document
245
246 my $text = $documenttag;
247
248 # get the text for this document
249 my $section = $doc_obj->get_top_section();
250
251 while (defined $section) {
252 # update a few statistics
253 $doc_section++;
254 $self->{'num_sections'} += 1;
255 $text .= "$sectiontag";
256
257 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
258 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
259 # we are not actually indexing anything for this document,
260 # but we want to keep the section numbers the same, so we just
261 # output section tags for each section (which is done above)
262 $text .= "$sectionendtag";
263 $section = $doc_obj->get_next_section($section);
264 next;
265 }
266
267 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
268
269 # has the user added a 'metadata' index?
270 my $all_metadata_specified = 0;
271 # which fields have already been indexed? (same as fields, but in a map)
272 my $specified_fields = {};
273 foreach my $field (split (/;/, $fields)) {
274 # only deal with this field if it doesn't start with top or
275 # this is the first section
276 my $real_field = $field;
277 next if (($real_field =~ s/^top//) && ($doc_section != 1));
278
279 my $new_text = "";
280
281 # we get allfields by default - do nothing except add into the map
282 if ($real_field eq "allfields") {
283 $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
284 $self->{'indexfieldmap'}->{"ZZ"} = 1;
285 }
286
287 # metadata - output all metadata we know about except gsdl stuff
288 # each metadata is in a separate index field
289 if ($real_field eq "metadata") {
290 # we will process this later, so we are not reindexing metadata already indexed
291 $all_metadata_specified = 1;
292 }
293
294 else {
295
296 #individual metadata and or text specified - could be
297 # a comma separated list
298 $specified_fields->{$real_field} = 1;
299 my $shortname="";
300 my $new_field = 0; # have we found a new field name?
301
302 if (defined $self->{'indexfieldmap'}->{$real_field}) {
303 $shortname = $self->{'indexfieldmap'}->{$real_field};
304 }
305 else {
306 $shortname = $self->create_shortname($real_field);
307 $new_field = 1; # we want to record this shortname, but only if we have actually found some metadata values
308 }
309 my @metadata_list = (); # put any meta values in here
310 my $section_text = ""; # put any text in here
311 foreach my $submeta (split /,/, $real_field) {
312 if ($submeta eq "text") {
313 # no point in indexing text more than once
314 if ($section_text eq "") {
315 $section_text = $doc_obj->get_text($section);
316 if ($self->{'indexing_text'}) {
317 if ($paratag ne "") {
318 # we fiddle around with splitting text into paragraphs
319 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
320 }
321 else {
322 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
323 }
324 }
325 }
326 }
327 else {
328 # its a metadata element
329 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
330 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
331 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
332 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
333 }
334 }
335 push (@metadata_list, @section_metadata);
336 }
337 } # for each field in index
338
339
340 # now we add the text and/or the metadata into new_text
341 if ($section_text ne "" || scalar(@metadata_list)) {
342 if ($self->{'indexing_text'}) {
343 # only add tags in if indexing
344 $new_text .= "$paratag<$shortname>";
345 }
346 if ($section_text ne "") {
347 $new_text .= "$section_text ";
348 if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
349 $new_text .= "</$shortname>$paratag<$shortname>";
350 }
351 }
352 foreach my $item (@metadata_list) {
353 $new_text .= "$item ";
354 }
355 if ($self->{'indexing_text'}) {
356 # only add tags in if indexing
357 $new_text .= "</$shortname>";
358 }
359 if ($self->{'indexing_text'} && $new_field) {
360 # we need to add to the list in indexfields
361
362 $self->{'indexfieldmap'}->{$real_field} = $shortname;
363 $self->{'indexfieldmap'}->{$shortname} = 1;
364 }
365 }
366 }
367
368 # filter the text
369 $new_text = $self->filter_text ($field, $new_text);
370
371 $self->{'num_processed_bytes'} += length ($new_text);
372 $text .= "$new_text";
373 } # foreach field
374
375 if ($all_metadata_specified) {
376 my $new_text = "";
377 my $shortname = "";
378 my $metadata = $doc_obj->get_all_metadata ($section);
379 foreach my $pair (@$metadata) {
380 my ($mfield, $mvalue) = (@$pair);
381 # no value
382 next unless defined $mvalue && $mvalue ne "";
383 # we have already indexed this
384 next if defined ($specified_fields->{$mfield});
385 # check fields here, maybe others dont want - change to use dontindex!!
386 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
387 next if ($mfield =~ /^gsdl/);
388
389
390 if (defined $self->{'indexfieldmap'}->{$mfield}) {
391 $shortname = $self->{'indexfieldmap'}->{$mfield};
392 }
393 else {
394 $shortname = $self->create_shortname($mfield);
395 $self->{'indexfieldmap'}->{$mfield} = $shortname;
396 $self->{'indexfieldmap'}->{$shortname} = 1;
397 }
398 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
399 if (!defined $self->{'indexfields'}->{$mfield}) {
400 $self->{'indexfields'}->{$mfield} = 1;
401 }
402
403 }
404 # filter the text
405 $new_text = $self->filter_text ("metadata", $new_text);
406
407 $self->{'num_processed_bytes'} += length ($new_text);
408 $text .= "$new_text";
409
410
411 }
412
413 $text .= "$sectionendtag";
414 $section = $doc_obj->get_next_section($section);
415 } # while defined section
416 print $handle "$text\n$documentendtag";
417 #print STDERR "***********\n$text\n***************\n";
418
419}
420
421#chooses the first two letters or digits for the shortname
422#now ignores non-letdig characters
423sub create_shortname {
424 my $self = shift(@_);
425
426 my ($realname) = @_;
427 # try our predefined static mapping
428 if (defined $static_indexfield_map{$realname}) {
429 return $static_indexfield_map{$realname};
430 }
431 #try the first two chars
432 my $shortname;
433 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
434 $shortname = "$1$2";
435 } else {
436 # there aren't two letdig's in the field - try arbitrary combinations
437 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
438 $shortname = "AB";
439 }
440 $shortname =~ tr/a-z/A-Z/;
441
442 #if already used, take the first and third letdigs and so on
443 my $count = 1;
444 while (defined $self->{'indexfieldmap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
445 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
446 $shortname = "$1$3";
447 $count++;
448 $shortname =~ tr/a-z/A-Z/;
449
450 }
451 else {
452 #remove up to and incl the first letdig
453 $realname =~ s/^[^\w]*\w//;
454 $count = 0;
455 }
456 }
457
458 return $shortname;
459}
460
4611;
462
Note: See TracBrowser for help on using the repository browser.