source: trunk/gsdl/perllib/mgppbuildproc.pm@ 13590

Last change on this file since 13590 was 13590, checked in by kjdon, 17 years ago

mgpp and lucene. made them always use doc and sec levels for the text regardless of index level specification. mgpp will always index at doc and sec level, but these options may not be presented to the user. this is to ensure that if we have sectioned documents, we don't need to turn on section indexing in order for the document display to use sections

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36
37BEGIN {
38 @mgppbuildproc::ISA = ('basebuildproc');
39}
40
41#this must be the same as in mgppbuilder
42our %level_map = ('document'=>'Doc',
43 'section'=>'Sec',
44 'paragraph'=>'Para');
45
46sub new {
47 my $class = shift @_;
48 my $self = new basebuildproc (@_);
49
50 # use a different index specification to the default
51 $self->{'index'} = "text";
52
53 $self->{'dontindex'} = {};
54 $self->{'indexfieldmap'} = {};
55 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
56 $self->{'strip_html'}=1;
57
58 return bless $self, $class;
59}
60
61
62sub set_indexfieldmap {
63 my $self = shift (@_);
64 my ($indexmap) = @_;
65
66 $self->{'indexfieldmap'} = $indexmap;
67}
68
69sub get_indexfieldmap {
70 my $self = shift (@_);
71
72 return $self->{'indexfieldmap'};
73}
74
75sub set_levels {
76 my $self = shift (@_);
77 my ($levels) = @_;
78
79 $self->{'levels'} = $levels;
80}
81
82sub set_strip_html {
83 my $self = shift (@_);
84 my ($strip) = @_;
85 $self->{'strip_html'}=$strip;
86}
87
88#sub find_paragraphs {
89# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
90#}
91
92sub remove_gtlt {
93 my $self =shift(@_);
94 my ($text, $para) = @_;
95 $text =~s/[<>]//g;
96 return "$para$text$para";
97}
98
99sub process_tags {
100 my $self = shift(@_);
101 my ($text, $para) = @_;
102 if ($text =~ /^p\b/i) {
103 return $para;
104 }
105 return "";
106}
107
108sub preprocess_text {
109 my $self = shift (@_);
110 my ($text, $strip_html, $para) = @_;
111 # at this stage, we do not do paragraph tags unless have strip_html -
112 # it will result in a huge mess of non-xml
113 return unless $strip_html;
114
115 my $new_text = $text;
116
117 # if we have <pre> tags, we can have < > inside them, need to delete
118 # the <> before stripping tags
119 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
120
121 if ($para eq "") {
122 # just remove all tags
123 $new_text =~ s/<[^>]*>//gs;
124 } else {
125 # strip all tags except <p> tags which get turned into $para
126 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
127
128 }
129 return $new_text;
130}
131#this function strips the html tags from the doc if ($strip_html) and
132# if ($para) replaces <p> with <Paragraph> tags.
133# if both are false, the original text is returned
134#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
135#these tags
136sub preprocess_text_old_and_slow {
137 my $self = shift (@_);
138 my ($text, $strip_html, $para) = @_;
139 my ($outtext) = "";
140 if ($strip_html) {
141 while ($text =~ /<([^>]*)>/ && $text ne "") {
142
143 my $tag = $1;
144 $outtext .= $`." "; #add everything before the matched tag
145 $text = $'; #'everything after the matched tag
146 if ($para && $tag =~ /^\s*p\s/i) {
147 $outtext .= $para;
148 }
149 elsif ($tag =~ /^pre$/) { # a pre tag
150 $text =~ /<\/pre>/; # find the closing pre tag
151 my $tmp_text = $`; #everything before the closing pre tag
152 $text = $'; #'everything after the </pre>
153 $tmp_text =~ s/[<>]//g; # remove all < and >
154 $outtext.= $tmp_text . " ";
155 }
156 }
157
158 $outtext .= $text; # add any remaining text
159 return $outtext;
160 } #if strip_html
161
162 #if ($para) {
163 #$text =~ s/(<p\b)/$para$1/gi;
164 #return $text;
165 # }
166 return $text;
167}
168
169
170
171sub filter_text {
172 # $self->filter_text ($field, $new_text);
173 # don't want to do anything for this version, however,
174 # in a particular collection you might want to override
175 # this method to post-process certain fields depending on
176 # the field, or whether we are outputting it for indexing
177}
178
179sub text {
180 my $self = shift (@_);
181 my ($doc_obj) = @_;
182 my $handle = $self->{'output_handle'};
183 my $outhandle = $self->{'outhandle'};
184
185 # only output this document if it is one to be indexed
186 return if ($doc_obj->get_doc_type() ne "indexed_doc");
187
188 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
189
190 # this is another document
191 $self->{'num_docs'} += 1;
192
193 # get the parameters for the output
194 # split on : just in case there is subcoll and lang stuff
195 my ($fields) = split (/:/, $self->{'index'});
196
197 # we always do text and index on Doc and Sec levels
198 my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
199 my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
200 my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
201 my ($paratag) = "";
202
203 # paragraph tags will only be used for indexing (can't retrieve
204 # paragraphs), and can ony be used if we are stripping HTML tags
205 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
206 if ($self->{'strip_html'}) {
207 $paratag = "<". $level_map{'paragraph'} . ">";
208 } else {
209 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
210 }
211 }
212
213 my $doc_section = 0; # just for this document
214
215 my $text = $documenttag;
216
217 # get the text for this document
218 my $section = $doc_obj->get_top_section();
219
220 while (defined $section) {
221 # update a few statistics
222 $doc_section++;
223 $self->{'num_sections'} += 1;
224 $text .= "$sectiontag";
225
226 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
227 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
228 # we are not actually indexing anything for this document,
229 # but we want to keep the section numbers the same, so we just
230 # output section tags for each section (which is done above)
231 $section = $doc_obj->get_next_section($section);
232 next;
233 }
234
235 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
236 foreach my $field (split (/;/, $fields)) {
237 # only deal with this field if it doesn't start with top or
238 # this is the first section
239 my $real_field = $field;
240 next if (($real_field =~ s/^top//) && ($doc_section != 1));
241
242 my $new_text = "";
243
244 # we get allfields by default - do nothing
245 if ($real_field eq "allfields") {
246
247 }
248
249 # metadata - output all metadata we know about except gsdl stuff
250 elsif ($real_field eq "metadata") {
251 my $shortname = "";
252 my $metadata = $doc_obj->get_all_metadata ($section);
253 foreach my $pair (@$metadata) {
254 my ($mfield, $mvalue) = (@$pair);
255 # check fields here, maybe others dont want - change to use dontindex!!
256 if ($mfield ne "Identifier"
257 && $mfield !~ /^gsdl/
258 && $mfield ne "classifytype"
259 && $mfield ne "assocfilepath"
260 && defined $mvalue && $mvalue ne "") {
261
262 if (defined $self->{'indexfieldmap'}->{$mfield}) {
263 $shortname = $self->{'indexfieldmap'}->{$mfield};
264 }
265 else {
266 $shortname = $self->create_shortname($mfield);
267 $self->{'indexfieldmap'}->{$mfield} = $shortname;
268 $self->{'indexfieldmap'}->{$shortname} = 1;
269 }
270 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
271 if (!defined $self->{'indexfields'}->{$mfield}) {
272 $self->{'indexfields'}->{$mfield} = 1;
273 }
274 }
275 }
276 }
277 else {
278 #individual metadata and or text specified - could be
279 # a comma separated list
280 my $shortname="";
281 if (defined $self->{'indexfieldmap'}->{$real_field}) {
282 $shortname = $self->{'indexfieldmap'}->{$real_field};
283 }
284 else {
285 $shortname = $self->create_shortname($real_field);
286 $self->{'indexfieldmap'}->{$real_field} = $shortname;
287 $self->{'indexfieldmap'}->{$shortname} = 1;
288 }
289 my @metadata_list = ();
290 foreach my $submeta (split /,/, $real_field) {
291 if ($submeta eq "text") {
292 my $section_text = $doc_obj->get_text($section);
293 if ($self->{'indexing_text'}) {
294 # tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
295 $new_text .= "$paratag<$shortname>\n";
296 if ($paratag ne "") {
297 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
298 }
299 else {
300 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
301 }
302 $new_text .= "$section_text</$shortname>\n";
303 }
304 else {
305 # leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
306 $new_text .= $section_text;
307 }
308 }
309 else {
310 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
311 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
312 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
313 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
314 }
315 }
316 push (@metadata_list, @section_metadata);
317 }
318 }
319 foreach my $item (@metadata_list) {
320 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
321 }
322 }
323
324 # filter the text
325 $self->filter_text ($field, $new_text);
326
327 $self->{'num_processed_bytes'} += length ($new_text);
328 $text .= "$new_text";
329 } # foreach field
330
331 $section = $doc_obj->get_next_section($section);
332 } # while defined section
333 print $handle "$text\n$documentendtag";
334
335}
336
337#chooses the first two letters or digits for the shortname
338#now ignores non-letdig characters
339sub create_shortname {
340 my $self = shift(@_);
341
342 my ($realname) = @_;
343 #take the first two chars
344 my $shortname;
345 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
346 $shortname = "$1$2";
347 } else {
348 # there aren't two letdig's in the field - try arbitrary combinations
349 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
350 $shortname = "AB";
351 }
352 $shortname =~ tr/a-z/A-Z/;
353
354 #if already used, take the first and third letdigs and so on
355 my $count = 1;
356 while (defined $self->{'indexfieldmap'}->{$shortname}) {
357 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
358 $shortname = "$1$3";
359 $count++;
360 $shortname =~ tr/a-z/A-Z/;
361
362 }
363 else {
364 #remove up to and incl the first letdig
365 $realname =~ s/^[^\w]*\w//;
366 $count = 0;
367 }
368 }
369
370 return $shortname;
371}
372
3731;
374
Note: See TracBrowser for help on using the repository browser.