source: gsdl/trunk/perllib/mgppbuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:keywords set to Author Date Id Revision
File size: 11.4 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33use cnseg;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39BEGIN {
40 @mgppbuildproc::ISA = ('basebuildproc');
41}
42
43#this must be the same as in mgppbuilder
44our %level_map = ('document'=>'Doc',
45 'section'=>'Sec',
46 'paragraph'=>'Para');
47
48sub new {
49 my $class = shift @_;
50 my $self = new basebuildproc (@_);
51
52 # use a different index specification to the default
53 $self->{'index'} = "text";
54
55 $self->{'dontindex'} = {};
56 $self->{'indexfieldmap'} = {};
57 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
58 $self->{'strip_html'}=1;
59
60 return bless $self, $class;
61}
62
63
64sub set_indexfieldmap {
65 my $self = shift (@_);
66 my ($indexmap) = @_;
67
68 $self->{'indexfieldmap'} = $indexmap;
69}
70
71sub get_indexfieldmap {
72 my $self = shift (@_);
73
74 return $self->{'indexfieldmap'};
75}
76
77sub set_levels {
78 my $self = shift (@_);
79 my ($levels) = @_;
80
81 $self->{'levels'} = $levels;
82}
83
84sub set_strip_html {
85 my $self = shift (@_);
86 my ($strip) = @_;
87 $self->{'strip_html'}=$strip;
88}
89
90#sub find_paragraphs {
91# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
92#}
93
94sub remove_gtlt {
95 my $self =shift(@_);
96 my ($text, $para) = @_;
97 $text =~s/[<>]//g;
98 return "$para$text$para";
99}
100
101sub process_tags {
102 my $self = shift(@_);
103 my ($text, $para) = @_;
104 if ($text =~ /^p\b/i) {
105 return $para;
106 }
107 return "";
108}
109
110sub preprocess_text {
111 my $self = shift (@_);
112 my ($text, $strip_html, $para) = @_;
113 # at this stage, we do not do paragraph tags unless have strip_html -
114 # it will result in a huge mess of non-xml
115 return unless $strip_html;
116
117 my $new_text = $text;
118
119 # if we have <pre> tags, we can have < > inside them, need to delete
120 # the <> before stripping tags
121 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
122
123 if ($para eq "") {
124 # just remove all tags
125 $new_text =~ s/<[^>]*>//gs;
126 } else {
127 # strip all tags except <p> tags which get turned into $para
128 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
129
130 }
131 return $new_text;
132}
133#this function strips the html tags from the doc if ($strip_html) and
134# if ($para) replaces <p> with <Paragraph> tags.
135# if both are false, the original text is returned
136#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
137#these tags
138sub preprocess_text_old_and_slow {
139 my $self = shift (@_);
140 my ($text, $strip_html, $para) = @_;
141 my ($outtext) = "";
142 if ($strip_html) {
143 while ($text =~ /<([^>]*)>/ && $text ne "") {
144
145 my $tag = $1;
146 $outtext .= $`." "; #add everything before the matched tag
147 $text = $'; #'everything after the matched tag
148 if ($para && $tag =~ /^\s*p\s/i) {
149 $outtext .= $para;
150 }
151 elsif ($tag =~ /^pre$/) { # a pre tag
152 $text =~ /<\/pre>/; # find the closing pre tag
153 my $tmp_text = $`; #everything before the closing pre tag
154 $text = $'; #'everything after the </pre>
155 $tmp_text =~ s/[<>]//g; # remove all < and >
156 $outtext.= $tmp_text . " ";
157 }
158 }
159
160 $outtext .= $text; # add any remaining text
161 return $outtext;
162 } #if strip_html
163
164 #if ($para) {
165 #$text =~ s/(<p\b)/$para$1/gi;
166 #return $text;
167 # }
168 return $text;
169}
170
171sub text {
172 my $self = shift (@_);
173 my ($doc_obj) = @_;
174 my $handle = $self->{'output_handle'};
175 my $outhandle = $self->{'outhandle'};
176
177 # only output this document if it is one to be indexed
178 return if ($doc_obj->get_doc_type() ne "indexed_doc");
179
180 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
181
182 # this is another document
183 $self->{'num_docs'} += 1;
184
185 # get the parameters for the output
186 # split on : just in case there is subcoll and lang stuff
187 my ($fields) = split (/:/, $self->{'index'});
188
189 # we always do text and index on Doc and Sec levels
190 my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
191 my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
192 my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
193 my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
194
195 my ($paratag) = "";
196
197 # paragraph tags will only be used for indexing (can't retrieve
198 # paragraphs), and can ony be used if we are stripping HTML tags
199 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
200 if ($self->{'strip_html'}) {
201 $paratag = "<". $level_map{'paragraph'} . ">";
202 } else {
203 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
204 }
205 }
206
207 my $doc_section = 0; # just for this document
208
209 my $text = $documenttag;
210
211 # get the text for this document
212 my $section = $doc_obj->get_top_section();
213
214 while (defined $section) {
215 # update a few statistics
216 $doc_section++;
217 $self->{'num_sections'} += 1;
218 $text .= "$sectiontag";
219
220 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
221 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
222 # we are not actually indexing anything for this document,
223 # but we want to keep the section numbers the same, so we just
224 # output section tags for each section (which is done above)
225 $text .= "$sectionendtag";
226 $section = $doc_obj->get_next_section($section);
227 next;
228 }
229
230 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
231 foreach my $field (split (/;/, $fields)) {
232 # only deal with this field if it doesn't start with top or
233 # this is the first section
234 my $real_field = $field;
235 next if (($real_field =~ s/^top//) && ($doc_section != 1));
236
237 my $new_text = "";
238
239 # we get allfields by default - do nothing
240 if ($real_field eq "allfields") {
241
242 }
243
244 # metadata - output all metadata we know about except gsdl stuff
245 elsif ($real_field eq "metadata") {
246 my $shortname = "";
247 my $metadata = $doc_obj->get_all_metadata ($section);
248 foreach my $pair (@$metadata) {
249 my ($mfield, $mvalue) = (@$pair);
250 # check fields here, maybe others dont want - change to use dontindex!!
251 if ($mfield ne "Identifier"
252 && $mfield !~ /^gsdl/
253 && $mfield ne "classifytype"
254 && $mfield ne "assocfilepath"
255 && defined $mvalue && $mvalue ne "") {
256
257 if (defined $self->{'indexfieldmap'}->{$mfield}) {
258 $shortname = $self->{'indexfieldmap'}->{$mfield};
259 }
260 else {
261 $shortname = $self->create_shortname($mfield);
262 $self->{'indexfieldmap'}->{$mfield} = $shortname;
263 $self->{'indexfieldmap'}->{$shortname} = 1;
264 }
265 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
266 if (!defined $self->{'indexfields'}->{$mfield}) {
267 $self->{'indexfields'}->{$mfield} = 1;
268 }
269 }
270 }
271 }
272 else {
273 #individual metadata and or text specified - could be
274 # a comma separated list
275 my $shortname="";
276 if (defined $self->{'indexfieldmap'}->{$real_field}) {
277 $shortname = $self->{'indexfieldmap'}->{$real_field};
278 }
279 else {
280 $shortname = $self->create_shortname($real_field);
281 $self->{'indexfieldmap'}->{$real_field} = $shortname;
282 $self->{'indexfieldmap'}->{$shortname} = 1;
283 }
284 my @metadata_list = ();
285 foreach my $submeta (split /,/, $real_field) {
286 if ($submeta eq "text") {
287 my $section_text = $doc_obj->get_text($section);
288 if ($self->{'indexing_text'}) {
289 # tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
290 $new_text .= "$paratag<$shortname>\n";
291 if ($paratag ne "") {
292 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
293 }
294 else {
295 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
296 }
297 $new_text .= "$section_text</$shortname>\n";
298 }
299 else {
300 # leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
301 $new_text .= $section_text;
302 }
303 }
304 else {
305 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
306 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
307 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
308 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
309 }
310 }
311 push (@metadata_list, @section_metadata);
312 }
313 }
314 foreach my $item (@metadata_list) {
315 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
316 }
317 }
318
319 # filter the text
320 $new_text = $self->filter_text ($field, $new_text);
321
322 $self->{'num_processed_bytes'} += length ($new_text);
323 $text .= "$new_text";
324 } # foreach field
325
326 $text .= "$sectionendtag";
327 $section = $doc_obj->get_next_section($section);
328 } # while defined section
329 print $handle "$text\n$documentendtag";
330
331}
332
333#chooses the first two letters or digits for the shortname
334#now ignores non-letdig characters
335sub create_shortname {
336 my $self = shift(@_);
337
338 my ($realname) = @_;
339 #take the first two chars
340 my $shortname;
341 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
342 $shortname = "$1$2";
343 } else {
344 # there aren't two letdig's in the field - try arbitrary combinations
345 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
346 $shortname = "AB";
347 }
348 $shortname =~ tr/a-z/A-Z/;
349
350 #if already used, take the first and third letdigs and so on
351 my $count = 1;
352 while (defined $self->{'indexfieldmap'}->{$shortname}) {
353 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
354 $shortname = "$1$3";
355 $count++;
356 $shortname =~ tr/a-z/A-Z/;
357
358 }
359 else {
360 #remove up to and incl the first letdig
361 $realname =~ s/^[^\w]*\w//;
362 $count = 0;
363 }
364 }
365
366 return $shortname;
367}
368
3691;
370
Note: See TracBrowser for help on using the repository browser.