source: trunk/gsdl/perllib/mgppbuildproc.pm@ 11089

Last change on this file since 11089 was 10961, checked in by kjdon, 18 years ago

modified mgpp/lucene indexes description so that you can have multiple metadata values in a single field, eg dc.Creator,dc.Contributor. indexes list looks more like mg one. e.g 'dc.Subject text dc.Creator,dc.Contributor'. also made the preprocess_text faster (I think), and a little bit of tidying up. if no_text is true, then don't run the compress text pass at all - its a waste of reading through the documents. so don't use store_text in these two buildprocs anymore

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33
34BEGIN {
35 @mgppbuildproc::ISA = ('basebuildproc');
36}
37
38#this must be the same as in mgppbuilder
39our %level_map = ('document'=>'Doc',
40 'section'=>'Sec',
41 'paragraph'=>'Para');
42
43sub new {
44 my $class = shift @_;
45 my $self = new basebuildproc (@_);
46
47 # use a different index specification to the default
48 $self->{'index'} = "text";
49
50 $self->{'dontindex'} = {};
51 $self->{'indexfieldmap'} = {};
52 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
53 $self->{'strip_html'}=1;
54
55 return bless $self, $class;
56}
57
58
59sub set_indexfieldmap {
60 my $self = shift (@_);
61 my ($indexmap) = @_;
62
63 $self->{'indexfieldmap'} = $indexmap;
64}
65
66sub get_indexfieldmap {
67 my $self = shift (@_);
68
69 return $self->{'indexfieldmap'};
70}
71
72sub set_levels {
73 my $self = shift (@_);
74 my ($levels) = @_;
75
76 $self->{'levels'} = $levels;
77}
78
79sub set_strip_html {
80 my $self = shift (@_);
81 my ($strip) = @_;
82 $self->{'strip_html'}=$strip;
83}
84
85
86sub get_gdbm_level {
87 my $self = shift (@_);
88
89 #if a Section level index is not built, the gdbm file should be at doc
90 #level not Section
91 if ($self->{'levels'}->{'section'}) {
92 return "section";
93 }
94 return "document";
95}
96
97
98#sub find_paragraphs {
99# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
100#}
101
102sub remove_gtlt {
103 my $self =shift(@_);
104 my ($text, $para) = @_;
105 $text =~s/[<>]//g;
106 return "$para$text$para";
107}
108
109sub process_tags {
110 my $self = shift(@_);
111 my ($text, $para) = @_;
112 if ($text =~ /^p\b/i) {
113 return $para;
114 }
115 return "";
116}
117
118sub preprocess_text {
119 my $self = shift (@_);
120 my ($text, $strip_html, $para) = @_;
121 # at this stage, we do not do paragraph tags unless have strip_html -
122 # it will result in a huge mess of non-xml
123 return unless $strip_html;
124
125 my $new_text = $text;
126
127 # if we have <pre> tags, we can have < > inside them, need to delete
128 # the <> before stripping tags
129 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
130
131 if ($para eq "") {
132 # just remove all tags
133 $new_text =~ s/<[^>]*>//gs;
134 } else {
135 # strip all tags except <p> tags which get turned into $para
136 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
137
138 }
139 return $new_text;
140}
141#this function strips the html tags from the doc if ($strip_html) and
142# if ($para) replaces <p> with <Paragraph> tags.
143# if both are false, the original text is returned
144#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
145#these tags
146sub preprocess_text_old_and_slow {
147 my $self = shift (@_);
148 my ($text, $strip_html, $para) = @_;
149 my ($outtext) = "";
150 if ($strip_html) {
151 while ($text =~ /<([^>]*)>/ && $text ne "") {
152
153 my $tag = $1;
154 $outtext .= $`." "; #add everything before the matched tag
155 $text = $'; #'everything after the matched tag
156 if ($para && $tag =~ /^\s*p\s/i) {
157 $outtext .= $para;
158 }
159 elsif ($tag =~ /^pre$/) { # a pre tag
160 $text =~ /<\/pre>/; # find the closing pre tag
161 my $tmp_text = $`; #everything before the closing pre tag
162 $text = $'; #'everything after the </pre>
163 $tmp_text =~ s/[<>]//g; # remove all < and >
164 $outtext.= $tmp_text . " ";
165 }
166 }
167
168 $outtext .= $text; # add any remaining text
169 return $outtext;
170 } #if strip_html
171
172 #if ($para) {
173 #$text =~ s/(<p\b)/$para$1/gi;
174 #return $text;
175 # }
176 return $text;
177}
178
179
180
181sub filter_text {
182 # $self->filter_text ($field, $new_text);
183 # don't want to do anything for this version, however,
184 # in a particular collection you might want to override
185 # this method to post-process certain fields depending on
186 # the field, or whether we are outputting it for indexing
187}
188
189sub text {
190 my $self = shift (@_);
191 my ($doc_obj) = @_;
192 my $handle = $self->{'output_handle'};
193 my $outhandle = $self->{'outhandle'};
194
195 # only output this document if it is one to be indexed
196 return if ($doc_obj->get_doc_type() ne "indexed_doc");
197
198 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
199
200 # this is another document
201 $self->{'num_docs'} += 1;
202
203 # get the parameters for the output
204 # split on : just in case there is subcoll and lang stuff
205 my ($fields) = split (/:/, $self->{'index'});
206
207 my ($documenttag) = "";
208 my($documentendtag) = "";
209 if ($self->{'levels'}->{'document'}) {
210 $documenttag = "\n<". $level_map{'document'} . ">\n";
211 $documentendtag = "\n</". $level_map{'document'} . ">\n";
212 }
213 my ($sectiontag) = "";
214 if ($self->{'levels'}->{'section'}) {
215 $sectiontag = "\n<". $level_map{'section'} . ">\n";
216 }
217 my ($paratag) = "";
218
219 # paragraph tags will only be used for indexing (can't retrieve
220 # paragraphs), and can ony be used if we are stripping HTML tags
221 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
222 if ($self->{'strip_html'}) {
223 $paratag = "<". $level_map{'paragraph'} . ">";
224 } else {
225 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
226 }
227 }
228
229 my $doc_section = 0; # just for this document
230
231 my $text = $documenttag;
232
233 # get the text for this document
234 my $section = $doc_obj->get_top_section();
235
236 while (defined $section) {
237 # update a few statistics
238 $doc_section++;
239 $self->{'num_sections'} += 1;
240 $text .= "$sectiontag";
241
242 if (!$indexed_doc) {
243 # we are not actually indexing anything for this document,
244 # but we want to keep the section numbers the same, so we just
245 # output section tags for each section (which is done above)
246 $section = $doc_obj->get_next_section($section);
247 next;
248 }
249
250 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
251
252 foreach my $field (split (/;/, $fields)) {
253 # only deal with this field if it doesn't start with top or
254 # this is the first section
255 my $real_field = $field;
256 next if (($real_field =~ s/^top//) && ($doc_section != 1));
257
258 my $new_text = "";
259 my $tmp_text = "";
260
261 # we get allfields by default - do nothing
262 if ($real_field eq "allfields") {
263
264 }
265
266 # metadata - output all metadata we know about except gsdl stuff
267 elsif ($real_field eq "metadata") {
268 my $shortname = "";
269 my $metadata = $doc_obj->get_all_metadata ($section);
270 foreach my $pair (@$metadata) {
271 my ($mfield, $mvalue) = (@$pair);
272 # check fields here, maybe others dont want - change to use dontindex!!
273 if ($mfield ne "Identifier"
274 && $mfield !~ /^gsdl/
275 && $mfield ne "classifytype"
276 && $mfield ne "assocfilepath"
277 && defined $mvalue && $mvalue ne "") {
278
279 if (defined $self->{'indexfieldmap'}->{$mfield}) {
280 $shortname = $self->{'indexfieldmap'}->{$mfield};
281 }
282 else {
283 $shortname = $self->create_shortname($mfield);
284 $self->{'indexfieldmap'}->{$mfield} = $shortname;
285 $self->{'indexfieldmap'}->{$shortname} = 1;
286 }
287 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
288 if (!defined $self->{'indexfields'}->{$mfield}) {
289 $self->{'indexfields'}->{$mfield} = 1;
290 }
291 }
292 }
293 }
294 else {
295 #individual metadata and or text specified - could be
296 # a comma separated list
297 my $shortname="";
298 if (defined $self->{'indexfieldmap'}->{$real_field}) {
299 $shortname = $self->{'indexfieldmap'}->{$real_field};
300 }
301 else {
302 $shortname = $self->create_shortname($real_field);
303 $self->{'indexfieldmap'}->{$real_field} = $shortname;
304 $self->{'indexfieldmap'}->{$shortname} = 1;
305 }
306 my @metadata_list = ();
307 foreach $submeta (split /,/, $real_field) {
308 if ($submeta eq "text") {
309 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
310 $new_text .= "$paratag<$shortname>\n";
311 $tmp_text .= $doc_obj->get_text ($section);
312 if ($paratag ne "") {
313 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
314 } else {
315 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
316 }
317 $new_text .= "$tmp_text</$shortname>\n";
318 }
319 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
320 $new_text .= $doc_obj->get_text ($section);
321 }
322 } else {
323 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
324 if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
325 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
326 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
327 }
328 }
329 push (@metadata_list, @section_metadata);
330 }
331 }
332 foreach my $item (@metadata_list) {
333 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
334 }
335 }
336
337 # filter the text
338 $self->filter_text ($field, $new_text);
339
340 $self->{'num_processed_bytes'} += length ($new_text);
341 $text .= "$new_text";
342 } # foreach field
343
344 $section = $doc_obj->get_next_section($section);
345 } # while defined section
346 print $handle "$text\n$documentendtag";
347
348}
349
350#chooses the first two letters or digits for the shortname
351#now ignores non-letdig characters
352sub create_shortname {
353 my $self = shift(@_);
354
355 my ($realname) = @_;
356 #take the first two chars
357 my $shortname;
358 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
359 $shortname = "$1$2";
360 } else {
361 # there aren't two letdig's in the field - try arbitrary combinations
362 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
363 $shortname = "AB";
364 }
365 $shortname =~ tr/a-z/A-Z/;
366
367 #if already used, take the first and third letdigs and so on
368 my $count = 1;
369 while (defined $self->{'indexfieldmap'}->{$shortname}) {
370 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
371 $shortname = "$1$3";
372 $count++;
373 $shortname =~ tr/a-z/A-Z/;
374
375 }
376 else {
377 #remove up to and incl the first letdig
378 $realname =~ s/^[^\w]*\w//;
379 $count = 0;
380 }
381 }
382
383 return $shortname;
384}
385
3861;
387
Note: See TracBrowser for help on using the repository browser.