source: trunk/gsdl/perllib/lucenebuildproc.pm@ 10304

Last change on this file since 10304 was 10304, checked in by davidb, 19 years ago

Introduction of is_incremental() function in base class (not incremental
by default). Over-ridden by inherited classes if incremental building
possible.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.1 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35
# Establish the inheritance chain at compile time: lucenebuildproc
# is a subclass of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
39
40
# Constructor.
#
# Builds the object with the parent (mgppbuildproc) constructor, passing
# every argument through unchanged, then re-blesses the result into the
# class this constructor was invoked on.
#
# Arguments: $class, followed by whatever mgppbuildproc's new() accepts.
# Returns:   the object blessed into $class.
sub new {
    my $class = shift @_;

    # Arrow-style class method call rather than the indirect object
    # form ("new mgppbuildproc (...)"), whose parsing is ambiguous.
    my $self = mgppbuildproc->new(@_);

    return bless $self, $class;
}
47
48
# Report whether this build processor can build incrementally.
# Always true here: Lucene, unlike MG and MGPP, supports
# incremental building.
sub is_incremental
{
    my ($self) = @_;

    return 1;
}
56
57
# Prepare a chunk of document text for output.
#
# Arguments:
#   $text       - the raw text (may contain HTML markup)
#   $strip_html - when true, markup tags are removed
#   $para       - string emitted in place of each <p> tag; when it is
#                 false (e.g. ""), <p> tags are simply dropped like
#                 any other tag
#
# When stripping:
#   * every tag is replaced by a single space;
#   * a <p> tag (with or without attributes) additionally emits $para;
#   * the content of <pre>...</pre> is kept, with any '<' and '>'
#     characters inside it removed.
# In both modes, named entities of the form &word; are deleted from the
# result.
#
# Returns the processed text.
sub preprocess_text {
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    $text = "" unless defined $text;

    my $outtext = "";
    if ($strip_html) {
        # Scan left to right with \G / pos() rather than repeatedly
        # re-assigning $text from $` and $'.
        while ($text =~ /\G(.*?)<([^>]*)>/gcs) {
            my ($before, $tag) = ($1, $2);

            # everything before the matched tag; the tag becomes a space
            $outtext .= $before . " ";

            # \b (not \s) so that a bare <p> counts as a paragraph tag
            # too, while <pre>, <param> etc. still do not.
            if ($para && $tag =~ /^\s*p\b/i) {
                $outtext .= $para;
            }
            elsif ($tag =~ /^pre$/) {
                # Consume up to the closing </pre>. If there is none,
                # leave the scan position alone so the remaining text
                # is processed normally instead of relying on stale
                # match variables.
                if ($text =~ /\G(.*?)<\/pre>/gcs) {
                    my $pre_text = $1;
                    $pre_text =~ s/[<>]//g; # remove all < and >
                    $outtext .= $pre_text . " ";
                }
            }
        }

        # add any remaining text after the last tag
        $outtext .= substr($text, pos($text) || 0);
    }
    else {
        $outtext = $text;
    }

    # remove entities
    $outtext =~ s/&\w{1,10};//g;

    return $outtext;
}
96
# Emit one document, as XML, on $self->{'output_handle'}.
#
# Arguments:
#   $doc_obj - the document object; only documents whose
#              get_doc_type() is "indexed_doc" are processed
#   $file    - value written into the file="..." attribute of the
#              document tag
#
# The document is wrapped in the 'document' tag from
# %mgppbuildproc::level_map; each section is wrapped in the 'section'
# tag when section level is enabled in $self->{'levels'}.
#
# $self->{'indexexparr'} (subcollection expressions of the form
# "field/regex/options") and $self->{'lang_meta'} / $self->{'langarr'}
# decide whether the document's content is written. A document that
# fails these filters still gets its (empty) document and section tags,
# so section numbering stays in step.
#
# The fields listed in $self->{'index'} (comma separated, before any
# ':') are written for each section; a field prefixed with "top" is
# only written for the first section.
#
# Counters updated on $self: num_docs, num_sections, num_bytes,
# num_processed_bytes. New metadata field short names are recorded in
# $self->{'indexfieldmap'} and $self->{'indexfields'}.
sub text {
    my $self = shift (@_);
    my ($doc_obj,$file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};
    my $indexed_doc = 1;

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        if (defined ($field) && defined ($exp)) {
            # a leading '!' on the field name negates the test
            my $negate = ($field =~ s/^!//) ? 1 : 0;

            my $value;
            if ($field =~ /^filename$/i) {
                $value = $doc_obj->get_source_filename();
            } else {
                $value = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
            }
            next unless defined $value;

            # the options part is optional, so guard against undef
            my $ignore_case = (defined $options && $options =~ /^i$/i);
            my $matched = $ignore_case ? ($value =~ /$exp/i) : ($value =~ /$exp/);

            if ($negate ? !$matched : $matched) {
                $indexed_doc = 1;
                last;
            }
        }
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable aliases the array
                # element, so stripping the '!' in place would remove
                # the negation from $self->{'langarr'} for every
                # document processed afterwards.
                my $lang_exp = $lang;
                my $negate = ($lang_exp =~ s/^!//) ? 1 : 0;
                my $matched = ($field =~ /$lang_exp/);

                if ($negate ? !$matched : $matched) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    my $docid = "";
    if ($ldoc_level) {
        if ($self->{'gdbm_level'} eq 'document') {
            my $doc_sec_num = $self->{'num_docs'};
            $docid = "gs2:id=\"$doc_sec_num\"";
        } else {
            # default is section level
            my $doc_sec_num = $self->{'num_sections'}+1;
            $docid = "gs2:id=\"$doc_sec_num\"";
        }
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my ($sectiontag) = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }
    my ($parastarttag) = "";
    my ($paraendtag) = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        } else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;
    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with gdbm db
        if ($indexed_doc) {
            # note that we assume that metadata will not be asked for for
            # the compressed text, so we add para tags without checking
            # for indexing_text
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            foreach my $field (split (/,/, $fields)) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
                    my $new_text = "";
                    my $tmp_text = "";
                    if ($real_field eq "text") {
                        if ($self->{'indexing_text'}) {
                            # tag the text with <TX>...</TX>, add the
                            # paragraph tags and strip out html if needed
                            $new_text .= "$parastarttag<TX index=\"1\">\n";
                            $tmp_text .= $doc_obj->get_text ($section);
                            # String comparison, not a pattern match: an
                            # empty pattern would make Perl reuse the last
                            # successful regex instead of testing for
                            # emptiness.
                            if ($parastarttag eq "") {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
                            } else {
                                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paraendtag$parastarttag<TX index=\"1\">");
                            }

                            $new_text .= "$tmp_text</TX>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and
                            # dont add Paragraph tags - never retrieve paras
                            # at the moment
                            if ($self->{'store_text'}) {
                                $tmp_text .= $doc_obj->get_text ($section);
                                &ghtml::htmlsafe($tmp_text);
                                $new_text .= $tmp_text;
                            }
                        }
                    } else { # metadata field
                        if ($real_field eq "allfields") {
                            # ignore
                        }
                        elsif ($real_field eq "metadata") {
                            # insert all metadata except gsdl stuff
                            my $shortname = "";
                            my $metadata = $doc_obj->get_all_metadata ($section);
                            foreach my $pair (@$metadata) {
                                my ($mfield, $mvalue) = (@$pair);
                                # check fields here, maybe others dont want - change to use dontindex!!
                                if ($mfield ne "Identifier"
                                    && $mfield !~ /^gsdl/
                                    && $mfield ne "classifytype"
                                    && $mfield ne "assocfilepath"
                                    && defined $mvalue && $mvalue ne "") {

                                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
                                        $shortname = $self->{'indexfieldmap'}->{$mfield};
                                    }
                                    else {
                                        $shortname = $self->create_shortname($mfield);
                                        $self->{'indexfieldmap'}->{$mfield} = $shortname;
                                        $self->{'indexfieldmap'}->{$shortname} = 1;
                                    }
                                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                                    if (!defined $self->{'indexfields'}->{$mfield}) {
                                        $self->{'indexfields'}->{$mfield} = 1;
                                    }
                                }
                            }
                        }
                        else { # individual metadata specified
                            my $shortname = "";
                            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                                $shortname = $self->{'indexfieldmap'}->{$real_field};
                            }
                            else {
                                $shortname = $self->create_shortname($real_field);
                                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                                $self->{'indexfieldmap'}->{$shortname} = 1;
                            }
                            foreach my $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
                                $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                                # remove entities
                                $new_text =~ s/&\w{1,10};//g;
                            }
                        }
                    }

                    # filter the text
                    $self->filter_text ($field, $new_text);
                    $self->{'num_processed_bytes'} += length ($new_text);
                    $text .= "$new_text";
                }
            }
        } # if (indexed_doc)

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
}
325
3261;
327
Note: See TracBrowser for help on using the repository browser.