source: gsdl/trunk/perllib/lucenebuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Establish the inheritance chain at compile time: lucenebuildproc is a
# subclass of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.  Builds the underlying mgppbuildproc object from the supplied
# arguments, adds the counter used when numbering incrementally added
# documents, and re-blesses the result into the requested class.
#
# @param $class  the class name the new object is blessed into
# @param @_      all remaining arguments, handed unchanged to mgppbuildproc->new
# @return        the newly constructed object
sub new {
    my $class = shift @_;

    # Explicit method call instead of the indirect-object form
    # "new mgppbuildproc (...)", whose parse depends on which subs happen to
    # be in scope at compile time.
    my $self = mgppbuildproc->new(@_);

    # 0 means "no incrementally added section has been numbered yet";
    # process() replaces it with a real starting number on first use.
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Reports whether this build processor can build incrementally.
# Lucene, unlike MG and MGPP, supports incremental building, so the answer
# is always 1.
sub is_incremental_capable {
    my ($self) = @_;
    return 1;
}
63
64
# Writes one document to the output handle as XML for Lucene to index.
#
# The document is wrapped in the document-level tag, each section is
# optionally wrapped in the section-level tag, and for every field named in
# $self->{'index'} (a ';'-separated list before the first ':') the matching
# text and/or metadata is emitted inside <shortname index="1"> elements.
# Counters num_docs, num_sections, num_bytes and num_processed_bytes on
# $self are updated along the way.
#
# @param $doc_obj  the document object to output
# @param $file     the file name recorded in the document tag's file attribute
#
# Documents whose doc type is not "indexed_doc" are skipped entirely.
#
# NOTE(review): $file and the OIDs are interpolated into XML attributes
# without escaping - confirm they can never contain '&', '<' or '"'.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    my $gs2_id = "";
    if ($ldoc_level) {
        if ($self->{'db_level'} eq 'document') {
            $gs2_id = $self->{'num_docs'};
        }
        else {
            # default is section level
            $gs2_id = $self->{'num_sections'} + 1;
        }
    }
    my $gs2_docOID = $doc_obj->get_OID();
    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    # Shared filter for the "allfields" and "metadata" cases: skip internal
    # bookkeeping fields and empty values.
    # (maybe others dont want - change to use dontindex!!)
    my $is_indexable = sub {
        my ($mfield, $mvalue) = @_;
        return 0 unless (defined $mvalue && $mvalue ne "");
        return 0 if ($mfield eq "Identifier"
                     || $mfield eq "classifytype"
                     || $mfield eq "assocfilepath"
                     || $mfield =~ /^gsdl/);
        return 1;
    };

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sec_tag_name ne "") {
            my $sec_gs2_id = $self->{'num_sections'};
            my $sec_gs2_docOID = $gs2_docOID . "." . $section;
            $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            if ($real_field eq "allfields") {
                # If allfields is requested add all metadata fields and text
                # as belonging to the ZZ field.
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                my $plain_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                &ghtml::htmlsafe($plain_text);
                $new_text .= "$plain_text</ZZ>$paraendtag\n";

                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    # escape only real values; an undefined value is
                    # rejected by the filter below anyway
                    &ghtml::htmlsafe($mvalue) if (defined $mvalue);
                    if ($is_indexable->($mfield, $mvalue)) {
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    # every field seen is recorded here, even ones that were
                    # filtered out above
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            elsif ($real_field eq "metadata") {
                # metadata - output all metadata we know about except gsdl stuff
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue) if (defined $mvalue);
                    next unless ($is_indexable->($mfield, $mvalue));

                    my $shortname = "";
                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
                        $shortname = $self->{'indexfieldmap'}->{$mfield};
                    }
                    else {
                        $shortname = $self->create_shortname($mfield);
                        $self->{'indexfieldmap'}->{$mfield} = $shortname;
                        $self->{'indexfieldmap'}->{$shortname} = 1;
                    }
                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a
                # comma separated list
                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$real_field}) {
                    $shortname = $self->{'indexfieldmap'}->{$real_field};
                }
                else {
                    $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                # metadata values are collected first and emitted after any
                # text, whatever order the list names them in
                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and
                            # dont add Paragraph tags - never retrieve paras
                            # at the moment.  A fresh copy is escaped each
                            # time so a repeated "text" entry cannot
                            # accumulate and double-escape earlier output.
                            &ghtml::htmlsafe($section_text);
                            $new_text .= $section_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }

            # filter the text (note: keyed on the field name as configured,
            # including any "top" prefix)
            $new_text = $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
}
285
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In "incinfodb" mode every section of the document is annotated with
#  *  the metadata the info database expects (doctype, hastxt, archivedir,
#  *  thistype, childtype, contains, docnum) and the document is then handed
#  *  to IncrementalBuildUtils::addDocument.  In every other mode the call
#  *  is passed straight through to mgppbuildproc::process.
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the plugin
#  *  @param  $file    The name of the file parsed as a string
#  *  @return whatever the delegated call (mgppbuildproc::process or
#  *          IncrementalBuildUtils::addDocument) returns
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we
    # want to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb") {
        return $self->mgppbuildproc::process(@_);
    }

    print STDERR "*** Processing a document added using INCINFODB ***\n";

    # The archive directory is everything before the last path separator,
    # normalised to forward slashes with no leading or trailing slash.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files.  The accessor is called in scalar context; if it hands
    # back an array reference, count its elements rather than printing the
    # reference itself.
    my $assoc_files = $doc_obj->get_assoc_files();
    my $num_assoc_files = (ref($assoc_files) eq 'ARRAY') ? scalar(@$assoc_files) : $assoc_files;
    $num_assoc_files = 0 unless defined $num_assoc_files;
    print STDERR "There are " . $num_assoc_files . " associated documents...\n";

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0) {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $top_section = $doc_obj->get_top_section ();
    my $section = $top_section;
    print STDERR "+ top section: '$section'\n";

    while (defined $section) {
        print STDERR "+ processing section: '$section'\n";

        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            # store the default type, not the missing/blank value just read
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $has_text = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $has_text);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children) {
                # each entry is a literal '"' followed by '.' and the
                # child's trailing number (or the whole child id when it
                # has no numeric suffix)
                if ($child =~ /^.*?\.(\d+)$/) {
                    push (@contains, "\".$1");
                }
                else {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);

        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
    return &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
399
400
401# Following methods seem to be no different to those defined in basebuildproc.pm
402# From inspection, it looks like these ones can be removed
403
404
# Returns the number of documents counted so far (the 'num_docs' field).
sub get_num_docs {
    my ($self) = @_;
    my $count = $self->{'num_docs'};
    return $count;
}
410
# Returns the number of sections counted so far (the 'num_sections' field).
sub get_num_sections {
    my ($self) = @_;
    my $count = $self->{'num_sections'};
    return $count;
}
416
# Returns the 'num_bytes' field: the actual number of bytes in the
# collection.  This is normally the same as what's processed during text
# compression.
sub get_num_bytes {
    my ($self) = @_;
    my $byte_count = $self->{'num_bytes'};
    return $byte_count;
}
424
425
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces.
# Otherwise the removal of tags below might lead to Lucene turning
# "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# @param $text        the text to clean up
# @param $strip_html  must be true; when false nothing is done and an empty
#                     return is given (we do not do paragraph tags unless
#                     stripping html - it would result in a huge mess of
#                     non-xml)
# @param $para        replacement handed to process_tags for <p> tags; when
#                     empty (or undefined) every tag simply becomes a space
# @return             the cleaned text, free of tags, named entities and
#                     stray '&' characters
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    return unless $strip_html;

    # treat a missing paragraph marker the same as an empty one
    $para = "" unless defined $para;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text
    # passed to Lucene for indexing may not be valid XML (eg. if HTML-only
    # entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities
    # (which are valid XML).  A lookahead is used so the following character
    # is not consumed: that way "&&" and a trailing "&" are cleaned up too.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
461
462
4631;
464
Note: See TracBrowser for help on using the repository browser.