source: gsdl/trunk/perllib/lucenebuildproc.pm@ 16506

Last change on this file since 16506 was 16506, checked in by mdewsnip, 16 years ago

Now adds gs2:docOID attributes into "<Sec>" tags as well, to prevent errors when indexing at section level.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Set up inheritance at compile time: this package derives from
# mgppbuildproc, so any method not defined here resolves there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# forwarded untouched), initialises the counter used by process() for
# incrementally added documents, and re-blesses the result into $class.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Arrow call rather than indirect object syntax ("new mgppbuildproc (...)"),
    # which Perl parses by heuristic and can misread when a sub called
    # "new" is in scope - as it is in this package.
    my $self = mgppbuildproc->new(@_);

    # 0 means "no document has been added incrementally yet"; process()
    # replaces it with a real starting number on first use.
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Capability flag for the build process: always true for this processor,
# because (unlike MG and MGPP) Lucene supports incremental building.
sub is_incremental_capable
{
    my ($self) = @_;

    return 1;
}
63
64
# Writes one document, as XML for the Lucene indexer, to
# $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj - the document object to output
#   $file    - the file name recorded in the document tag's "file" attribute
#
# Documents whose doc type is not "indexed_doc" are ignored. For every
# section a section tag is emitted (when section level is enabled), followed
# by one element per requested index field:
#   - "allfields": section text plus every indexable metadata value, all
#     tagged <ZZ>
#   - "metadata":  every indexable metadata value, tagged with its shortname
#   - otherwise:   a comma separated list of "text" and/or metadata names,
#     all tagged with the shortname of the whole list
# A field name starting with "top" is only output for the first section.
#
# Updates the num_docs, num_sections, num_bytes and num_processed_bytes
# counters and the indexfieldmap / indexfields tables on $self.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # when this is 0 the document's sections are still emitted, but only
    # as empty placeholders (see the loop below)
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    my $gs2_id = "";
    if ($ldoc_level) {
        if ($self->{'db_level'} eq 'document') {
            $gs2_id = $self->{'num_docs'};
        }
        else {
            # default is section level
            $gs2_id = $self->{'num_sections'} + 1;
        }
    }
    my $gs2_docOID = $doc_obj->get_OID();

    # The file name ends up inside an XML attribute, so escape it the same
    # way metadata values are escaped; a raw '&', '<' or '"' in a path would
    # otherwise make the output malformed. Work on a copy because htmlsafe
    # modifies its argument in place.
    # NOTE(review): assumes ghtml::htmlsafe escapes '"' as well as '&', '<'
    # and '>' - confirm against ghtml.pm.
    my $safe_file = defined $file ? $file : "";
    &ghtml::htmlsafe($safe_file);

    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$safe_file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    # an empty section tag name means "do not emit section tags"
    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    # paragraph tags are only used when HTML is being stripped
    my $parastarttag = "";
    my $paraendtag = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sec_tag_name ne "") {
            my $sec_gs2_id = $self->{'num_sections'};
            my $sec_gs2_docOID = $gs2_docOID . "." . $section;
            $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";
            my $tmp_text = "";

            # If allfields is requested add all metadata fields and text as
            # belonging to the ZZ field
            if ($real_field eq "allfields") {
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                &ghtml::htmlsafe($tmp_text);
                $new_text .= "$tmp_text</ZZ>$paraendtag\n";

                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    # every field name seen is recorded here, including
                    # the ones filtered out above
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            # metadata - output all metadata we know about except gsdl stuff
            # ("allfields" can never reach this branch; it is handled above)
            elsif ($real_field eq "metadata") {
                my $shortname = "";
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {

                        # reuse the shortname for this field, or create and
                        # register a new one
                        if (defined $self->{'indexfieldmap'}->{$mfield}) {
                            $shortname = $self->{'indexfieldmap'}->{$mfield};
                        }
                        else {
                            $shortname = $self->create_shortname($mfield);
                            $self->{'indexfieldmap'}->{$mfield} = $shortname;
                            $self->{'indexfieldmap'}->{$shortname} = 1;
                        }
                        $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                # the whole list shares a single shortname
                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$real_field}) {
                    $shortname = $self->{'indexfieldmap'}->{$real_field};
                }
                else {
                    $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                # each paragraph break closes and reopens both tags
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
                            $tmp_text .= $doc_obj->get_text ($section);
                            &ghtml::htmlsafe($tmp_text);
                            $new_text .= $tmp_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # optionally let subsections inherit the top section's
                        # values for this metadata field
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }

            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
285
286# /** We make this builder pretend to be a document processor so we can get
287# * information back from the plugins.
288# *
289# * @param $self A reference to this Lucene builder
290# * @param $doc_obj A reference to a document object representing what was
291# * parsed by the GAPlug
292# * @param $file The name of the file parsed as a string
293# *
294# * @author John Thompson, DL Consulting Ltd
295# */
sub process
{
    # In "incinfodb" mode: attaches the per-section bookkeeping metadata
    # (doctype, hastxt, archivedir, thistype, childtype, contains, docnum)
    # to $doc_obj and hands the document to
    # IncrementalBuildUtils::addDocument. In every other mode the call is
    # passed straight through to mgppbuildproc::process.
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        print STDERR "*** Processing a document added using INCINFODB ***\n";

        # the archive directory is everything before the last path
        # separator, normalised to forward slashes with no leading or
        # trailing slash
        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = "" unless defined $archivedir;
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files. If the accessor hands back an array reference,
        # report the element count rather than the reference's address;
        # any other return value is printed as before.
        my $assoc_files = $doc_obj->get_assoc_files();
        my $num_assoc_files = (ref($assoc_files) eq 'ARRAY') ? scalar(@$assoc_files) : $assoc_files;
        print STDERR "There are " . $num_assoc_files . " associated documents...\n";

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n";
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n";
            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                # Store the default type. Previously the undefined or blank
                # $dtype itself was stored, so the section never actually
                # received a doctype.
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }

            # output whether this node contains text
            if ($doc_obj->get_text_length($section) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
            }
            else
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
            }

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # each entry is a literal double quote followed by
                    # ".<n>"; only the trailing number of a dotted child
                    # identifier is kept
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }

            # output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only add the docs
            last if ($self->{'db_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
398# /** process() **/
399
400
401# Following methods seem to be no different to those defined in basebuildproc.pm
402# From inspection, it looks like these ones can be removed
403
404
# Accessor for the 'num_docs' counter, which text() increments once for
# every document it outputs.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
410
# Accessor for the 'num_sections' counter, which text() increments once
# for every section it visits.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
416
417# num_bytes is the actual number of bytes in the collection
418# this is normally the same as what's processed during text compression
sub get_num_bytes {
    # Accessor for the 'num_bytes' counter: text() adds the text length of
    # every section it actually indexes.
    my ($self) = @_;

    return $self->{'num_bytes'};
}
424
425
426# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
427# Otherwise the removal of tags below might lead to Lucene turning
428# "...farming</p>\n<p>EDWARD.." into "farmingedward"
429# (example from demo collection b20cre)
430# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
sub preprocess_text
{
    # Strips HTML markup from $text so that the result can be embedded in
    # the XML handed to Lucene.
    #
    # Parameters:
    #   $text       - the raw section text
    #   $strip_html - must be true; when false nothing is returned
    #   $para       - replacement passed to process_tags() for paragraph
    #                 tags, or "" to turn every tag into a single space
    #
    # Returns the cleaned text, or nothing when $strip_html is false.
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags (replacing each with a space keeps the
        # words on either side apart)
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A lookahead is used so the following character is not consumed:
    # the previous pattern, &([^\#]), left the second '&' of "&&" in place
    # and never matched an '&' at the very end of the text.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
461
462
4631;
464
Note: See TracBrowser for help on using the repository browser.