source: gsdl/trunk/perllib/lucenebuildproc.pm@ 19617

Last change on this file since 19617 was 18471, checked in by davidb, 15 years ago

Changed from using "reindex" to "update"

  • Property svn:keywords set to Author Date Id Revision
File size: 17.7 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
41sub BEGIN {
42 @lucenebuildproc::ISA = ('mgppbuildproc');
43}
44
45
46sub new {
47 my $class = shift @_;
48 my $self = new mgppbuildproc (@_);
49
50 $self->{'numincdocs'} = 0;
51
52 return bless $self, $class;
53}
54
55
56sub is_incremental_capable
57{
58 my $self = shift (@_);
59
60 # Unlike MG and MGPP, Lucene supports incremental building
61 return 1;
62}
63
64
65sub textedit {
66 my $self = shift (@_);
67 my ($doc_obj,$file,$edit_mode) = @_;
68
69 my $lucenehandle = $self->{'output_handle'};
70 my $outhandle = $self->{'outhandle'};
71
72 # only output this document if it is one to be indexed
73 return if ($doc_obj->get_doc_type() ne "indexed_doc");
74
75 # skip this document if in "compress-text" mode and asked to delete it
76 return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
77
78 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
79
80 # this is another document
81 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
82 $self->{'num_docs'} += 1;
83 }
84 else {
85 $self->{'num_docs'} -= 1;
86 }
87
88 # get the parameters for the output
89 # split on : just in case there is subcoll and lang stuff
90 my ($fields) = split (/:/, $self->{'index'});
91
92 my $doc_tag_name = $mgppbuildproc::level_map{'document'};
93
94 my $levels = $self->{'levels'};
95 my $ldoc_level = $levels->{'document'};
96 my $lsec_level = $levels->{'section'};
97
98 # gs2_id should be depricated #####
99 my $gs2_id = "";
100 if ($ldoc_level)
101 {
102 if ($self->{'db_level'} eq 'document')
103 {
104 $gs2_id = $self->{'num_docs'};
105 }
106 else
107 {
108 # default is section level
109 $gs2_id = $self->{'num_sections'} + 1;
110 }
111 }
112 my $gs2_docOID = $doc_obj->get_OID();
113 my $documenttag = undef;
114 my $documentendtag = undef;
115
116 $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
117 $documentendtag = "\n</$doc_tag_name>\n";
118
119 my $sec_tag_name = "";
120 if ($lsec_level)
121 {
122 $sec_tag_name = $mgppbuildproc::level_map{'section'};
123 }
124
125 my $doc_section = 0; # just for this document
126
127 my $text = "";
128 $text .= $documenttag;
129 # get the text for this document
130 my $section = $doc_obj->get_top_section();
131 while (defined $section)
132 {
133 # update a few statistics
134 $doc_section++;
135 $self->{'num_sections'}++;
136
137 my $sec_gs2_id = $self->{'num_sections'};
138 my $sec_gs2_docOID = $gs2_docOID;
139 $sec_gs2_docOID .= ".$section" if ($section ne "");
140
141 # if we are doing subcollections, then some docs shouldn't be indexed.
142 # but we need to put the section tag placeholders in there so the
143 # sections match up with database
144 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
145 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
146 if ($sec_tag_name ne "") {
147 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
148 $text .= "\n</$sec_tag_name>\n"
149 }
150 $section = $doc_obj->get_next_section($section);
151 next;
152 }
153
154 if ($sec_tag_name ne "")
155 {
156 $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
157 }
158
159 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
160 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
161 }
162 else {
163 # delete
164 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
165 }
166
167
168 # has the user added a 'metadata' index?
169 my $all_metadata_specified = 0;
170 # which fields have already been indexed? (same as fields, but in a map)
171 my $specified_fields = {};
172
173 # do we have an allfields index??
174 my $allfields_index = 0;
175 # collect up all the text for it in here
176 my $allfields_text = "";
177 foreach my $field (split (/;/, $fields)) {
178 if ($field eq "allfields") {
179 $allfields_index = 1;
180 } elsif ($field eq "metadata") {
181 $all_metadata_specified = 1;
182 }
183 }
184
185 foreach my $field (split (/;/, $fields)) {
186
187 # only deal with this field if it doesn't start with top or
188 # this is the first section
189 my $real_field = $field;
190 next if (($real_field =~ s/^top//) && ($doc_section != 1));
191
192 # process these two later
193 next if ($real_field eq "allfields" || $real_field eq "metadata");
194
195 #individual metadata and or text specified - could be a comma separated list
196 $specified_fields->{$real_field} = 1;
197 my $shortname="";
198 my $new_field = 0; # have we found a new field name?
199 if (defined $self->{'indexfieldmap'}->{$real_field}) {
200 $shortname = $self->{'indexfieldmap'}->{$real_field};
201 }
202 else {
203 $shortname = $self->create_shortname($real_field);
204 $new_field = 1;
205 }
206
207 my @metadata_list = (); # put any metadata values in here
208 my $section_text = ""; # put the text in here
209 foreach my $submeta (split /,/, $real_field) {
210 if ($submeta eq "text") {
211 # no point in indexing text more than once
212 if ($section_text eq "") {
213 $section_text = $doc_obj->get_text($section);
214 if ($self->{'indexing_text'}) {
215 # we always strip html
216 $section_text = $self->preprocess_text($section_text, 1, "");
217 }
218 else {
219 # leave html stuff in, but escape the tags
220 &ghtml::htmlsafe($section_text);
221 }
222 }
223 }
224 else {
225 # its a metadata element
226 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
227 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
228 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
229 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
230 }
231 }
232 push (@metadata_list, @section_metadata);
233 }
234 } # for each field in this one index
235
236 # now we add the text and/or metadata into new_text
237 if ($section_text ne "" || scalar(@metadata_list)) {
238 my $new_text = "";
239
240 if ($section_text ne "") {
241 $new_text .= "$section_text ";
242 }
243
244 foreach my $item (@metadata_list) {
245 &ghtml::htmlsafe($item);
246 $new_text .= "$item ";
247 }
248
249 if ($allfields_index) {
250 $allfields_text .= $new_text;
251 }
252
253 if ($self->{'indexing_text'}) {
254 # add the tag
255 $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
256 }
257 # filter the text
258 $new_text = $self->filter_text ($field, $new_text);
259
260 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
261 $self->{'num_processed_bytes'} += length ($new_text);
262 $text .= "$new_text";
263 }
264 else {
265 # delete
266 $self->{'num_processed_bytes'} -= length ($new_text);
267 }
268
269
270 if ($self->{'indexing_text'} && $new_field) {
271 # we need to add to the list in indexfields
272
273 $self->{'indexfieldmap'}->{$real_field} = $shortname;
274 $self->{'indexfieldmap'}->{$shortname} = 1;
275 }
276
277 }
278
279 } # foreach field
280
281
282 if ($all_metadata_specified) {
283
284 my $new_text = "";
285 my $shortname = "";
286 my $metadata = $doc_obj->get_all_metadata ($section);
287 foreach my $pair (@$metadata) {
288 my ($mfield, $mvalue) = (@$pair);
289 # no value
290 next unless defined $mvalue && $mvalue ne "";
291 # we have already indexed this
292 next if defined ($specified_fields->{$mfield});
293 # check fields here, maybe others dont want - change to use dontindex!!
294 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
295 next if ($mfield =~ /^gsdl/);
296
297 &ghtml::htmlsafe($mvalue);
298
299 if (defined $self->{'indexfieldmap'}->{$mfield}) {
300 $shortname = $self->{'indexfieldmap'}->{$mfield};
301 }
302 else {
303 $shortname = $self->create_shortname($mfield);
304 $self->{'indexfieldmap'}->{$mfield} = $shortname;
305 $self->{'indexfieldmap'}->{$shortname} = 1;
306 }
307 $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
308 if ($allfields_index) {
309 $allfields_text .= "$mvalue ";
310 }
311
312 if (!defined $self->{'indexfields'}->{$mfield}) {
313 $self->{'indexfields'}->{$mfield} = 1;
314 }
315
316 }
317 # filter the text
318 $new_text = $self->filter_text ("metadata", $new_text);
319
320 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
321 $self->{'num_processed_bytes'} += length ($new_text);
322 $text .= "$new_text";
323 }
324 else {
325 # delete
326 $self->{'num_processed_bytes'} -= length ($new_text);
327 }
328 }
329
330 if ($allfields_index) {
331 # add the index name mapping
332 $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
333 $self->{'indexfieldmap'}->{"ZZ"} = 1;
334
335 my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
336 # filter the text
337 $new_text = $self->filter_text ("allfields", $new_text);
338
339 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
340 $self->{'num_processed_bytes'} += length ($new_text);
341 $text .= "$new_text";
342 }
343 else {
344 # delete
345 $self->{'num_processed_bytes'} -= length ($new_text);
346 }
347 }
348
349 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
350
351 $section = $doc_obj->get_next_section($section);
352 } # while defined section
353
354 print $lucenehandle "$text\n$documentendtag";
355
356## if ($edit_mode eq "delete") {
357## print STDERR "$text\n$documentendtag";
358## }
359
360}
361
362sub text {
363 my $self = shift (@_);
364 my ($doc_obj,$file) = @_;
365
366 $self->textedit($doc_obj,$file,"add");
367}
368
369sub textreindex
370{
371 my $self = shift (@_);
372 my ($doc_obj,$file) = @_;
373
374 $self->textedit($doc_obj,$file,"update");
375}
376
377sub textdelete
378{
379 my $self = shift (@_);
380 my ($doc_obj,$file) = @_;
381
382 $self->textedit($doc_obj,$file,"delete");
383}
384
385
386
387
388
389# /** We make this builder pretend to be a document processor so we can get
390# * information back from the plugins.
391# *
392# * @param $self A reference to this Lucene builder
393# * @param $doc_obj A reference to a document object representing what was
394# * parsed by the GAPlug
395# * @param $file The name of the file parsed as a string
396# *
397# * @author John Thompson, DL Consulting Ltd
398# */
399sub process()
400 {
401 my $self = shift (@_);
402 my ($doc_obj, $file) = @_;
403
404 # If this is called from any stage other than an incremental infodb we want
405 # to pass through to the superclass of build
406 if ($self->get_mode() eq "incinfodb")
407 {
408 print STDERR "*** Processing a document added using INCINFODB ***\n" if ($self->{'verbosity'} > 3);
409 my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
410 $archivedir = "" unless defined $archivedir;
411 $archivedir =~ s/\\/\//g;
412 $archivedir =~ s/^\/+//;
413 $archivedir =~ s/\/+$//;
414
415 # Number of files
416 print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
417
418 # resolve the final filenames of the files associated with this document
419 $self->assoc_files ($doc_obj, $archivedir);
420
421 # is this a paged or a hierarchical document
422 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
423
424 # Determine the actual docnum by checking if we've processed any
425 # previous incrementally added documents. If so, carry on from there.
426 # Otherwise we set the counter to be the same as the number of
427 # sections encountered during the previous build
428 if ($self->{'numincdocs'} == 0)
429 {
430 $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
431 }
432
433 my $section = $doc_obj->get_top_section ();
434 print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
435 my $doc_OID = $doc_obj->get_OID();
436 my $url = "";
437 while (defined $section)
438 {
439 print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
440 # Attach all the other metadata to this document
441 # output the fact that this document is a document (unless doctype
442 # has been set to something else from within a plugin
443 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
444 if (!defined $dtype || $dtype !~ /\w/)
445 {
446 #$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
447 $doc_obj->add_utf8_metadata($section, "doctype", "doc");
448 }
449 # output whether this node contains text
450 if ($doc_obj->get_text_length($section) > 0)
451 {
452 $doc_obj->add_utf8_metadata($section, "hastxt", 1);
453 }
454 else
455 {
456 $doc_obj->add_utf8_metadata($section, "hastxt", 0);
457 }
458
459 # output archivedir if at top level
460 if ($section eq $doc_obj->get_top_section())
461 {
462 $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
463 $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
464 }
465
466 # output a list of children
467 my $children = $doc_obj->get_children ($section);
468 if (scalar(@$children) > 0)
469 {
470 $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
471 my @contains = ();
472 foreach my $child (@$children)
473 {
474 if ($child =~ /^.*?\.(\d+)$/)
475 {
476 push (@contains, "\".$1");
477 }
478 else
479 {
480 push (@contains, "\".$child");
481 }
482 }
483 $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
484 }
485 #output the matching doc number
486 print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
487 $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
488
489 $self->{'numincdocs'}++;
490 $section = $doc_obj->get_next_section($section);
491 # if no sections wanted, only add the docs
492 last if ($self->{'db_level'} eq "document");
493 }
494 print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
495 &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
496 }
497 else
498 {
499 $self->mgppbuildproc::process(@_);
500 }
501 }
502# /** process() **/
503
504
505# Following methods seem to be no different to those defined in basebuildproc.pm
506# From inspection, it looks like these ones can be removed
507
508
509sub get_num_docs {
510 my $self = shift (@_);
511 #rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
512 return $self->{'num_docs'};
513}
514
515sub get_num_sections {
516 my $self = shift (@_);
517 #rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
518 return $self->{'num_sections'};
519}
520
521# num_bytes is the actual number of bytes in the collection
522# this is normally the same as what's processed during text compression
523sub get_num_bytes {
524 my $self = shift (@_);
525 #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
526 return $self->{'num_bytes'};
527}
528
529
530# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
531# Otherwise the removal of tags below might lead to Lucene turning
532# "...farming</p>\n<p>EDWARD.." into "farmingedward"
533# (example from demo collection b20cre)
534# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
535sub preprocess_text
536{
537 my $self = shift (@_);
538 my ($text, $strip_html, $para) = @_;
539 # at this stage, we do not do paragraph tags unless have strip_html -
540 # it will result in a huge mess of non-xml
541 return unless $strip_html;
542
543 my $new_text = $text;
544
545 # if we have <pre> tags, we can have < > inside them, need to delete
546 # the <> before stripping tags
547 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
548
549 if ($para eq "") {
550 # just remove all tags
551 $new_text =~ s/<[^>]*>/ /gs;
552 } else {
553 # strip all tags except <p> tags which get turned into $para
554 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
555 }
556
557 # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
558 # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
559 $new_text =~ s/&\w{1,10};//g;
560 # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
561 $new_text =~ s/&([^\#])/ $1/g;
562
563 return $new_text;
564}
565
566
5671;
568
569
Note: See TracBrowser for help on using the repository browser.