source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 27329

Last change on this file since 27329 was 27329, checked in by kjdon, 11 years ago

changes to match mgpp's shortname mapping stuff. Changed some util functions to FileUtil functions

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
41sub BEGIN {
42 @lucenebuildproc::ISA = ('mgppbuildproc');
43}
44
45
46sub new {
47 my $class = shift @_;
48 my $self = new mgppbuildproc (@_);
49
50 $self->{'numincdocs'} = 0;
51
52 return bless $self, $class;
53}
54
55
56sub is_incremental_capable
57{
58 my $self = shift (@_);
59
60 # Unlike MG and MGPP, Lucene supports incremental building
61 return 1;
62}
63
64
65sub textedit {
66 my $self = shift (@_);
67 my ($doc_obj,$file,$edit_mode) = @_;
68
69 my $lucenehandle = $self->{'output_handle'};
70 my $outhandle = $self->{'outhandle'};
71
72 # only output this document if it is one to be indexed
73 return if ($doc_obj->get_doc_type() ne "indexed_doc");
74
75 # skip this document if in "compress-text" mode and asked to delete it
76 return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
77
78 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
79
80 # this is another document
81 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
82 $self->{'num_docs'} += 1;
83 }
84 else {
85 $self->{'num_docs'} -= 1;
86 }
87
88 # get the parameters for the output
89 # split on : just in case there is subcoll and lang stuff
90 my ($fields) = split (/:/, $self->{'index'});
91
92 my $doc_tag_name = $mgppbuildproc::level_map{'document'};
93
94 my $levels = $self->{'levels'};
95 my $ldoc_level = $levels->{'document'};
96 my $lsec_level = $levels->{'section'};
97
98 my $gs2_docOID = $doc_obj->get_OID();
99 my $documenttag = undef;
100 my $documentendtag = undef;
101
102 $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
103 $documentendtag = "\n</$doc_tag_name>\n";
104
105 my $sec_tag_name = "";
106 if ($lsec_level)
107 {
108 $sec_tag_name = $mgppbuildproc::level_map{'section'};
109 }
110
111 my $doc_section = 0; # just for this document
112
113 my $text = "";
114 $text .= $documenttag;
115 # get the text for this document
116 my $section = $doc_obj->get_top_section();
117 while (defined $section)
118 {
119 # update a few statistics
120 $doc_section++;
121 $self->{'num_sections'}++;
122
123 my $sec_gs2_id = $self->{'num_sections'};
124 my $sec_gs2_docOID = $gs2_docOID;
125 $sec_gs2_docOID .= ".$section" if ($section ne "");
126
127 # if we are doing subcollections, then some docs shouldn't be indexed.
128 # but we need to put the section tag placeholders in there so the
129 # sections match up with database
130 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
131 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
132 if ($sec_tag_name ne "") {
133 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
134 $text .= "\n</$sec_tag_name>\n"
135 }
136 $section = $doc_obj->get_next_section($section);
137 next;
138 }
139
140 if ($sec_tag_name ne "")
141 {
142 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
143 }
144
145 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
146 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
147 }
148 else {
149 # delete
150 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
151 }
152
153
154 # has the user added a 'metadata' index?
155 my $all_metadata_specified = 0;
156 # which fields have already been indexed? (same as fields, but in a map)
157 my $specified_fields = {};
158
159 # do we have an allfields index??
160 my $allfields_index = 0;
161 # collect up all the text for it in here
162 my $allfields_text = "";
163 foreach my $field (split (/;/, $fields)) {
164 if ($field eq "allfields") {
165 $allfields_index = 1;
166 } elsif ($field eq "metadata") {
167 $all_metadata_specified = 1;
168 }
169 }
170
171 foreach my $field (split (/;/, $fields)) {
172
173 # only deal with this field if it doesn't start with top or
174 # this is the first section
175 my $real_field = $field;
176 next if (($real_field =~ s/^top//) && ($doc_section != 1));
177
178 # process these two later
179 next if ($real_field eq "allfields" || $real_field eq "metadata");
180
181 #individual metadata and or text specified - could be a comma separated list
182 $specified_fields->{$real_field} = 1;
183 my $shortname="";
184 my $new_field = 0; # have we found a new field name?
185 if (defined $self->{'fieldnamemap'}->{$real_field}) {
186 $shortname = $self->{'fieldnamemap'}->{$real_field};
187 } else {
188 $shortname = $self->create_shortname($real_field);
189 $self->{'fieldnamemap'}->{$real_field} = $shortname;
190 $self->{'fieldnamemap'}->{$shortname} = 1;
191 }
192 my @metadata_list = (); # put any metadata values in here
193 my $section_text = ""; # put the text in here
194 foreach my $submeta (split /,/, $real_field) {
195 if ($submeta eq "text") {
196 # no point in indexing text more than once
197 if ($section_text eq "") {
198 $section_text = $doc_obj->get_text($section);
199 if ($self->{'indexing_text'}) {
200 # we always strip html
201 $section_text = $self->preprocess_text($section_text, 1, "");
202 }
203 else {
204 # leave html stuff in, but escape the tags
205 &ghtml::htmlsafe($section_text);
206 }
207 }
208 }
209 else {
210 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
211
212 # its a metadata element
213 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
214 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
215 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
216 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
217 }
218 }
219 push (@metadata_list, @section_metadata);
220 }
221 } # for each field in this one index
222
223
224 # now we add the text and/or metadata into new_text
225 if ($section_text ne "" || scalar(@metadata_list)) {
226 my $new_text = "";
227
228 if ($section_text ne "") {
229 $new_text .= "$section_text ";
230 }
231
232 foreach my $item (@metadata_list) {
233 &ghtml::htmlsafe($item);
234 $new_text .= "$item ";
235 }
236
237 if ($allfields_index) {
238 $allfields_text .= $new_text;
239 }
240
241 if ($self->{'indexing_text'}) {
242 # add the tag
243 $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
244 $self->{'allindexfields'}->{$real_field} = 1;
245 }
246 # filter the text
247 $new_text = $self->filter_text ($field, $new_text);
248
249 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
250 $self->{'num_processed_bytes'} += length ($new_text);
251 $text .= "$new_text";
252 }
253 else {
254 # delete
255 $self->{'num_processed_bytes'} -= length ($new_text);
256 }
257 }
258
259 } # foreach field
260
261 if ($all_metadata_specified) {
262
263 my $new_text = "";
264 my $shortname = "";
265 my $metadata = $doc_obj->get_all_metadata ($section);
266 foreach my $pair (@$metadata) {
267 my ($mfield, $mvalue) = (@$pair);
268 # no value
269 next unless defined $mvalue && $mvalue ne "";
270 # we have already indexed this
271 next if defined ($specified_fields->{$mfield});
272 # check fields here, maybe others dont want - change to use dontindex!!
273 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
274 next if ($mfield =~ /^gsdl/);
275
276 &ghtml::htmlsafe($mvalue);
277
278 if (defined $self->{'fieldnamemap'}->{$mfield}) {
279 $shortname = $self->{'fieldnamemap'}->{$mfield};
280 }
281 else {
282 $shortname = $self->create_shortname($mfield);
283 $self->{'fieldnamemap'}->{$mfield} = $shortname;
284 $self->{'fieldnamemap'}->{$shortname} = 1;
285 }
286 $self->{'allindexfields'}->{$mfield} = 1;
287 $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
288 if ($allfields_index) {
289 $allfields_text .= "$mvalue ";
290 }
291
292 if (!defined $self->{'extraindexfields'}->{$mfield}) {
293 $self->{'extraindexfields'}->{$mfield} = 1;
294 }
295
296 }
297 # filter the text
298 $new_text = $self->filter_text ("metadata", $new_text);
299
300 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
301 $self->{'num_processed_bytes'} += length ($new_text);
302 $text .= "$new_text";
303 }
304 else {
305 # delete
306 $self->{'num_processed_bytes'} -= length ($new_text);
307 }
308 }
309
310 if ($allfields_index) {
311
312 my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
313 # filter the text
314 $new_text = $self->filter_text ("allfields", $new_text);
315
316 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
317 $self->{'num_processed_bytes'} += length ($new_text);
318 $text .= "$new_text";
319 }
320 else {
321 # delete
322 $self->{'num_processed_bytes'} -= length ($new_text);
323 }
324 }
325
326 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
327
328 $section = $doc_obj->get_next_section($section);
329 } # while defined section
330
331 print $lucenehandle "$text\n$documentendtag";
332
333## if ($edit_mode eq "delete") {
334## print STDERR "$text\n$documentendtag";
335## }
336
337}
338
339sub text {
340 my $self = shift (@_);
341 my ($doc_obj,$file) = @_;
342
343 $self->textedit($doc_obj,$file,"add");
344}
345
346sub textreindex
347{
348 my $self = shift (@_);
349 my ($doc_obj,$file) = @_;
350
351 $self->textedit($doc_obj,$file,"update");
352}
353
354sub textdelete
355{
356 my $self = shift (@_);
357 my ($doc_obj,$file) = @_;
358
359 $self->textedit($doc_obj,$file,"delete");
360}
361
362
363
364
365
366# /** We make this builder pretend to be a document processor so we can get
367# * information back from the plugins.
368# *
369# * @param $self A reference to this Lucene builder
370# * @param $doc_obj A reference to a document object representing what was
371# * parsed by the GAPlug
372# * @param $file The name of the file parsed as a string
373# *
374# * @author John Thompson, DL Consulting Ltd
375# */
376sub process()
377 {
378 my $self = shift (@_);
379 my ($doc_obj, $file) = @_;
380
381 # If this is called from any stage other than an incremental infodb we want
382 # to pass through to the superclass of build
383 if ($self->get_mode() eq "incinfodb")
384 {
385 print STDERR "*** Processing a document added using INCINFODB ***\n" if ($self->{'verbosity'} > 3);
386 my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
387 $archivedir = "" unless defined $archivedir;
388 $archivedir =~ s/\\/\//g;
389 $archivedir =~ s/^\/+//;
390 $archivedir =~ s/\/+$//;
391
392 # Number of files
393 print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
394
395 # resolve the final filenames of the files associated with this document
396 $self->assoc_files ($doc_obj, $archivedir);
397
398 # is this a paged or a hierarchical document
399 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
400
401 # Determine the actual docnum by checking if we've processed any
402 # previous incrementally added documents. If so, carry on from there.
403 # Otherwise we set the counter to be the same as the number of
404 # sections encountered during the previous build
405 if ($self->{'numincdocs'} == 0)
406 {
407 $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
408 }
409
410 my $section = $doc_obj->get_top_section ();
411 print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
412 my $doc_OID = $doc_obj->get_OID();
413 my $url = "";
414 while (defined $section)
415 {
416 print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
417 # Attach all the other metadata to this document
418 # output the fact that this document is a document (unless doctype
419 # has been set to something else from within a plugin
420 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
421 if (!defined $dtype || $dtype !~ /\w/)
422 {
423 #$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
424 $doc_obj->add_utf8_metadata($section, "doctype", "doc");
425 }
426 # output whether this node contains text
427 if ($doc_obj->get_text_length($section) > 0)
428 {
429 $doc_obj->add_utf8_metadata($section, "hastxt", 1);
430 }
431 else
432 {
433 $doc_obj->add_utf8_metadata($section, "hastxt", 0);
434 }
435
436 # output archivedir if at top level
437 if ($section eq $doc_obj->get_top_section())
438 {
439 $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
440 $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
441 }
442
443 # output a list of children
444 my $children = $doc_obj->get_children ($section);
445 if (scalar(@$children) > 0)
446 {
447 $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
448 my @contains = ();
449 foreach my $child (@$children)
450 {
451 if ($child =~ /^.*?\.(\d+)$/)
452 {
453 push (@contains, "\".$1");
454 }
455 else
456 {
457 push (@contains, "\".$child");
458 }
459 }
460 $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
461 }
462 #output the matching doc number
463 print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
464 $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
465
466 $self->{'numincdocs'}++;
467 $section = $doc_obj->get_next_section($section);
468 # if no sections wanted, only add the docs
469 last if ($self->{'db_level'} eq "document");
470 }
471 print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
472 &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
473 }
474 else
475 {
476 $self->mgppbuildproc::process(@_);
477 }
478 }
479# /** process() **/
480
481
482# Following methods seem to be no different to those defined in basebuildproc.pm
483# From inspection, it looks like these ones can be removed
484
485
486sub get_num_docs {
487 my $self = shift (@_);
488 #rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
489 return $self->{'num_docs'};
490}
491
492sub get_num_sections {
493 my $self = shift (@_);
494 #rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
495 return $self->{'num_sections'};
496}
497
498# num_bytes is the actual number of bytes in the collection
499# this is normally the same as what's processed during text compression
500sub get_num_bytes {
501 my $self = shift (@_);
502 #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
503 return $self->{'num_bytes'};
504}
505
506
507# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
508# Otherwise the removal of tags below might lead to Lucene turning
509# "...farming</p>\n<p>EDWARD.." into "farmingedward"
510# (example from demo collection b20cre)
511# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
512sub preprocess_text
513{
514 my $self = shift (@_);
515 my ($text, $strip_html, $para) = @_;
516 # at this stage, we do not do paragraph tags unless have strip_html -
517 # it will result in a huge mess of non-xml
518 return unless $strip_html;
519
520 my $new_text = $text;
521
522 # if we have <pre> tags, we can have < > inside them, need to delete
523 # the <> before stripping tags
524 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
525
526 if ($para eq "") {
527 # just remove all tags
528 $new_text =~ s/<[^>]*>/ /gs;
529 } else {
530 # strip all tags except <p> tags which get turned into $para
531 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
532 }
533
534 # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
535 # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
536 $new_text =~ s/&\w{1,10};//g;
537 # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
538 $new_text =~ s/&([^\#])/ $1/g;
539
540 return $new_text;
541}
542
543sub delete_assoc_files
544{
545 my $self = shift (@_);
546 my ($archivedir, $edit_mode) = @_;
547
548 $self->basebuildproc::delete_assoc_files(@_);
549
550 if ($edit_mode eq "delete") {
551 # if we are deleting the doc, then also delete the lucene text version
552 my $assoc_dir = &util::filename_cat($self->{'build_dir'},"text", $archivedir);
553 if (-d $assoc_dir) {
554 &util::rm_r($assoc_dir);
555 }
556 }
557}
5581;
559
560
Note: See TracBrowser for help on using the repository browser.