source: gsdl/trunk/perllib/lucenebuildproc.pm@ 20419

Last change on this file since 20419 was 20419, checked in by kjdon, 15 years ago

strip off ex. before retrieving metadata for indexing. ex. now valid in collect.cfg

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Establish the inheritance chain at compile time: lucenebuildproc derives
# from mgppbuildproc, so any method not overridden in this file is looked
# up there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# forwarded untouched), initialises the 'numincdocs' counter to 0 and
# reblesses the result into the requested class.
#
# Arguments: $class followed by whatever mgppbuildproc's constructor accepts.
# Returns:   the new object, blessed into $class.
sub new {
    my $class = shift @_;

    # Explicit Class->method call: the indirect-object form "new Class (...)"
    # is ambiguous to the parser, all the more so in a package that defines
    # its own sub called "new".
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Reports whether this build processor supports incremental building.
#
# Returns: always 1.
sub is_incremental_capable
{
    my ($self) = @_;

    # Unlike MG and MGPP, Lucene supports incremental building
    return 1;
}
63
64
# Serialise one document as XML for the Lucene indexer and print it to
# $self->{'output_handle'}.
#
# Arguments:
#   $doc_obj   - the document object to serialise
#   $file      - value written into the file="" attribute of the document tag
#   $edit_mode - "add", "update" or "delete"; it is written into the
#                gs2:mode attribute, and any value other than "add"/"update"
#                is treated as a removal when the statistics are adjusted
#
# Side effects: updates the num_docs, num_sections, num_bytes and
# num_processed_bytes counters, and registers newly seen fields in
# $self->{'indexfieldmap'} / $self->{'indexfields'}.
#
# Returns nothing when the document is skipped; otherwise the result of the
# final print.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # "add" and "update" put content into the index, everything else removes it
    my $adding = (($edit_mode eq "add") || ($edit_mode eq "update"));

    # this is another document
    $self->{'num_docs'} += ($adding ? 1 : -1);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The index specification does not change while the document is being
    # processed, so split it and look for the two special indexes only once.
    my @index_fields = split (/;/, $fields);

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;
    # do we have an allfields index??
    my $allfields_index = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};

    # gs2_id should be deprecated #####
    my $gs2_id = "";
    if ($ldoc_level) {
        # default is section level
        $gs2_id = ($self->{'db_level'} eq 'document')
            ? $self->{'num_docs'}
            : ($self->{'num_sections'} + 1);
    }
    my $gs2_docOID = $doc_obj->get_OID();

    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    my $text = $documenttag;

    # Account for one chunk of generated XML: when adding it is appended to
    # the output and counted, when deleting it is only subtracted from the
    # processed-bytes statistic.
    my $emit = sub {
        my ($chunk) = @_;
        if ($adding) {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= "$chunk";
        }
        else {
            # delete
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # get the text for this document
    my $top_section = $doc_obj->get_top_section();
    my $section = $top_section;
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_id = $self->{'num_sections'};
        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n";
            }
            next; # the continue block below moves on to the next section
        }

        if ($sec_tag_name ne "") {
            $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        my $section_bytes = $doc_obj->get_text_length ($section);
        if ($adding) {
            $self->{'num_bytes'} += $section_bytes;
        }
        else {
            # delete
            $self->{'num_bytes'} -= $section_bytes;
        }

        # which fields have already been indexed in this section?
        my $specified_fields = {};
        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # Remember the element under the name it is looked up
                    # with. The 'metadata' pass below compares against bare
                    # element names, so recording only the raw index spec
                    # ("ex.Title", "dc.Title,Title") would let the same
                    # values be indexed a second time.
                    $specified_fields->{$submeta} = 1;

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $top_section && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing found for this index in this section
            next unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";
            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }
            foreach my $item (@metadata_list) {
                &ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
            }
            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $emit->($new_text);

            if ($self->{'indexing_text'} && $new_field) {
                # we need to add to the list in indexfields
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {
            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($specified_fields->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname = $self->{'indexfieldmap'}->{$mfield};
                if (!defined $shortname) {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
    }
    continue {
        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $lucenehandle "$text\n$documentendtag";
}
363
# Process a document in "add" mode.
#
# Arguments: $doc_obj - the document object; $file - its file name.
# Returns whatever textedit() returns.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
370
# Process a document in "update" mode.
#
# Arguments: $doc_obj - the document object; $file - its file name.
# Returns whatever textedit() returns.
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
378
# Process a document in "delete" mode.
#
# Arguments: $doc_obj - the document object; $file - its file name.
# Returns whatever textedit() returns.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "delete");
}
386
387
388
389
390
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  When the current mode is "incinfodb", each section of the document is
#  *  annotated with doctype, hastxt, docnum and (where applicable)
#  *  archivedir, thistype, childtype and contains metadata, and the document
#  *  is then handed to IncrementalBuildUtils::addDocument(). In every other
#  *  mode the call is passed straight through to mgppbuildproc::process().
#  *
#  *  @param $self    A reference to this Lucene builder
#  *  @param $doc_obj A reference to a document object representing what was
#  *                  parsed by the GAPlug
#  *  @param $file    The name of the file parsed as a string
#  *  @return the result of mgppbuildproc::process() or of
#  *          IncrementalBuildUtils::addDocument(), depending on the mode
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    # No prototype on purpose: this is a method that takes arguments, and
    # prototypes are ignored for method calls anyway.
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb") {
        return $self->mgppbuildproc::process(@_);
    }

    my $verbose = ($self->{'verbosity'} > 3);

    print STDERR "*** Processing a document added using INCINFODB ***\n" if ($verbose);

    # The archive directory is everything before the last path separator,
    # normalised to forward slashes with no leading or trailing slash.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    if ($verbose) {
        print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n";
    }

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0) {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if ($verbose);

    while (defined $section) {
        print STDERR "+ processing section: '$section'\n" if ($verbose);

        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $has_text = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $has_text);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children) {
                # each entry is written as ". followed by the child's final
                # numeric component, or by the whole child id if it has none
                if ($child =~ /^.*?\.(\d+)$/) {
                    push (@contains, "\".$1");
                }
                else {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($verbose);
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($verbose);
    return &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
505
506
507# Following methods seem to be no different to those defined in basebuildproc.pm
508# From inspection, it looks like these ones can be removed
509
510
# Accessor for the 'num_docs' counter held on the object.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
516
# Accessor for the 'num_sections' counter held on the object.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
522
# Accessor for the 'num_bytes' counter held on the object.
# num_bytes is the actual number of bytes in the collection;
# this is normally the same as what's processed during text compression
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
530
531
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
# "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# Arguments:
#   $text       - the text to clean up
#   $strip_html - must be true; when false nothing is returned (see below)
#   $para       - "" to turn every tag into a space, otherwise tags are
#                 passed to process_tags() together with this value
#
# Returns the cleaned text: tags removed, named entities removed and every
# '&' that does not start a numeric character reference replaced by a space.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A lookahead is used rather than consuming the following character, so
    # that consecutive ampersands ("&&") and an ampersand at the very end of
    # the text are removed as well.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
567
568
5691;
570
571
Note: See TracBrowser for help on using the repository browser.