source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 27358

Last change on this file since 27358 was 27358, checked in by kjdon, 11 years ago

Indexing sortfields separately. Tidied up the parsing of the indexes list: shortnames etc. are now worked out only once, not for every document and every section. Note: the same still needs to be done for sort fields.

  • Property svn:keywords set to Author Date Id Revision
File size: 20.4 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Set up inheritance at compile time: lucenebuildproc derives from mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# passed through unchanged), initialises the Lucene-specific state and
# re-blesses the object into $class.
#
# Returns the new lucenebuildproc object.
sub new {
    my $class = shift @_;

    # Direct method call rather than indirect object syntax ("new Class (...)"),
    # which Perl can mis-parse depending on what is in scope.
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;
    # Map of fields actually specified in the index. This must be a hash
    # reference: assigning the empty list () in scalar context stores undef.
    $self->{'specified_fields'} = {};
    $self->{'allfields_index'} = 0;        # do we need allfields index?
    $self->{'all_metadata_specified'} = 0; # are we indexing all metadata?
    $self->{'actualsortfields'} = {};      # sort fields that have actually been used
    $self->{'sortfieldnamemap'} = {};      # mapping between field name and field shortname, eg dc.Title->byTI

    return bless $self, $class;
}
58
# Record the index specification and work out which fields it names.
#
# The superclass set_index is called first; the field list is then read from
# $self->{'index'} (everything before the first ':'), split on ';' and
# classified:
#   "allfields" -> sets $self->{'allfields_index'}
#   "metadata"  -> sets $self->{'all_metadata_specified'}
#   otherwise   -> recorded in $self->{'specified_fields'} with any leading
#                  "top" removed
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->mgppbuildproc::set_index($index, $indexexparr);

    # drop any subcollection/language part that follows the first colon
    my ($field_list) = split(/:/, $self->{'index'});

    for my $entry (split(/;/, $field_list)) {
        if ($entry eq "allfields") {
            $self->{'allfields_index'} = 1;
            next;
        }
        if ($entry eq "metadata") {
            $self->{'all_metadata_specified'} = 1;
            next;
        }
        (my $bare_name = $entry) =~ s/^top//;
        $self->{'specified_fields'}->{$bare_name} = 1;
    }
}
79
# Store the sort fields for this builder. Only the first argument is kept;
# textedit later dereferences it as an array reference of field names.
sub set_sortfields {
    my ($self, @rest) = @_;

    ($self->{'sortfields'}) = @rest;
}
85
# Capability flag: always true, because (unlike MG and MGPP) Lucene
# supports incremental building.
sub is_incremental_capable
{
    my ($self) = @_;

    return 1;
}
93
94
# textedit($doc_obj, $file, $edit_mode)
#
# Builds the XML text block for one document and prints it to
# $self->{'output_handle'}.
#
#   $doc_obj   - the document object to output
#   $file      - file name, written into the file="" attribute of the document tag
#   $edit_mode - "add", "update" or "delete" (the modes passed by text,
#                textreindex and textdelete); any value other than "add" or
#                "update" is treated as a delete when updating the counters
#
# Returns early (printing nothing) when the document type is not
# "indexed_doc", or when text is not being indexed and the mode is "delete".
#
# Side effects on $self: updates num_docs, num_sections, num_bytes and
# num_processed_bytes, and records names in fieldnamemap, allindexfields,
# extraindexfields, sortfieldnamemap and actualsortfields.
#
# NOTE(review): $outhandle, $ldoc_level, $sec_gs2_id and $new_field are
# assigned below but never read in this sub.
95sub textedit {
96 my $self = shift (@_);
97 my ($doc_obj,$file,$edit_mode) = @_;
98
99 my $lucenehandle = $self->{'output_handle'};
100 my $outhandle = $self->{'outhandle'};
101
102 # only output this document if it is one to be indexed
103 return if ($doc_obj->get_doc_type() ne "indexed_doc");
104
105 # skip this document if in "compress-text" mode and asked to delete it
106 return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
107
108 # 0/1 to indicate whether this doc is part of the specified subcollection
109 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
110
111 # this is another document
112 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
113 $self->{'num_docs'} += 1;
114 }
115 else {
116 $self->{'num_docs'} -= 1;
117 }
118
119
120 # get the parameters for the output
121 # split on : just in case there is subcoll and lang stuff
122 my ($fields) = split (/:/, $self->{'index'});
123
124 my $doc_tag_name = $mgppbuildproc::level_map{'document'};
125
126 my $levels = $self->{'levels'};
127 my $ldoc_level = $levels->{'document'};
128 my $lsec_level = $levels->{'section'};
129
130 my $gs2_docOID = $doc_obj->get_OID();
131 my $documenttag = undef;
132 my $documentendtag = undef;
133
134 $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
135 $documentendtag = "\n</$doc_tag_name>\n";
136
# section tags are only emitted when a section level is configured;
# an empty $sec_tag_name means "no section tags"
137 my $sec_tag_name = "";
138 if ($lsec_level)
139 {
140 $sec_tag_name = $mgppbuildproc::level_map{'section'};
141 }
142
143 my $doc_section = 0; # just for this document
144
145 my $text = "";
146 $text .= $documenttag;
147 # get the text for this document
148 my $section = $doc_obj->get_top_section();
149 while (defined $section)
150 {
151 # update a few statistics
152 $doc_section++;
153 $self->{'num_sections'}++;
154
155 my $sec_gs2_id = $self->{'num_sections'};
156 my $sec_gs2_docOID = $gs2_docOID;
157 $sec_gs2_docOID .= ".$section" if ($section ne "");
158
159 # if we are doing subcollections, then some docs shouldn't be indexed.
160 # but we need to put the section tag placeholders in there so the
161 # sections match up with database
162 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
163 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
164 if ($sec_tag_name ne "") {
165 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
166 $text .= "\n</$sec_tag_name>\n"
167 }
168 $section = $doc_obj->get_next_section($section);
169 next;
170 }
171
172 if ($sec_tag_name ne "")
173 {
174 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
175 }
176
177 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
178 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
179 }
180 else {
181 # delete
182 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
183 }
184
185
186 # collect up all the text for allfields index in here (if there is one)
187 my $allfields_text = "";
188
189 foreach my $field (split (/;/, $fields)) {
190
191 # only deal with this field if it doesn't start with top or
192 # this is the first section
193 my $real_field = $field;
194 next if (($real_field =~ s/^top//) && ($doc_section != 1));
195
196 # process these two later
197 next if ($real_field eq "allfields" || $real_field eq "metadata");
198
199 #individual metadata and or text specified - could be a comma separated list
200 #$specified_fields->{$real_field} = 1;
201 my $shortname="";
202 my $new_field = 0; # have we found a new field name?
203 if (defined $self->{'fieldnamemap'}->{$real_field}) {
204 $shortname = $self->{'fieldnamemap'}->{$real_field};
205 } else {
206 $shortname = $self->create_shortname($real_field);
207 $self->{'fieldnamemap'}->{$real_field} = $shortname;
# the shortname itself is also stored as a key in the same map
# (presumably so it is not handed out again - TODO confirm against create_shortname)
208 $self->{'fieldnamemap'}->{$shortname} = 1;
209 }
210 my @metadata_list = (); # put any metadata values in here
211 my $section_text = ""; # put the text in here
212 foreach my $submeta (split /,/, $real_field) {
213 if ($submeta eq "text") {
214 # no point in indexing text more than once
215 if ($section_text eq "") {
216 $section_text = $doc_obj->get_text($section);
217 if ($self->{'indexing_text'}) {
218 # we always strip html
219 $section_text = $self->preprocess_text($section_text, 1, "");
220 }
221 else {
222 # leave html stuff in, but escape the tags
223 &ghtml::htmlsafe($section_text);
224 }
225 }
226 }
227 else {
228 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
229
230 # its a metadata element
231 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
# for non-top sections, optionally add the top section's values for this
# element, depending on the 'sections_index_document_metadata' setting
232 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
233 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
234 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
235 }
236 }
237 push (@metadata_list, @section_metadata);
238 }
239 } # for each field in this one index
240
241
242 # now we add the text and/or metadata into new_text
243 if ($section_text ne "" || scalar(@metadata_list)) {
244 my $new_text = "";
245
246 if ($section_text ne "") {
247 $new_text .= "$section_text ";
248 }
249
250 foreach my $item (@metadata_list) {
251 &ghtml::htmlsafe($item);
252 $new_text .= "$item ";
253 }
254
255 if ($self->{'allfields_index'}) {
256 $allfields_text .= $new_text;
257 }
258
259 if ($self->{'indexing_text'}) {
260 # add the tag
261 $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
262 $self->{'allindexfields'}->{$real_field} = 1;
263 }
264 # filter the text
265 $new_text = $self->filter_text ($field, $new_text);
266
# in delete mode the field text is only subtracted from the byte count;
# it is not appended to the output
267 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
268 $self->{'num_processed_bytes'} += length ($new_text);
269 $text .= "$new_text";
270 }
271 else {
272 # delete
273 $self->{'num_processed_bytes'} -= length ($new_text);
274 }
275 }
276
277 } # foreach field
278
279 if ($self->{'all_metadata_specified'}) {
280
281 my $new_text = "";
282 my $shortname = "";
283 my $metadata = $doc_obj->get_all_metadata ($section);
284 foreach my $pair (@$metadata) {
285 my ($mfield, $mvalue) = (@$pair);
286 # no value
287 next unless defined $mvalue && $mvalue ne "";
288 # we have already indexed this
289 next if defined ($self->{'specified_fields'}->{$mfield});
290 # check fields here, maybe others dont want - change to use dontindex!!
291 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
292 next if ($mfield =~ /^gsdl/);
293
294 &ghtml::htmlsafe($mvalue);
295
296 if (defined $self->{'fieldnamemap'}->{$mfield}) {
297 $shortname = $self->{'fieldnamemap'}->{$mfield};
298 }
299 else {
300 $shortname = $self->create_shortname($mfield);
301 $self->{'fieldnamemap'}->{$mfield} = $shortname;
302 $self->{'fieldnamemap'}->{$shortname} = 1;
303 }
304 $self->{'allindexfields'}->{$mfield} = 1;
305 $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
306 if ($self->{'allfields_index'}) {
307 $allfields_text .= "$mvalue ";
308 }
309
310 if (!defined $self->{'extraindexfields'}->{$mfield}) {
311 $self->{'extraindexfields'}->{$mfield} = 1;
312 }
313
314 }
315 # filter the text
316 $new_text = $self->filter_text ("metadata", $new_text);
317
318 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
319 $self->{'num_processed_bytes'} += length ($new_text);
320 $text .= "$new_text";
321 }
322 else {
323 # delete
324 $self->{'num_processed_bytes'} -= length ($new_text);
325 }
326 }
327
328 if ($self->{'allfields_index'}) {
329
330 my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
331 # filter the text
332 $new_text = $self->filter_text ("allfields", $new_text);
333
334 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
335 $self->{'num_processed_bytes'} += length ($new_text);
336 $text .= "$new_text";
337 }
338 else {
339 # delete
340 $self->{'num_processed_bytes'} -= length ($new_text);
341 }
342 }
343 # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
344 if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
345 # add sort fields if there are any
346
347 foreach my $sfield (@{$self->{'sortfields'}}) {
348 my $sf_shortname;
349 if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
350 $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
351 }
352 else {
353 $sf_shortname = $self->create_sortfield_shortname($sfield);
354 $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
355 $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
356 }
357 my @metadata_list = (); # put any metadata values in here
358 foreach my $submeta (split /,/, $sfield) {
359 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
360
361 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
362 push (@metadata_list, @section_metadata);
363 }
364 my $new_text = "";
365 foreach my $item (@metadata_list) {
366 &ghtml::htmlsafe($item);
367 $new_text .= "$item ";
368 }
369 if ($new_text =~ /\S/) {
370 $new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
371 # filter the text???
# NOTE(review): unlike the index fields above, sort field text is appended
# whatever the edit mode and is not counted in num_processed_bytes
372 $text .= "$new_text"; # add it to the main text block
373 $self->{'actualsortfields'}->{$sfield} = 1;
374 }
375 }
376 }
377 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
378
379 $section = $doc_obj->get_next_section($section);
380 } # for each section
381
382 #open (TEXTOUT, ">text.out");
383 #print TEXTOUT "$text\n$documentendtag";
384 #close TEXTOUT;
385
386 print $lucenehandle "$text\n$documentendtag";
387
388## if ($edit_mode eq "delete") {
389## print STDERR "$text\n$documentendtag";
390## }
391
392}
393
# Output a document for indexing: delegates to textedit in "add" mode.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
400
# Re-index a document: delegates to textedit in "update" mode.
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
408
# Remove a document from the index: delegates to textedit in "delete" mode.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "delete");
}
416
417
418
419
420
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  When get_mode() returns "incinfodb", the document is decorated with
#  *  extra metadata (doctype, hastxt, archivedir, thistype, childtype,
#  *  contains, docnum) and handed to IncrementalBuildUtils::addDocument.
#  *  In every other mode the call is passed straight to
#  *  mgppbuildproc::process.
#  *
#  *  @param  $self     A reference to this Lucene builder
#  *  @param  $doc_obj  A reference to a document object representing what was
#  *                    parsed by the GAPlug
#  *  @param  $file     The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
# No prototype is declared: this sub is called as a method with arguments,
# and prototypes are ignored by method calls.
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        # debugging output is only wanted at verbosity above 3
        my $verbose = ($self->{'verbosity'} > 3);

        print STDERR "*** Processing a document added using INCINFODB ***\n" if ($verbose);

        # the archive directory is everything before the last path separator,
        # normalised to forward slashes with no leading or trailing slash
        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = "" unless defined $archivedir;
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files
        print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($verbose);

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n" if ($verbose);
        # NOTE(review): $doc_OID is never read below; the call is kept only in
        # case get_OID has side effects - confirm and remove.
        my $doc_OID = $doc_obj->get_OID();
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n" if ($verbose);
            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }

            # output whether this node contains text
            if ($doc_obj->get_text_length($section) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
            }
            else
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
            }

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # each entry is written as a double quote followed by
                    # ".N", where N is the final numeric component of the
                    # child id (or the whole id if it has none)
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }

            # output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($verbose);
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only add the docs
            last if ($self->{'db_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($verbose);
        &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
# /** process() **/
535
536
537# Following methods seem to be no different to those defined in basebuildproc.pm
538# From inspection, it looks like these ones can be removed
539
540
# Accessor: returns the current value of $self->{'num_docs'}.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
546
# Accessor: returns the current value of $self->{'num_sections'}.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
552
# Accessor: returns the current value of $self->{'num_bytes'}.
# num_bytes is the actual number of bytes in the collection;
# this is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
560
561
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
#   "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# Parameters:
#   $text       - the raw (HTML) text of a section
#   $strip_html - must be true; when false nothing is returned (see below)
#   $para       - "" to replace every tag with a space, otherwise the value
#                 passed on to process_tags / remove_gtlt
#
# Returns the cleaned text, or nothing (undef / empty list) when
# $strip_html is false.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    # NOTE(review): entities are deleted without leaving a space, so
    # "a&nbsp;b" becomes "ab" - confirm whether that is intended.
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
    # A lookahead is used so the character after '&' is not consumed: this
    # way consecutive ampersands ("&&") and an ampersand at the very end of
    # the text are replaced too, instead of being left behind as invalid XML.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
597
# Delete the files associated with a document.
#
# The work is first delegated to basebuildproc::delete_assoc_files with the
# same arguments. When $edit_mode is "delete", the directory
# <build_dir>/text/<archivedir> (the Lucene text version of the document)
# is then removed too, if it exists.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    $self->basebuildproc::delete_assoc_files(@_);

    if ($edit_mode eq "delete") {
        # if we are deleting the doc, then also delete the lucene text version
        my $lucene_text_dir = &util::filename_cat($self->{'build_dir'}, "text", $archivedir);
        &util::rm_r($lucene_text_dir) if (-d $lucene_text_dir);
    }
}
613
# Return the shortname for a sort field: "by" followed by the index
# shortname of $realname. If an index shortname has already been recorded
# for this field in $self->{'fieldnamemap'} it is reused; otherwise one is
# obtained from create_shortname.
sub create_sortfield_shortname {
    my ($self, $realname) = @_;

    my $known_shortname = $self->{'fieldnamemap'}->{$realname};
    my $index_shortname = defined $known_shortname
        ? $known_shortname
        : $self->create_shortname($realname);

    return "by" . $index_shortname;
}
628
629
6301;
631
632
Note: See TracBrowser for help on using the repository browser.