source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 28355

Last change on this file since 28355 was 28035, checked in by kjdon, 11 years ago

handle sort field none as well as rank

  • Property svn:keywords set to Author Date Id Revision
File size: 21.3 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Establish inheritance at compile time: lucenebuildproc is-a mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.  Builds an mgppbuildproc object from the supplied arguments,
# adds the Lucene-specific state and reblesses it into $class.
#
# Fields initialised here:
#   numincdocs             - counter used by process() for incremental docnums
#   specified_fields       - hash ref: fields actually named in the index spec
#   allfields_index        - 1 if an "allfields" index was requested
#   all_metadata_specified - 1 if a "metadata" index was requested
#   actualsortfields       - hash ref: sort fields that have actually been used
#   sortfieldnamemap       - hash ref: field name <-> sort shortname,
#                            eg dc.Title->byTI
sub new {
    my $class = shift @_;

    # direct method call rather than indirect-object syntax
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'}             = 0;
    # was "= ()", which assigns undef; the field is used as a hash ref
    $self->{'specified_fields'}       = {};
    $self->{'allfields_index'}        = 0;
    $self->{'all_metadata_specified'} = 0;
    $self->{'actualsortfields'}       = {};
    $self->{'sortfieldnamemap'}       = {};

    return bless $self, $class;
}
58
# Record the index specification and note which special/named fields it
# mentions.
#
#   $index       - index spec; the part before the first ':' is a
#                  ';'-separated list of fields
#   $indexexparr - passed straight through to mgppbuildproc::set_index
#
# "allfields" and "metadata" set their respective flags; every other field
# is stored in specified_fields with any leading "top" removed.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->mgppbuildproc::set_index($index, $indexexparr);

    # drop any subcollection/language part that follows the first ':'
    my ($field_spec) = split(/:/, $self->{'index'});

    for my $name (split(/;/, $field_spec)) {
        if ($name eq "allfields") {
            $self->{'allfields_index'} = 1;
            next;
        }
        if ($name eq "metadata") {
            $self->{'all_metadata_specified'} = 1;
            next;
        }

        (my $bare_name = $name) =~ s/^top//;
        $self->{'specified_fields'}->{$bare_name} = 1;
    }
}
79
# Store the setting that textedit() consults when deciding whether a
# non-top section should also pick up the top section's metadata for its
# sort fields ("always" / "unless_section_metadata_exists").
sub set_sections_sort_on_document_metadata {
    my ($self, $index_type) = @_;

    return $self->{'sections_sort_on_document_metadata'} = $index_type;
}
86
# Store the list of sort fields.
#
#   $sortfields - array ref of sort field names (undef is treated as empty)
#
# "text", "allfields" and "metadata" are only valid for indexes, not for
# sort fields, so they are dropped.  $self->{'sortfields'} is always left as
# an array ref, even when nothing survives the filter (previously it was
# initialised with "= ()", i.e. undef, and stayed undef in that case).
sub set_sortfields {
    my $self = shift (@_);
    my ($sortfields) = @_;

    $self->{'sortfields'} = [
        grep { $_ !~ /^(text|allfields|metadata)$/ } @{ $sortfields || [] }
    ];
}
99
# Capability query: always returns 1, because Lucene (unlike MG and MGPP)
# supports incremental building.
sub is_incremental_capable
{
    my ($self) = @_;

    return 1;
}
107
108
# Write one document, section by section, to the Lucene input stream as XML.
#
# Parameters:
#   $doc_obj   - document object; only documents whose get_doc_type() is
#                "indexed_doc" are output
#   $file      - filename, copied into the file="..." attribute of the
#                document tag
#   $edit_mode - "add", "update" or "delete"; copied into the gs2:mode
#                attributes.  Any value other than "add"/"update" is
#                accounted for as a removal (statistics are decremented and
#                index text is not emitted).
#
# Updates on $self: num_docs, num_sections, num_bytes, num_processed_bytes,
# fieldnamemap, sortfieldnamemap, allindexfields, extraindexfields and
# actualsortfields.  The finished XML is printed to $self->{'output_handle'}.
#
# Returns early (nothing printed) when the document is not an "indexed_doc",
# or when text is not being indexed and $edit_mode is "delete".
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    # 0/1 to indicate whether this doc is part of the specified subcollection
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # "add" and "update" grow the statistics; anything else shrinks them
    my $adding = (($edit_mode eq "add") || ($edit_mode eq "update"));

    # this is another document
    $self->{'num_docs'} += ($adding ? 1 : -1);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    # the field list does not change while we walk the sections
    my @index_fields = split (/;/, $fields);

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    # section tags are only written when a section level has been requested
    my $sec_tag_name = $lsec_level ? $mgppbuildproc::level_map{'section'} : "";

    my $text = $documenttag;

    # Account for a chunk of index text: when adding/updating it is counted
    # and appended to the output, otherwise it is only subtracted from the
    # processed-bytes statistic.
    my $emit_index_text = sub {
        my ($chunk) = @_;
        if ($adding) {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= "$chunk";
        }
        else {
            # delete
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    my $doc_section = 0; # just for this document

    # get the text for this document
    my $top_section = $doc_obj->get_top_section();
    my $section = $top_section;
    while (defined $section)
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n";
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        if ($sec_tag_name ne "") {
            $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        if ($adding) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # collect up all the text for allfields index in here (if there is one)
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma
            # separated list.  Look up (or create and remember) its shortname.
            my $shortname = "";
            if (defined $self->{'fieldnamemap'}->{$real_field}) {
                $shortname = $self->{'fieldnamemap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $self->{'fieldnamemap'}->{$real_field} = $shortname;
                $self->{'fieldnamemap'}->{$shortname} = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    # strip off ex. iff it's the only metadata set prefix
                    # (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $top_section && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($self->{'allfields_index'}) {
                    $allfields_text .= $new_text;
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
                    $self->{'allindexfields'}->{$real_field} = 1;
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $emit_index_text->($new_text);
            }

        } # foreach field

        if ($self->{'all_metadata_specified'}) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($self->{'specified_fields'}->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'fieldnamemap'}->{$mfield}) {
                    $shortname = $self->{'fieldnamemap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'fieldnamemap'}->{$mfield} = $shortname;
                    $self->{'fieldnamemap'}->{$shortname} = 1;
                }
                $self->{'allindexfields'}->{$mfield} = 1;
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($self->{'allfields_index'}) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'extraindexfields'}->{$mfield}) {
                    $self->{'extraindexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit_index_text->($new_text);
        }

        if ($self->{'allfields_index'}) {

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit_index_text->($new_text);
        }

        # only add sort fields for this section if we are indexing this
        # section, we are doing section level indexing or this is the top
        # section
        if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
            # add sort fields if there are any
            foreach my $sfield (@{$self->{'sortfields'}}) {
                # ignore special fields rank and none
                next if ($sfield eq "rank" || $sfield eq "none");

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    # strip off ex. iff it's the only metadata set prefix
                    # (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $top_section && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
                    # filter the text???
                    # NOTE(review): unlike the index text above, sort field
                    # text is appended whatever $edit_mode is and is not
                    # counted in num_processed_bytes - confirm intended.
                    $text .= "$new_text"; # add it to the main text block
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }
        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # for each section

    print {$lucenehandle} "$text\n$documentendtag";
}
414
# Output a document for indexing: delegates to textedit() in "add" mode.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
421
# Re-index a document: delegates to textedit() in "update" mode.
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
429
# Remove a document from the index: delegates to textedit() in "delete" mode.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "delete");
}
437
438
439
440
441
442# /** We make this builder pretend to be a document processor so we can get
443# * information back from the plugins.
444# *
445# * @param $self A reference to this Lucene builder
446# * @param $doc_obj A reference to a document object representing what was
447# * parsed by the GAPlug
448# * @param $file The name of the file parsed as a string
449# *
450# * @author John Thompson, DL Consulting Ltd
451# */
# Process a document handed over by the plugins.
#
#   $doc_obj - document object
#   $file    - name of the file that was parsed
#
# In any mode other than "incinfodb" the call is passed straight through to
# mgppbuildproc::process.  In "incinfodb" mode each section is given
# doctype, hastxt, docnum and (where applicable) archivedir, thistype,
# childtype and contains metadata, and the document is then handed to
# IncrementalBuildUtils::addDocument.
#
# The former empty prototype "()" has been dropped: the sub takes arguments,
# and prototypes are ignored for method calls anyway.
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we
    # want to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb")
    {
        return $self->mgppbuildproc::process(@_);
    }

    # debug output is only produced at verbosity levels above 3
    my $verbose = ($self->{'verbosity'} > 3);

    print STDERR "*** Processing a document added using INCINFODB ***\n" if ($verbose);

    # directory part of $file, with forward slashes and no leading or
    # trailing slash ("" when $file has no directory part)
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($verbose);

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
    {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if ($verbose);

    while (defined $section)
    {
        print STDERR "+ processing section: '$section'\n" if ($verbose);

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
        {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $has_text = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $has_text);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
        {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);

            # each child is recorded as ".N" prefixed with a double quote,
            # using the trailing number when the child id ends in ".<digits>"
            my @contains = ();
            foreach my $child (@$children)
            {
                if ($child =~ /^.*?\.(\d+)$/)
                {
                    push (@contains, "\".$1");
                }
                else
                {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($verbose);
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);

        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($verbose);
    &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
}
555# /** process() **/
556
557
558# Following methods seem to be no different to those defined in basebuildproc.pm
559# From inspection, it looks like these ones can be removed
560
561
# Accessor: current value of the num_docs counter.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
567
# Accessor: current value of the num_sections counter.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
573
574# num_bytes is the actual number of bytes in the collection
575# this is normally the same as what's processed during text compression
# Accessor: current value of the num_bytes counter (the actual number of
# bytes in the collection).
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
581
582
583# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
584# Otherwise the removal of tags below might lead to Lucene turning
585# "...farming</p>\n<p>EDWARD.." into "farmingedward"
586# (example from demo collection b20cre)
587# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
# Strip HTML from $text so that it can be embedded in the XML sent to Lucene.
#
#   $text       - text to clean
#   $strip_html - must be true; when false nothing is returned (we do not do
#                 paragraph tags unless stripping html - it would result in
#                 a huge mess of non-xml)
#   $para       - "" to replace every tag with a space; otherwise tags are
#                 handed to process_tags() together with $para
#
# Tags are replaced by spaces (not by nothing) so that words on either side
# of a tag are not run together.  Named entities are removed and every '&'
# that does not start a numeric entity is replaced by a space.
#
# Returns the cleaned text.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text
    # passed to Lucene for indexing may not be valid XML (eg. if HTML-only
    # entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities
    # (which are valid XML).  A look-ahead is used so that a '&' at the very
    # end of the text and the second of two consecutive '&' are caught too;
    # the old pattern needed (and consumed) a following character.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
618
# Delete a document's associated files.
#
#   $archivedir - the document's archive directory
#   $edit_mode  - when "delete", the Lucene text version of the document
#                 (<build_dir>/text/<archivedir>) is removed as well
#
# The call is first forwarded, with the same arguments, to
# basebuildproc::delete_assoc_files.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    $self->basebuildproc::delete_assoc_files(@_);

    if ($edit_mode eq "delete") {
        # the doc is going away, so its lucene text version goes too
        my $lucene_text_dir = &util::filename_cat($self->{'build_dir'}, "text", $archivedir);
        &util::rm_r($lucene_text_dir) if (-d $lucene_text_dir);
    }
}
634
# Build the shortname used for a sort field: "by" followed by the index
# shortname of $realname.  If an index shortname has already been recorded
# for this field in fieldnamemap it is reused; otherwise one is obtained
# from create_shortname().
sub create_sortfield_shortname {
    my ($self, $realname) = @_;

    my $existing = $self->{'fieldnamemap'}->{$realname};
    my $index_shortname = defined $existing
        ? $existing
        : $self->create_shortname($realname);

    return "by" . $index_shortname;
}
649
650
6511;
652
653
Note: See TracBrowser for help on using the repository browser.