source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 27565

Last change on this file since 27565 was 27565, checked in by kjdon, 11 years ago

ignore special keywords which should be only in indexes list, and ignore sort special keyword 'rank'

  • Property svn:keywords set to Author Date Id Revision
File size: 21.3 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Set up inheritance at compile time: this build processor specialises
# mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor: builds the underlying mgppbuildproc object with the caller's
# arguments, adds the Lucene-specific bookkeeping fields and reblesses the
# result into the requested class.
sub new {
    my $class = shift @_;
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;
    # map of the fields actually specified in the index (field name => 1).
    # This must be a hash ref: assigning an empty list here would store undef.
    $self->{'specified_fields'} = {};
    $self->{'allfields_index'} = 0;        # do we need allfields index?
    $self->{'all_metadata_specified'} = 0; # are we indexing all metadata?
    $self->{'actualsortfields'} = {};      # sort fields that have actually been used
    $self->{'sortfieldnamemap'} = {};      # mapping between field name and field shortname, eg dc.Title->byTI
    return bless $self, $class;
}
58
# Record the index specification (delegating to mgppbuildproc::set_index)
# and note which kinds of field it mentions: the special "allfields" and
# "metadata" keywords set flags, every other field name (with any leading
# "top" removed) is remembered in the specified_fields map.
sub set_index {
    my $self = shift (@_);
    my ($index, $indexexparr) = @_;

    $self->mgppbuildproc::set_index($index, $indexexparr);

    # keep only the part before the first ':' - what follows is
    # subcollection information that is of no interest here
    my ($field_spec) = split (/:/, $self->{'index'});

    foreach my $name (split (/;/, $field_spec)) {
        if ($name eq "allfields") {
            $self->{'allfields_index'} = 1;
            next;
        }
        if ($name eq "metadata") {
            $self->{'all_metadata_specified'} = 1;
            next;
        }
        $name =~ s/^top//;
        $self->{'specified_fields'}->{$name} = 1;
    }
}
79
# Store the setting that textedit() consults to decide whether a
# sub-section's sort fields also receive the top section's metadata
# (textedit checks for the values "always" and
# "unless_section_metadata_exists").
sub set_sections_sort_on_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_sort_on_document_metadata'} = $index_type;
}
86
# Remember the list of sort fields for this build.
#
#   $sortfields - reference to an array of sort field names (undef is
#                 treated as an empty list)
#
# The keywords text, allfields and metadata are only valid for indexes, not
# for sort fields, so they are dropped. After this call
# $self->{'sortfields'} is always an array reference, possibly empty.
sub set_sortfields {
    my $self = shift (@_);
    my ($sortfields) = @_;

    # start from a real (empty) array ref; assigning an empty list would
    # leave the slot undefined when nothing survives the filter below
    $self->{'sortfields'} = [];

    foreach my $s (@{ $sortfields || [] }) {
        next if ($s =~ /^(?:text|allfields|metadata)$/);
        push (@{$self->{'sortfields'}}, $s);
    }
}
99
# Report whether this build processor supports incremental building.
# Always true here: unlike MG and MGPP, Lucene can be built incrementally.
sub is_incremental_capable
{
    return 1;
}
107
108
# Emit the XML that describes one document (and each of its sections) to the
# Lucene output handle, updating the running build statistics as it goes.
#
#   $doc_obj   - the document object to emit
#   $file      - file name, written into the document tag's file attribute
#   $edit_mode - "add", "update" or "delete"
#
# For "add" and "update" the counters (num_docs, num_bytes,
# num_processed_bytes) grow and the field text is written out. For any other
# mode (i.e. "delete") the counters shrink and only the document/section tags
# are written. Nothing at all is written for documents whose doc type is not
# "indexed_doc", or for deletions when text is not being indexed.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    # 0/1 to indicate whether this doc is part of the specified subcollection
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # true when content is being added, false when it is being removed
    my $adding = (($edit_mode eq "add") || ($edit_mode eq "update"));

    # this is another document
    if ($adding) {
        $self->{'num_docs'} += 1;
    }
    else {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    # the field list does not change from section to section, so split it once
    my @index_fields = split (/;/, $fields);

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    # The file name ends up inside an XML attribute, so escape it the same way
    # the element content below is escaped (relies on ghtml::htmlsafe escaping
    # the XML-special characters in place). Work on a copy so the caller's
    # variable is left untouched.
    my $file_attr = defined $file ? $file : "";
    &ghtml::htmlsafe($file_attr);

    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file_attr\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $sec_tag_name = "";
    if ($lsec_level)
    {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # Account for one chunk of processed field text: when adding, count its
    # bytes and append it to the output; when deleting, only uncount them.
    my $record_processed = sub {
        my ($chunk) = @_;
        if ($adding) {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= "$chunk";
        }
        else {
            # delete
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section)
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n";
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        if ($sec_tag_name ne "")
        {
            $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        if ($adding) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # collect up all the text for allfields index in here (if there is one)
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            my $shortname="";
            if (defined $self->{'fieldnamemap'}->{$real_field}) {
                $shortname = $self->{'fieldnamemap'}->{$real_field};
            } else {
                $shortname = $self->create_shortname($real_field);
                $self->{'fieldnamemap'}->{$real_field} = $shortname;
                $self->{'fieldnamemap'}->{$shortname} = 1;
            }
            my @metadata_list = (); # put any metadata values in here
            my $section_text = ""; # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($self->{'allfields_index'}) {
                    $allfields_text .= $new_text;
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
                    $self->{'allindexfields'}->{$real_field} = 1;
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $record_processed->($new_text);
            }

        } # foreach field

        if ($self->{'all_metadata_specified'}) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($self->{'specified_fields'}->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'fieldnamemap'}->{$mfield}) {
                    $shortname = $self->{'fieldnamemap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'fieldnamemap'}->{$mfield} = $shortname;
                    $self->{'fieldnamemap'}->{$shortname} = 1;
                }
                $self->{'allindexfields'}->{$mfield} = 1;
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($self->{'allfields_index'}) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'extraindexfields'}->{$mfield}) {
                    $self->{'extraindexfields'}->{$mfield} = 1;
                }

            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $record_processed->($new_text);
        }

        if ($self->{'allfields_index'}) {

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $record_processed->($new_text);
        }

        # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
        if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
            # add sort fields if there are any

            foreach my $sfield (@{$self->{'sortfields'}}) {
                # ignore special field rank
                next if $sfield eq "rank";
                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }
                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
                # sort values are joined with no separator between them
                my $new_text = "";
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
                    # filter the text???
                    $text .= "$new_text"; # add it to the main text block
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }
        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # for each section

    print $lucenehandle "$text\n$documentendtag";
}
414
# Emit a document in "add" mode; thin wrapper around textedit().
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
421
# Emit a document in "update" mode; thin wrapper around textedit().
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
429
# Emit a document in "delete" mode; thin wrapper around textedit().
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "delete");
}
437
438
439
440
441
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In every mode other than "incinfodb" the call is passed straight through
#  *  to mgppbuildproc::process. In "incinfodb" mode the document's sections
#  *  are annotated with the bookkeeping metadata (doctype, hastxt, archivedir,
#  *  thistype, childtype, contains, docnum) and the document is then handed to
#  *  IncrementalBuildUtils::addDocument.
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the GAPlug
#  *  @param  $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb")
    {
        return $self->mgppbuildproc::process(@_);
    }

    my $verbose = ($self->{'verbosity'} > 3);

    print STDERR "*** Processing a document added using INCINFODB ***\n" if ($verbose);

    # the archive directory is everything before the last path separator,
    # normalised to forward slashes with no leading or trailing slash
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($verbose);

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
    {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if ($verbose);
    while (defined $section)
    {
        print STDERR "+ processing section: '$section'\n" if ($verbose);
        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
        {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        if ($doc_obj->get_text_length($section) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 1);
        }
        else
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 0);
        }

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
        {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children)
            {
                # each entry is a double quote followed by ".<child>"; when the
                # child id ends in ".<digits>" only those digits are kept
                if ($child =~ /^.*?\.(\d+)$/)
                {
                    push (@contains, "\".$1");
                }
                else
                {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        #output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($verbose);
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($verbose);
    return &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
556
557
558# Following methods seem to be no different to those defined in basebuildproc.pm
559# From inspection, it looks like these ones can be removed
560
561
# Accessor: the current value of the num_docs counter.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
567
# Accessor: the current value of the num_sections counter.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
573
# Accessor for num_bytes, the actual number of bytes in the collection.
# This is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
581
582
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
# "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
#   $text       - the raw section text
#   $strip_html - when false nothing is done and nothing is returned
#   $para       - replacement for <p> tags; "" (or undef) means simply
#                 replace every tag with a space
#
# Returns the text with tags removed, named entities removed and every '&'
# that does not start a well-formed numeric character reference replaced by
# a space, so that the result can be embedded in XML.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    $para = "" unless defined $para;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A lookahead is used so that no following character is consumed: this
    # also catches "&&", an '&' at the very end of the text and malformed
    # sequences such as "&#12" with no terminating ';'.
    $new_text =~ s/&(?!\#(?:[0-9]+|x[0-9a-fA-F]+);)/ /g;

    return $new_text;
}
618
# Remove the files associated with a document. The work is delegated to
# basebuildproc::delete_assoc_files; in addition, when the document itself is
# being deleted ($edit_mode is "delete"), the Lucene text version stored
# under <build_dir>/text/<archivedir> is removed as well.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    $self->basebuildproc::delete_assoc_files(@_);

    if ($edit_mode eq "delete") {
        # the document is going away, so its lucene text directory goes too
        my $lucene_text_dir = &util::filename_cat($self->{'build_dir'}, "text", $archivedir);
        &util::rm_r($lucene_text_dir) if (-d $lucene_text_dir);
    }
}
634
# Build the short name used for a sort field: "by" followed by the index
# short name of the same field. A short name already recorded for an index
# on this field is reused; otherwise one is obtained from create_shortname.
sub create_sortfield_shortname {
    my ($self, $realname) = @_;

    my $known_shortname = $self->{'fieldnamemap'}->{$realname};
    my $index_shortname = defined $known_shortname
        ? $known_shortname
        : $self->create_shortname($realname);

    return "by" . $index_shortname;
}
649
650
6511;
652
653
Note: See TracBrowser for help on using the repository browser.