source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 32524

Last change on this file since 32524 was 28566, checked in by kjdon, 10 years ago

changed some util:: to FIleUtils:: methods to avoid warnings

  • Property svn:keywords set to Author Date Id Revision
File size: 21.4 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40use FileUtils;
41
42sub BEGIN {
43 @lucenebuildproc::ISA = ('mgppbuildproc');
44}
45
46
47sub new {
48 my $class = shift @_;
49 my $self = new mgppbuildproc (@_);
50
51 $self->{'numincdocs'} = 0;
52 $self->{'specified_fields'} = (); # list of fields actually specified in the index, in a map
53 $self->{'allfields_index'} = 0; # do we need allfields index?
54 $self->{'all_metadata_specified'} = 0; # are we indexing all metadata?
55 $self->{'actualsortfields'} = {}; # sort fields that have actually been used
56 $self->{'sortfieldnamemap'} = {}; # mapping between field name and field shortname, eg dc.Title->byTI
57 return bless $self, $class;
58}
59
60sub set_index {
61 my $self = shift (@_);
62 my ($index, $indexexparr) = @_;
63
64 $self->mgppbuildproc::set_index($index, $indexexparr);
65
66 # just get the list of index fields without any subcoll stuff
67 my ($fields) = split (/:/, $self->{'index'});
68
69 foreach my $field (split (/;/, $fields)) {
70 if ($field eq "allfields") {
71 $self->{'allfields_index'} = 1;
72 } elsif ($field eq "metadata") {
73 $self->{'all_metadata_specified'} = 1;
74 } else {
75 $field =~ s/^top//;
76 $self->{'specified_fields'} ->{$field} = 1;
77 }
78 }
79}
80
81sub set_sections_sort_on_document_metadata {
82 my $self= shift (@_);
83 my ($index_type) = @_;
84
85 $self->{'sections_sort_on_document_metadata'} = $index_type;
86}
87
88sub set_sortfields {
89 my $self = shift (@_);
90
91 my ($sortfields) = @_;
92 $self->{'sortfields'} = ();
93 # lets just go through and check for text, allfields, metadata which are only valid for indexes, not for sortfields
94 foreach my $s (@$sortfields) {
95 if ($s !~ /^(text|allfields|metadata)$/) {
96 push (@{$self->{'sortfields'}}, $s);
97 }
98 }
99}
100
101sub is_incremental_capable
102{
103 my $self = shift (@_);
104
105 # Unlike MG and MGPP, Lucene supports incremental building
106 return 1;
107}
108
109
110sub textedit {
111 my $self = shift (@_);
112 my ($doc_obj,$file,$edit_mode) = @_;
113
114 my $lucenehandle = $self->{'output_handle'};
115 my $outhandle = $self->{'outhandle'};
116
117 # only output this document if it is one to be indexed
118 return if ($doc_obj->get_doc_type() ne "indexed_doc");
119
120 # skip this document if in "compress-text" mode and asked to delete it
121 return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
122
123 # 0/1 to indicate whether this doc is part of the specified subcollection
124 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
125
126 # this is another document
127 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
128 $self->{'num_docs'} += 1;
129 }
130 else {
131 $self->{'num_docs'} -= 1;
132 }
133
134
135 # get the parameters for the output
136 # split on : just in case there is subcoll and lang stuff
137 my ($fields) = split (/:/, $self->{'index'});
138
139 my $doc_tag_name = $mgppbuildproc::level_map{'document'};
140
141 my $levels = $self->{'levels'};
142 my $ldoc_level = $levels->{'document'};
143 my $lsec_level = $levels->{'section'};
144
145 my $gs2_docOID = $doc_obj->get_OID();
146 my $documenttag = undef;
147 my $documentendtag = undef;
148
149 $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
150 $documentendtag = "\n</$doc_tag_name>\n";
151
152 my $sec_tag_name = "";
153 if ($lsec_level)
154 {
155 $sec_tag_name = $mgppbuildproc::level_map{'section'};
156 }
157
158 my $doc_section = 0; # just for this document
159
160 my $text = "";
161 $text .= $documenttag;
162 # get the text for this document
163 my $section = $doc_obj->get_top_section();
164 while (defined $section)
165 {
166 # update a few statistics
167 $doc_section++;
168 $self->{'num_sections'}++;
169
170 my $sec_gs2_id = $self->{'num_sections'};
171 my $sec_gs2_docOID = $gs2_docOID;
172 $sec_gs2_docOID .= ".$section" if ($section ne "");
173
174 # if we are doing subcollections, then some docs shouldn't be indexed.
175 # but we need to put the section tag placeholders in there so the
176 # sections match up with database
177 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
178 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
179 if ($sec_tag_name ne "") {
180 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
181 $text .= "\n</$sec_tag_name>\n"
182 }
183 $section = $doc_obj->get_next_section($section);
184 next;
185 }
186
187 if ($sec_tag_name ne "")
188 {
189 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
190 }
191
192 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
193 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
194 }
195 else {
196 # delete
197 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
198 }
199
200
201 # collect up all the text for allfields index in here (if there is one)
202 my $allfields_text = "";
203
204 foreach my $field (split (/;/, $fields)) {
205
206 # only deal with this field if it doesn't start with top or
207 # this is the first section
208 my $real_field = $field;
209 next if (($real_field =~ s/^top//) && ($doc_section != 1));
210
211 # process these two later
212 next if ($real_field eq "allfields" || $real_field eq "metadata");
213
214 #individual metadata and or text specified - could be a comma separated list
215 #$specified_fields->{$real_field} = 1;
216 my $shortname="";
217 my $new_field = 0; # have we found a new field name?
218 if (defined $self->{'fieldnamemap'}->{$real_field}) {
219 $shortname = $self->{'fieldnamemap'}->{$real_field};
220 } else {
221 $shortname = $self->create_shortname($real_field);
222 $self->{'fieldnamemap'}->{$real_field} = $shortname;
223 $self->{'fieldnamemap'}->{$shortname} = 1;
224 }
225 my @metadata_list = (); # put any metadata values in here
226 my $section_text = ""; # put the text in here
227 foreach my $submeta (split /,/, $real_field) {
228 if ($submeta eq "text") {
229 # no point in indexing text more than once
230 if ($section_text eq "") {
231 $section_text = $doc_obj->get_text($section);
232 if ($self->{'indexing_text'}) {
233 # we always strip html
234 $section_text = $self->preprocess_text($section_text, 1, "");
235 }
236 else {
237 # leave html stuff in, but escape the tags
238 &ghtml::htmlsafe($section_text);
239 }
240 }
241 }
242 else {
243 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
244
245 # its a metadata element
246 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
247 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
248 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
249 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
250 }
251 }
252 push (@metadata_list, @section_metadata);
253 }
254 } # for each field in this one index
255
256
257 # now we add the text and/or metadata into new_text
258 if ($section_text ne "" || scalar(@metadata_list)) {
259 my $new_text = "";
260
261 if ($section_text ne "") {
262 $new_text .= "$section_text ";
263 }
264
265 foreach my $item (@metadata_list) {
266 &ghtml::htmlsafe($item);
267 $new_text .= "$item ";
268 }
269
270 if ($self->{'allfields_index'}) {
271 $allfields_text .= $new_text;
272 }
273
274 if ($self->{'indexing_text'}) {
275 # add the tag
276 $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
277 $self->{'allindexfields'}->{$real_field} = 1;
278 }
279 # filter the text
280 $new_text = $self->filter_text ($field, $new_text);
281
282 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
283 $self->{'num_processed_bytes'} += length ($new_text);
284 $text .= "$new_text";
285 }
286 else {
287 # delete
288 $self->{'num_processed_bytes'} -= length ($new_text);
289 }
290 }
291
292 } # foreach field
293
294 if ($self->{'all_metadata_specified'}) {
295
296 my $new_text = "";
297 my $shortname = "";
298 my $metadata = $doc_obj->get_all_metadata ($section);
299 foreach my $pair (@$metadata) {
300 my ($mfield, $mvalue) = (@$pair);
301 # no value
302 next unless defined $mvalue && $mvalue ne "";
303 # we have already indexed this
304 next if defined ($self->{'specified_fields'}->{$mfield});
305 # check fields here, maybe others dont want - change to use dontindex!!
306 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
307 next if ($mfield =~ /^gsdl/);
308
309 &ghtml::htmlsafe($mvalue);
310
311 if (defined $self->{'fieldnamemap'}->{$mfield}) {
312 $shortname = $self->{'fieldnamemap'}->{$mfield};
313 }
314 else {
315 $shortname = $self->create_shortname($mfield);
316 $self->{'fieldnamemap'}->{$mfield} = $shortname;
317 $self->{'fieldnamemap'}->{$shortname} = 1;
318 }
319 $self->{'allindexfields'}->{$mfield} = 1;
320 $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
321 if ($self->{'allfields_index'}) {
322 $allfields_text .= "$mvalue ";
323 }
324
325 if (!defined $self->{'extraindexfields'}->{$mfield}) {
326 $self->{'extraindexfields'}->{$mfield} = 1;
327 }
328
329 }
330 # filter the text
331 $new_text = $self->filter_text ("metadata", $new_text);
332
333 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
334 $self->{'num_processed_bytes'} += length ($new_text);
335 $text .= "$new_text";
336 }
337 else {
338 # delete
339 $self->{'num_processed_bytes'} -= length ($new_text);
340 }
341 }
342
343 if ($self->{'allfields_index'}) {
344
345 my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
346 # filter the text
347 $new_text = $self->filter_text ("allfields", $new_text);
348
349 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
350 $self->{'num_processed_bytes'} += length ($new_text);
351 $text .= "$new_text";
352 }
353 else {
354 # delete
355 $self->{'num_processed_bytes'} -= length ($new_text);
356 }
357 }
358 # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
359 if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
360 # add sort fields if there are any
361
362 foreach my $sfield (@{$self->{'sortfields'}}) {
363 # ignore special field rank
364 next if ($sfield eq "rank" || $sfield eq "none");
365 my $sf_shortname;
366 if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
367 $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
368 }
369 else {
370 $sf_shortname = $self->create_sortfield_shortname($sfield);
371 $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
372 $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
373 }
374 my @metadata_list = (); # put any metadata values in here
375 foreach my $submeta (split /,/, $sfield) {
376 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
377
378 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
379 if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
380 if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
381 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
382 }
383 }
384 push (@metadata_list, @section_metadata);
385 }
386 my $new_text = "";
387 foreach my $item (@metadata_list) {
388 &ghtml::htmlsafe($item);
389 $new_text .= "$item";
390 }
391 if ($new_text =~ /\S/) {
392 $new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
393 # filter the text???
394 $text .= "$new_text"; # add it to the main text block
395 $self->{'actualsortfields'}->{$sfield} = 1;
396 }
397 }
398 }
399 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
400
401 $section = $doc_obj->get_next_section($section);
402 } # for each section
403
404 #open (TEXTOUT, ">text.out");
405 #print TEXTOUT "$text\n$documentendtag";
406 #close TEXTOUT;
407
408 print $lucenehandle "$text\n$documentendtag";
409
410## if ($edit_mode eq "delete") {
411## print STDERR "$text\n$documentendtag";
412## }
413
414}
415
416sub text {
417 my $self = shift (@_);
418 my ($doc_obj,$file) = @_;
419
420 $self->textedit($doc_obj,$file,"add");
421}
422
423sub textreindex
424{
425 my $self = shift (@_);
426 my ($doc_obj,$file) = @_;
427
428 $self->textedit($doc_obj,$file,"update");
429}
430
431sub textdelete
432{
433 my $self = shift (@_);
434 my ($doc_obj,$file) = @_;
435
436 $self->textedit($doc_obj,$file,"delete");
437}
438
439
440
441
442
443# /** We make this builder pretend to be a document processor so we can get
444# * information back from the plugins.
445# *
446# * @param $self A reference to this Lucene builder
447# * @param $doc_obj A reference to a document object representing what was
448# * parsed by the GAPlug
449# * @param $file The name of the file parsed as a string
450# *
451# * @author John Thompson, DL Consulting Ltd
452# */
453sub process()
454 {
455 my $self = shift (@_);
456 my ($doc_obj, $file) = @_;
457
458 # If this is called from any stage other than an incremental infodb we want
459 # to pass through to the superclass of build
460 if ($self->get_mode() eq "incinfodb")
461 {
462 print STDERR "*** Processing a document added using INCINFODB ***\n" if ($self->{'verbosity'} > 3);
463 my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
464 $archivedir = "" unless defined $archivedir;
465 $archivedir =~ s/\\/\//g;
466 $archivedir =~ s/^\/+//;
467 $archivedir =~ s/\/+$//;
468
469 # Number of files
470 print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
471
472 # resolve the final filenames of the files associated with this document
473 $self->assoc_files ($doc_obj, $archivedir);
474
475 # is this a paged or a hierarchical document
476 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
477
478 # Determine the actual docnum by checking if we've processed any
479 # previous incrementally added documents. If so, carry on from there.
480 # Otherwise we set the counter to be the same as the number of
481 # sections encountered during the previous build
482 if ($self->{'numincdocs'} == 0)
483 {
484 $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
485 }
486
487 my $section = $doc_obj->get_top_section ();
488 print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
489 my $doc_OID = $doc_obj->get_OID();
490 my $url = "";
491 while (defined $section)
492 {
493 print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
494 # Attach all the other metadata to this document
495 # output the fact that this document is a document (unless doctype
496 # has been set to something else from within a plugin
497 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
498 if (!defined $dtype || $dtype !~ /\w/)
499 {
500 #$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
501 $doc_obj->add_utf8_metadata($section, "doctype", "doc");
502 }
503 # output whether this node contains text
504 if ($doc_obj->get_text_length($section) > 0)
505 {
506 $doc_obj->add_utf8_metadata($section, "hastxt", 1);
507 }
508 else
509 {
510 $doc_obj->add_utf8_metadata($section, "hastxt", 0);
511 }
512
513 # output archivedir if at top level
514 if ($section eq $doc_obj->get_top_section())
515 {
516 $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
517 $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
518 }
519
520 # output a list of children
521 my $children = $doc_obj->get_children ($section);
522 if (scalar(@$children) > 0)
523 {
524 $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
525 my @contains = ();
526 foreach my $child (@$children)
527 {
528 if ($child =~ /^.*?\.(\d+)$/)
529 {
530 push (@contains, "\".$1");
531 }
532 else
533 {
534 push (@contains, "\".$child");
535 }
536 }
537 $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
538 }
539 #output the matching doc number
540 print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
541 $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
542
543 $self->{'numincdocs'}++;
544 $section = $doc_obj->get_next_section($section);
545 # if no sections wanted, only add the docs
546 last if ($self->{'db_level'} eq "document");
547 }
548 print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
549 &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
550 }
551 else
552 {
553 $self->mgppbuildproc::process(@_);
554 }
555 }
556# /** process() **/
557
558
559# Following methods seem to be no different to those defined in basebuildproc.pm
560# From inspection, it looks like these ones can be removed
561
562
563sub get_num_docs {
564 my $self = shift (@_);
565 #rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
566 return $self->{'num_docs'};
567}
568
569sub get_num_sections {
570 my $self = shift (@_);
571 #rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
572 return $self->{'num_sections'};
573}
574
575# num_bytes is the actual number of bytes in the collection
576# this is normally the same as what's processed during text compression
577sub get_num_bytes {
578 my $self = shift (@_);
579 #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
580 return $self->{'num_bytes'};
581}
582
583
584# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
585# Otherwise the removal of tags below might lead to Lucene turning
586# "...farming</p>\n<p>EDWARD.." into "farmingedward"
587# (example from demo collection b20cre)
588# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
589sub preprocess_text
590{
591 my $self = shift (@_);
592 my ($text, $strip_html, $para) = @_;
593 # at this stage, we do not do paragraph tags unless have strip_html -
594 # it will result in a huge mess of non-xml
595 return unless $strip_html;
596
597 my $new_text = $text;
598
599 # if we have <pre> tags, we can have < > inside them, need to delete
600 # the <> before stripping tags
601 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
602
603 if ($para eq "") {
604 # just remove all tags
605 $new_text =~ s/<[^>]*>/ /gs;
606 } else {
607 # strip all tags except <p> tags which get turned into $para
608 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
609 }
610
611 # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
612 # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
613 $new_text =~ s/&\w{1,10};//g;
614 # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
615 $new_text =~ s/&([^\#])/ $1/g;
616
617 return $new_text;
618}
619
620sub delete_assoc_files
621{
622 my $self = shift (@_);
623 my ($archivedir, $edit_mode) = @_;
624
625 $self->basebuildproc::delete_assoc_files(@_);
626
627 if ($edit_mode eq "delete") {
628 # if we are deleting the doc, then also delete the lucene text version
629 my $assoc_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"text", $archivedir);
630 if (-d $assoc_dir) {
631 &FileUtils::removeFilesRecursive($assoc_dir);
632 }
633 }
634}
635
636sub create_sortfield_shortname {
637 my $self = shift(@_);
638
639 my ($realname) = @_;
640
641 my $index_shortname;
642 # if we have created a shortname for an index on this field, then use it.
643 if (defined $self->{'fieldnamemap'}->{$realname}) {
644 $index_shortname = $self->{'fieldnamemap'}->{$realname};
645 } else {
646 $index_shortname = $self->create_shortname($realname);
647 }
648 return "by".$index_shortname;
649}
650
651
6521;
653
654
Note: See TracBrowser for help on using the repository browser.