source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 24404

Last change on this file since 24404 was 24404, checked in by ak19, 13 years ago

Changes to perl code to do with removing the ex. prefix: ex. is only removed if it is the sole prefix (i.e. ex.dc.* prefixes are not removed).

  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
41sub BEGIN {
42 @lucenebuildproc::ISA = ('mgppbuildproc');
43}
44
45
46sub new {
47 my $class = shift @_;
48 my $self = new mgppbuildproc (@_);
49
50 $self->{'numincdocs'} = 0;
51
52 return bless $self, $class;
53}
54
55
56sub is_incremental_capable
57{
58 my $self = shift (@_);
59
60 # Unlike MG and MGPP, Lucene supports incremental building
61 return 1;
62}
63
64
65sub textedit {
66 my $self = shift (@_);
67 my ($doc_obj,$file,$edit_mode) = @_;
68
69 my $lucenehandle = $self->{'output_handle'};
70 my $outhandle = $self->{'outhandle'};
71
72 # only output this document if it is one to be indexed
73 return if ($doc_obj->get_doc_type() ne "indexed_doc");
74
75 # skip this document if in "compress-text" mode and asked to delete it
76 return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
77
78 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
79
80 # this is another document
81 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
82 $self->{'num_docs'} += 1;
83 }
84 else {
85 $self->{'num_docs'} -= 1;
86 }
87
88 # get the parameters for the output
89 # split on : just in case there is subcoll and lang stuff
90 my ($fields) = split (/:/, $self->{'index'});
91
92 my $doc_tag_name = $mgppbuildproc::level_map{'document'};
93
94 my $levels = $self->{'levels'};
95 my $ldoc_level = $levels->{'document'};
96 my $lsec_level = $levels->{'section'};
97
98 # gs2_id should be depricated #####
99 my $gs2_id = "";
100 if ($ldoc_level)
101 {
102 if ($self->{'db_level'} eq 'document')
103 {
104 $gs2_id = $self->{'num_docs'};
105 }
106 else
107 {
108 # default is section level
109 $gs2_id = $self->{'num_sections'} + 1;
110 }
111 }
112 my $gs2_docOID = $doc_obj->get_OID();
113 my $documenttag = undef;
114 my $documentendtag = undef;
115
116 #$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
117 $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
118 $documentendtag = "\n</$doc_tag_name>\n";
119
120 my $sec_tag_name = "";
121 if ($lsec_level)
122 {
123 $sec_tag_name = $mgppbuildproc::level_map{'section'};
124 }
125
126 my $doc_section = 0; # just for this document
127
128 my $text = "";
129 $text .= $documenttag;
130 # get the text for this document
131 my $section = $doc_obj->get_top_section();
132 while (defined $section)
133 {
134 # update a few statistics
135 $doc_section++;
136 $self->{'num_sections'}++;
137
138 my $sec_gs2_id = $self->{'num_sections'};
139 my $sec_gs2_docOID = $gs2_docOID;
140 $sec_gs2_docOID .= ".$section" if ($section ne "");
141
142 # if we are doing subcollections, then some docs shouldn't be indexed.
143 # but we need to put the section tag placeholders in there so the
144 # sections match up with database
145 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
146 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
147 if ($sec_tag_name ne "") {
148 #$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
149 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
150 $text .= "\n</$sec_tag_name>\n"
151 }
152 $section = $doc_obj->get_next_section($section);
153 next;
154 }
155
156 if ($sec_tag_name ne "")
157 {
158 #$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
159 $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
160 }
161
162 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
163 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
164 }
165 else {
166 # delete
167 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
168 }
169
170
171 # has the user added a 'metadata' index?
172 my $all_metadata_specified = 0;
173 # which fields have already been indexed? (same as fields, but in a map)
174 my $specified_fields = {};
175
176 # do we have an allfields index??
177 my $allfields_index = 0;
178 # collect up all the text for it in here
179 my $allfields_text = "";
180 foreach my $field (split (/;/, $fields)) {
181 if ($field eq "allfields") {
182 $allfields_index = 1;
183 } elsif ($field eq "metadata") {
184 $all_metadata_specified = 1;
185 }
186 }
187
188 foreach my $field (split (/;/, $fields)) {
189
190 # only deal with this field if it doesn't start with top or
191 # this is the first section
192 my $real_field = $field;
193 next if (($real_field =~ s/^top//) && ($doc_section != 1));
194
195 # process these two later
196 next if ($real_field eq "allfields" || $real_field eq "metadata");
197
198 #individual metadata and or text specified - could be a comma separated list
199 $specified_fields->{$real_field} = 1;
200 my $shortname="";
201 my $new_field = 0; # have we found a new field name?
202 if (defined $self->{'indexfieldmap'}->{$real_field}) {
203 $shortname = $self->{'indexfieldmap'}->{$real_field};
204 }
205 else {
206 $shortname = $self->create_shortname($real_field);
207 $new_field = 1;
208 }
209
210 my @metadata_list = (); # put any metadata values in here
211 my $section_text = ""; # put the text in here
212 foreach my $submeta (split /,/, $real_field) {
213 if ($submeta eq "text") {
214 # no point in indexing text more than once
215 if ($section_text eq "") {
216 $section_text = $doc_obj->get_text($section);
217 if ($self->{'indexing_text'}) {
218 # we always strip html
219 $section_text = $self->preprocess_text($section_text, 1, "");
220 }
221 else {
222 # leave html stuff in, but escape the tags
223 &ghtml::htmlsafe($section_text);
224 }
225 }
226 }
227 else {
228 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
229
230 # its a metadata element
231 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
232 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
233 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
234 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
235 }
236 }
237 push (@metadata_list, @section_metadata);
238 }
239 } # for each field in this one index
240
241 # now we add the text and/or metadata into new_text
242 if ($section_text ne "" || scalar(@metadata_list)) {
243 my $new_text = "";
244
245 if ($section_text ne "") {
246 $new_text .= "$section_text ";
247 }
248
249 foreach my $item (@metadata_list) {
250 &ghtml::htmlsafe($item);
251 $new_text .= "$item ";
252 }
253
254 if ($allfields_index) {
255 $allfields_text .= $new_text;
256 }
257
258 if ($self->{'indexing_text'}) {
259 # add the tag
260 $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
261 }
262 # filter the text
263 $new_text = $self->filter_text ($field, $new_text);
264
265 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
266 $self->{'num_processed_bytes'} += length ($new_text);
267 $text .= "$new_text";
268 }
269 else {
270 # delete
271 $self->{'num_processed_bytes'} -= length ($new_text);
272 }
273
274
275 if ($self->{'indexing_text'} && $new_field) {
276 # we need to add to the list in indexfields
277
278 $self->{'indexfieldmap'}->{$real_field} = $shortname;
279 $self->{'indexfieldmap'}->{$shortname} = 1;
280 }
281
282 }
283
284 } # foreach field
285
286
287 if ($all_metadata_specified) {
288
289 my $new_text = "";
290 my $shortname = "";
291 my $metadata = $doc_obj->get_all_metadata ($section);
292 foreach my $pair (@$metadata) {
293 my ($mfield, $mvalue) = (@$pair);
294 # no value
295 next unless defined $mvalue && $mvalue ne "";
296 # we have already indexed this
297 next if defined ($specified_fields->{$mfield});
298 # check fields here, maybe others dont want - change to use dontindex!!
299 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
300 next if ($mfield =~ /^gsdl/);
301
302 &ghtml::htmlsafe($mvalue);
303
304 if (defined $self->{'indexfieldmap'}->{$mfield}) {
305 $shortname = $self->{'indexfieldmap'}->{$mfield};
306 }
307 else {
308 $shortname = $self->create_shortname($mfield);
309 $self->{'indexfieldmap'}->{$mfield} = $shortname;
310 $self->{'indexfieldmap'}->{$shortname} = 1;
311 }
312 $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
313 if ($allfields_index) {
314 $allfields_text .= "$mvalue ";
315 }
316
317 if (!defined $self->{'indexfields'}->{$mfield}) {
318 $self->{'indexfields'}->{$mfield} = 1;
319 }
320
321 }
322 # filter the text
323 $new_text = $self->filter_text ("metadata", $new_text);
324
325 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
326 $self->{'num_processed_bytes'} += length ($new_text);
327 $text .= "$new_text";
328 }
329 else {
330 # delete
331 $self->{'num_processed_bytes'} -= length ($new_text);
332 }
333 }
334
335 if ($allfields_index) {
336 # add the index name mapping
337 $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
338 $self->{'indexfieldmap'}->{"ZZ"} = 1;
339
340 my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
341 # filter the text
342 $new_text = $self->filter_text ("allfields", $new_text);
343
344 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
345 $self->{'num_processed_bytes'} += length ($new_text);
346 $text .= "$new_text";
347 }
348 else {
349 # delete
350 $self->{'num_processed_bytes'} -= length ($new_text);
351 }
352 }
353
354 $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
355
356 $section = $doc_obj->get_next_section($section);
357 } # while defined section
358
359 print $lucenehandle "$text\n$documentendtag";
360
361## if ($edit_mode eq "delete") {
362## print STDERR "$text\n$documentendtag";
363## }
364
365}
366
367sub text {
368 my $self = shift (@_);
369 my ($doc_obj,$file) = @_;
370
371 $self->textedit($doc_obj,$file,"add");
372}
373
374sub textreindex
375{
376 my $self = shift (@_);
377 my ($doc_obj,$file) = @_;
378
379 $self->textedit($doc_obj,$file,"update");
380}
381
382sub textdelete
383{
384 my $self = shift (@_);
385 my ($doc_obj,$file) = @_;
386
387 $self->textedit($doc_obj,$file,"delete");
388}
389
390
391
392
393
394# /** We make this builder pretend to be a document processor so we can get
395# * information back from the plugins.
396# *
397# * @param $self A reference to this Lucene builder
398# * @param $doc_obj A reference to a document object representing what was
399# * parsed by the GAPlug
400# * @param $file The name of the file parsed as a string
401# *
402# * @author John Thompson, DL Consulting Ltd
403# */
404sub process()
405 {
406 my $self = shift (@_);
407 my ($doc_obj, $file) = @_;
408
409 # If this is called from any stage other than an incremental infodb we want
410 # to pass through to the superclass of build
411 if ($self->get_mode() eq "incinfodb")
412 {
413 print STDERR "*** Processing a document added using INCINFODB ***\n" if ($self->{'verbosity'} > 3);
414 my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
415 $archivedir = "" unless defined $archivedir;
416 $archivedir =~ s/\\/\//g;
417 $archivedir =~ s/^\/+//;
418 $archivedir =~ s/\/+$//;
419
420 # Number of files
421 print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
422
423 # resolve the final filenames of the files associated with this document
424 $self->assoc_files ($doc_obj, $archivedir);
425
426 # is this a paged or a hierarchical document
427 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
428
429 # Determine the actual docnum by checking if we've processed any
430 # previous incrementally added documents. If so, carry on from there.
431 # Otherwise we set the counter to be the same as the number of
432 # sections encountered during the previous build
433 if ($self->{'numincdocs'} == 0)
434 {
435 $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
436 }
437
438 my $section = $doc_obj->get_top_section ();
439 print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
440 my $doc_OID = $doc_obj->get_OID();
441 my $url = "";
442 while (defined $section)
443 {
444 print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
445 # Attach all the other metadata to this document
446 # output the fact that this document is a document (unless doctype
447 # has been set to something else from within a plugin
448 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
449 if (!defined $dtype || $dtype !~ /\w/)
450 {
451 #$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
452 $doc_obj->add_utf8_metadata($section, "doctype", "doc");
453 }
454 # output whether this node contains text
455 if ($doc_obj->get_text_length($section) > 0)
456 {
457 $doc_obj->add_utf8_metadata($section, "hastxt", 1);
458 }
459 else
460 {
461 $doc_obj->add_utf8_metadata($section, "hastxt", 0);
462 }
463
464 # output archivedir if at top level
465 if ($section eq $doc_obj->get_top_section())
466 {
467 $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
468 $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
469 }
470
471 # output a list of children
472 my $children = $doc_obj->get_children ($section);
473 if (scalar(@$children) > 0)
474 {
475 $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
476 my @contains = ();
477 foreach my $child (@$children)
478 {
479 if ($child =~ /^.*?\.(\d+)$/)
480 {
481 push (@contains, "\".$1");
482 }
483 else
484 {
485 push (@contains, "\".$child");
486 }
487 }
488 $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
489 }
490 #output the matching doc number
491 print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
492 $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
493
494 $self->{'numincdocs'}++;
495 $section = $doc_obj->get_next_section($section);
496 # if no sections wanted, only add the docs
497 last if ($self->{'db_level'} eq "document");
498 }
499 print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
500 &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
501 }
502 else
503 {
504 $self->mgppbuildproc::process(@_);
505 }
506 }
507# /** process() **/
508
509
510# Following methods seem to be no different to those defined in basebuildproc.pm
511# From inspection, it looks like these ones can be removed
512
513
514sub get_num_docs {
515 my $self = shift (@_);
516 #rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
517 return $self->{'num_docs'};
518}
519
520sub get_num_sections {
521 my $self = shift (@_);
522 #rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
523 return $self->{'num_sections'};
524}
525
526# num_bytes is the actual number of bytes in the collection
527# this is normally the same as what's processed during text compression
528sub get_num_bytes {
529 my $self = shift (@_);
530 #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
531 return $self->{'num_bytes'};
532}
533
534
535# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
536# Otherwise the removal of tags below might lead to Lucene turning
537# "...farming</p>\n<p>EDWARD.." into "farmingedward"
538# (example from demo collection b20cre)
539# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
540sub preprocess_text
541{
542 my $self = shift (@_);
543 my ($text, $strip_html, $para) = @_;
544 # at this stage, we do not do paragraph tags unless have strip_html -
545 # it will result in a huge mess of non-xml
546 return unless $strip_html;
547
548 my $new_text = $text;
549
550 # if we have <pre> tags, we can have < > inside them, need to delete
551 # the <> before stripping tags
552 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
553
554 if ($para eq "") {
555 # just remove all tags
556 $new_text =~ s/<[^>]*>/ /gs;
557 } else {
558 # strip all tags except <p> tags which get turned into $para
559 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
560 }
561
562 # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
563 # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
564 $new_text =~ s/&\w{1,10};//g;
565 # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
566 $new_text =~ s/&([^\#])/ $1/g;
567
568 return $new_text;
569}
570
571sub delete_assoc_files
572{
573 my $self = shift (@_);
574 my ($archivedir, $edit_mode) = @_;
575
576 $self->basebuildproc::delete_assoc_files(@_);
577
578 if ($edit_mode eq "delete") {
579 # if we are deleting the doc, then also delete the lucene text version
580 my $assoc_dir = &util::filename_cat($self->{'build_dir'},"text", $archivedir);
581 if (-d $assoc_dir) {
582 &util::rm_r($assoc_dir);
583 }
584 }
585}
5861;
587
588
Note: See TracBrowser for help on using the repository browser.