Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 18463

Last change on this file since 18463 was 18456, checked in by davidb, 15 years ago
Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene
Property svn:keywords set to `Author Date Id Revision`
File size: 17.7 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
41	sub BEGIN {
42	@lucenebuildproc::ISA = ('mgppbuildproc');
43	}
44
45
46	sub new {
47	my $class = shift @_;
48	my $self = new mgppbuildproc (@_);
49
50	$self->{'numincdocs'} = 0;
51
52	return bless $self, $class;
53	}
54
55
56	sub is_incremental_capable
57	{
58	my $self = shift (@_);
59
60	# Unlike MG and MGPP, Lucene supports incremental building
61	return 1;
62	}
63
64
65	sub textedit {
66	my $self = shift (@_);
67	my ($doc_obj,$file,$edit_mode) = @_;
68
69	my $lucenehandle = $self->{'output_handle'};
70	my $outhandle = $self->{'outhandle'};
71
72	# only output this document if it is one to be indexed
73	return if ($doc_obj->get_doc_type() ne "indexed_doc");
74
75	# skip this document if in "compress-text" mode and asked to delete it
76	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
77
78	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
79
80	# this is another document
81	if (($edit_mode eq "add") \|\| ($edit_mode eq "reindex")) {
82	$self->{'num_docs'} += 1;
83	}
84	else {
85	$self->{'num_docs'} -= 1;
86	}
87
88	# get the parameters for the output
89	# split on : just in case there is subcoll and lang stuff
90	my ($fields) = split (/:/, $self->{'index'});
91
92	my $doc_tag_name = $mgppbuildproc::level_map{'document'};
93
94	my $levels = $self->{'levels'};
95	my $ldoc_level = $levels->{'document'};
96	my $lsec_level = $levels->{'section'};
97
98	# gs2_id should be depricated #####
99	my $gs2_id = "";
100	if ($ldoc_level)
101	{
102	if ($self->{'db_level'} eq 'document')
103	{
104	$gs2_id = $self->{'num_docs'};
105	}
106	else
107	{
108	# default is section level
109	$gs2_id = $self->{'num_sections'} + 1;
110	}
111	}
112	my $gs2_docOID = $doc_obj->get_OID();
113	my $documenttag = undef;
114	my $documentendtag = undef;
115
116	$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
117	$documentendtag = "\n</$doc_tag_name>\n";
118
119	my $sec_tag_name = "";
120	if ($lsec_level)
121	{
122	$sec_tag_name = $mgppbuildproc::level_map{'section'};
123	}
124
125	my $doc_section = 0; # just for this document
126
127	my $text = "";
128	$text .= $documenttag;
129	# get the text for this document
130	my $section = $doc_obj->get_top_section();
131	while (defined $section)
132	{
133	# update a few statistics
134	$doc_section++;
135	$self->{'num_sections'}++;
136
137	my $sec_gs2_id = $self->{'num_sections'};
138	my $sec_gs2_docOID = $gs2_docOID;
139	$sec_gs2_docOID .= ".$section" if ($section ne "");
140
141	# if we are doing subcollections, then some docs shouldn't be indexed.
142	# but we need to put the section tag placeholders in there so the
143	# sections match up with database
144	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
145	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
146	if ($sec_tag_name ne "") {
147	$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
148	$text .= "\n</$sec_tag_name>\n"
149	}
150	$section = $doc_obj->get_next_section($section);
151	next;
152	}
153
154	if ($sec_tag_name ne "")
155	{
156	$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
157	}
158
159	if (($edit_mode eq "add") \|\| ($edit_mode eq "reindex")) {
160	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
161	}
162	else {
163	# delete
164	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
165	}
166
167
168	# has the user added a 'metadata' index?
169	my $all_metadata_specified = 0;
170	# which fields have already been indexed? (same as fields, but in a map)
171	my $specified_fields = {};
172
173	# do we have an allfields index??
174	my $allfields_index = 0;
175	# collect up all the text for it in here
176	my $allfields_text = "";
177	foreach my $field (split (/;/, $fields)) {
178	if ($field eq "allfields") {
179	$allfields_index = 1;
180	} elsif ($field eq "metadata") {
181	$all_metadata_specified = 1;
182	}
183	}
184
185	foreach my $field (split (/;/, $fields)) {
186
187	# only deal with this field if it doesn't start with top or
188	# this is the first section
189	my $real_field = $field;
190	next if (($real_field =~ s/^top//) && ($doc_section != 1));
191
192	# process these two later
193	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
194
195	#individual metadata and or text specified - could be a comma separated list
196	$specified_fields->{$real_field} = 1;
197	my $shortname="";
198	my $new_field = 0; # have we found a new field name?
199	if (defined $self->{'indexfieldmap'}->{$real_field}) {
200	$shortname = $self->{'indexfieldmap'}->{$real_field};
201	}
202	else {
203	$shortname = $self->create_shortname($real_field);
204	$new_field = 1;
205	}
206
207	my @metadata_list = (); # put any metadata values in here
208	my $section_text = ""; # put the text in here
209	foreach my $submeta (split /,/, $real_field) {
210	if ($submeta eq "text") {
211	# no point in indexing text more than once
212	if ($section_text eq "") {
213	$section_text = $doc_obj->get_text($section);
214	if ($self->{'indexing_text'}) {
215	# we always strip html
216	$section_text = $self->preprocess_text($section_text, 1, "");
217	}
218	else {
219	# leave html stuff in, but escape the tags
220	&ghtml::htmlsafe($section_text);
221	}
222	}
223	}
224	else {
225	# its a metadata element
226	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
227	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
228	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
229	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
230	}
231	}
232	push (@metadata_list, @section_metadata);
233	}
234	} # for each field in this one index
235
236	# now we add the text and/or metadata into new_text
237	if ($section_text ne "" \|\| scalar(@metadata_list)) {
238	my $new_text = "";
239
240	if ($section_text ne "") {
241	$new_text .= "$section_text ";
242	}
243
244	foreach my $item (@metadata_list) {
245	&ghtml::htmlsafe($item);
246	$new_text .= "$item ";
247	}
248
249	if ($allfields_index) {
250	$allfields_text .= $new_text;
251	}
252
253	if ($self->{'indexing_text'}) {
254	# add the tag
255	$new_text = "<$shortname index=\"1\">$new_text</$shortname>";
256	}
257	# filter the text
258	$new_text = $self->filter_text ($field, $new_text);
259
260	if (($edit_mode eq "add") \|\| ($edit_mode eq "reindex")) {
261	$self->{'num_processed_bytes'} += length ($new_text);
262	$text .= "$new_text";
263	}
264	else {
265	# delete
266	$self->{'num_processed_bytes'} -= length ($new_text);
267	}
268
269
270	if ($self->{'indexing_text'} && $new_field) {
271	# we need to add to the list in indexfields
272
273	$self->{'indexfieldmap'}->{$real_field} = $shortname;
274	$self->{'indexfieldmap'}->{$shortname} = 1;
275	}
276
277	}
278
279	} # foreach field
280
281
282	if ($all_metadata_specified) {
283
284	my $new_text = "";
285	my $shortname = "";
286	my $metadata = $doc_obj->get_all_metadata ($section);
287	foreach my $pair (@$metadata) {
288	my ($mfield, $mvalue) = (@$pair);
289	# no value
290	next unless defined $mvalue && $mvalue ne "";
291	# we have already indexed this
292	next if defined ($specified_fields->{$mfield});
293	# check fields here, maybe others dont want - change to use dontindex!!
294	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
295	next if ($mfield =~ /^gsdl/);
296
297	&ghtml::htmlsafe($mvalue);
298
299	if (defined $self->{'indexfieldmap'}->{$mfield}) {
300	$shortname = $self->{'indexfieldmap'}->{$mfield};
301	}
302	else {
303	$shortname = $self->create_shortname($mfield);
304	$self->{'indexfieldmap'}->{$mfield} = $shortname;
305	$self->{'indexfieldmap'}->{$shortname} = 1;
306	}
307	$new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
308	if ($allfields_index) {
309	$allfields_text .= "$mvalue ";
310	}
311
312	if (!defined $self->{'indexfields'}->{$mfield}) {
313	$self->{'indexfields'}->{$mfield} = 1;
314	}
315
316	}
317	# filter the text
318	$new_text = $self->filter_text ("metadata", $new_text);
319
320	if (($edit_mode eq "add") \|\| ($edit_mode eq "reindex")) {
321	$self->{'num_processed_bytes'} += length ($new_text);
322	$text .= "$new_text";
323	}
324	else {
325	# delete
326	$self->{'num_processed_bytes'} -= length ($new_text);
327	}
328	}
329
330	if ($allfields_index) {
331	# add the index name mapping
332	$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
333	$self->{'indexfieldmap'}->{"ZZ"} = 1;
334
335	my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
336	# filter the text
337	$new_text = $self->filter_text ("allfields", $new_text);
338
339	if (($edit_mode eq "add") \|\| ($edit_mode eq "reindex")) {
340	$self->{'num_processed_bytes'} += length ($new_text);
341	$text .= "$new_text";
342	}
343	else {
344	# delete
345	$self->{'num_processed_bytes'} -= length ($new_text);
346	}
347	}
348
349	$text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
350
351	$section = $doc_obj->get_next_section($section);
352	} # while defined section
353
354	print $lucenehandle "$text\n$documentendtag";
355
356	## if ($edit_mode eq "delete") {
357	## print STDERR "$text\n$documentendtag";
358	## }
359
360	}
361
362	sub text {
363	my $self = shift (@_);
364	my ($doc_obj,$file) = @_;
365
366	$self->textedit($doc_obj,$file,"add");
367	}
368
369	sub textreindex
370	{
371	my $self = shift (@_);
372	my ($doc_obj,$file) = @_;
373
374	$self->textedit($doc_obj,$file,"reindex");
375	}
376
377	sub textdelete
378	{
379	my $self = shift (@_);
380	my ($doc_obj,$file) = @_;
381
382	$self->textedit($doc_obj,$file,"delete");
383	}
384
385
386
387
388
389	# /** We make this builder pretend to be a document processor so we can get
390	# * information back from the plugins.
391	# *
392	# * @param $self A reference to this Lucene builder
393	# * @param $doc_obj A reference to a document object representing what was
394	# * parsed by the GAPlug
395	# * @param $file The name of the file parsed as a string
396	# *
397	# * @author John Thompson, DL Consulting Ltd
398	# */
399	sub process()
400	{
401	my $self = shift (@_);
402	my ($doc_obj, $file) = @_;
403
404	# If this is called from any stage other than an incremental infodb we want
405	# to pass through to the superclass of build
406	if ($self->get_mode() eq "incinfodb")
407	{
408	print STDERR "* Processing a document added using INCINFODB *\n" if ($self->{'verbosity'} > 3);
409	my ($archivedir) = $file =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
410	$archivedir = "" unless defined $archivedir;
411	$archivedir =~ s/\\/\//g;
412	$archivedir =~ s/^\/+//;
413	$archivedir =~ s/\/+$//;
414
415	# Number of files
416	print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
417
418	# resolve the final filenames of the files associated with this document
419	$self->assoc_files ($doc_obj, $archivedir);
420
421	# is this a paged or a hierarchical document
422	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
423
424	# Determine the actual docnum by checking if we've processed any
425	# previous incrementally added documents. If so, carry on from there.
426	# Otherwise we set the counter to be the same as the number of
427	# sections encountered during the previous build
428	if ($self->{'numincdocs'} == 0)
429	{
430	$self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
431	}
432
433	my $section = $doc_obj->get_top_section ();
434	print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
435	my $doc_OID = $doc_obj->get_OID();
436	my $url = "";
437	while (defined $section)
438	{
439	print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
440	# Attach all the other metadata to this document
441	# output the fact that this document is a document (unless doctype
442	# has been set to something else from within a plugin
443	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
444	if (!defined $dtype \|\| $dtype !~ /\w/)
445	{
446	#$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
447	$doc_obj->add_utf8_metadata($section, "doctype", "doc");
448	}
449	# output whether this node contains text
450	if ($doc_obj->get_text_length($section) > 0)
451	{
452	$doc_obj->add_utf8_metadata($section, "hastxt", 1);
453	}
454	else
455	{
456	$doc_obj->add_utf8_metadata($section, "hastxt", 0);
457	}
458
459	# output archivedir if at top level
460	if ($section eq $doc_obj->get_top_section())
461	{
462	$doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
463	$doc_obj->add_utf8_metadata($section, "thistype", $thistype);
464	}
465
466	# output a list of children
467	my $children = $doc_obj->get_children ($section);
468	if (scalar(@$children) > 0)
469	{
470	$doc_obj->add_utf8_metadata($section, "childtype", $childtype);
471	my @contains = ();
472	foreach my $child (@$children)
473	{
474	if ($child =~ /^.*?\.(\d+)$/)
475	{
476	push (@contains, "\".$1");
477	}
478	else
479	{
480	push (@contains, "\".$child");
481	}
482	}
483	$doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
484	}
485	#output the matching doc number
486	print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
487	$doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
488
489	$self->{'numincdocs'}++;
490	$section = $doc_obj->get_next_section($section);
491	# if no sections wanted, only add the docs
492	last if ($self->{'db_level'} eq "document");
493	}
494	print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
495	&IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
496	}
497	else
498	{
499	$self->mgppbuildproc::process(@_);
500	}
501	}
502	# / process() /
503
504
505	# Following methods seem to be no different to those defined in basebuildproc.pm
506	# From inspection, it looks like these ones can be removed
507
508
509	sub get_num_docs {
510	my $self = shift (@_);
511	#rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
512	return $self->{'num_docs'};
513	}
514
515	sub get_num_sections {
516	my $self = shift (@_);
517	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
518	return $self->{'num_sections'};
519	}
520
521	# num_bytes is the actual number of bytes in the collection
522	# this is normally the same as what's processed during text compression
523	sub get_num_bytes {
524	my $self = shift (@_);
525	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
526	return $self->{'num_bytes'};
527	}
528
529
530	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
531	# Otherwise the removal of tags below might lead to Lucene turning
532	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
533	# (example from demo collection b20cre)
534	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
535	sub preprocess_text
536	{
537	my $self = shift (@_);
538	my ($text, $strip_html, $para) = @_;
539	# at this stage, we do not do paragraph tags unless have strip_html -
540	# it will result in a huge mess of non-xml
541	return unless $strip_html;
542
543	my $new_text = $text;
544
545	# if we have <pre> tags, we can have < > inside them, need to delete
546	# the <> before stripping tags
547	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
548
549	if ($para eq "") {
550	# just remove all tags
551	$new_text =~ s/<[^>]*>/ /gs;
552	} else {
553	# strip all tags except <p> tags which get turned into $para
554	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
555	}
556
557	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
558	# may not be valid XML (eg. if HTML-only entities like   are used)
559	$new_text =~ s/&\w{1,10};//g;
560	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
561	$new_text =~ s/&([^\#])/ $1/g;
562
563	return $new_text;
564	}
565
566
567	1;
568
569

Note: See TracBrowser for help on using the repository browser.

Download in other formats: