Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 17740

Last change on this file since 17740 was 17568, checked in by kjdon, 16 years ago
Recoded the text method so that it more closely matches the mgpp one. The ZZ field now contains only content that is already indexed, not all metadata. The metadata index will not reindex metadata that is already indexed. indexfieldmap only contains entries for things that have actually been indexed.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.1 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Establish inheritance at compile time: lucenebuildproc is a subclass of
# mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = ('mgppbuildproc');
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# forwarded untouched), initialises the 'numincdocs' counter used by
# process() to zero, and re-blesses the result into the requested class.
#
# @param $class  the class name to bless the new object into
# @param @_      remaining arguments, passed straight to mgppbuildproc->new
# @return        the new object, blessed into $class
sub new {
    my $class = shift @_;

    # Explicit method-call syntax rather than the indirect-object form
    # ("new mgppbuildproc (...)"), which Perl has to guess how to parse.
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Report whether this build processor supports incremental building.
# Always true here: unlike MG and MGPP, Lucene supports incremental building.
sub is_incremental_capable
{
    my ($self) = @_;
    return 1;
}
63
64
# Emit one document as XML on the 'output_handle' filehandle.
#
# Only documents whose doc type is "indexed_doc" are written; anything else
# returns immediately. The output is a document element (tag name taken from
# %mgppbuildproc::level_map) that contains, when a section level is
# configured, one element per section. Inside each indexed section there is
# one <shortname index="1">...</shortname> element per index field, then an
# element per metadata item when the "metadata" index is requested, and
# finally a <ZZ> element when "allfields" is requested.
#
# Updates the counters 'num_docs', 'num_sections', 'num_bytes' and
# 'num_processed_bytes', and records new field short names in
# 'indexfieldmap' (and 'indexfields' for the "metadata" index).
#
# @param $self     this build processor
# @param $doc_obj  the document object to output
# @param $file     the file name, written into the document tag's file attribute
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two flags below depend only on the index
    # specification, so they are worked out once rather than per section.
    my @field_list = split (/;/, $fields);

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;
    # do we have an allfields index??
    my $allfields_index = 0;
    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    #my $lpar_level = $levels->{'paragraph'};

    my $gs2_id = "";
    if ($ldoc_level) {
        if ($self->{'db_level'} eq 'document') {
            $gs2_id = $self->{'num_docs'};
        }
        else {
            # default is section level
            $gs2_id = $self->{'num_sections'} + 1;
        }
    }
    my $gs2_docOID = $doc_obj->get_OID();
    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;
    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sec_tag_name ne "") {
            my $sec_gs2_id = $self->{'num_sections'};
            my $sec_gs2_docOID = $gs2_docOID . "." . $section;
            $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);
                $self->{'num_processed_bytes'} += length ($new_text);

                $text .= "$new_text";

                if ($new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($specified_fields->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }

            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        }

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
312
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  When get_mode() returns "incinfodb", the document is decorated with
#  *  database metadata (doctype, hastxt, archivedir, thistype, childtype,
#  *  contains, docnum) section by section and then handed to
#  *  IncrementalBuildUtils::addDocument. In any other mode the call is passed
#  *  straight through to mgppbuildproc::process.
#  *
#  *  @param $self    A reference to this Lucene builder
#  *  @param $doc_obj A reference to a document object representing what was
#  *                  parsed by the GAPlug
#  *  @param $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        print STDERR "* Processing a document added using INCINFODB *\n" if ($self->{'verbosity'} > 3);

        # Everything before the last path separator ('/' or '\') is the
        # archive directory; the trailing file name component is dropped.
        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = "" unless defined $archivedir;
        # Normalise to forward slashes with no leading or trailing separator.
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files
        print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
        # NOTE(review): $doc_OID is not used below; the call is retained in
        # case get_OID() has side effects - TODO confirm and remove.
        my $doc_OID = $doc_obj->get_OID();
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                #$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }
            # output whether this node contains text
            if ($doc_obj->get_text_length($section) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
            }
            else
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
            }

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # Each entry is a literal double quote followed by
                    # ".<suffix>"; the suffix is the trailing number of the
                    # child id when it has one, otherwise the whole child id.
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }
            #output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only add the docs
            last if ($self->{'db_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
# /** process() **/
427
428
429	# Following methods seem to be no different to those defined in basebuildproc.pm
430	# From inspection, it looks like these ones can be removed
431
432
# Accessor: the value stored under 'num_docs' on this object.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
438
# Accessor: the value stored under 'num_sections' on this object.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
444
# Accessor: the value stored under 'num_bytes' on this object.
# num_bytes is the actual number of bytes in the collection;
# this is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
452
453
# Strip markup from a piece of text before it is handed to Lucene.
#
# This is similar to mgppbuildproc's preprocess_text but replaces removed
# tags with spaces. Otherwise the removal of tags might lead to Lucene
# turning "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre).
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# @param $text        the text to clean
# @param $strip_html  when false nothing is done and nothing is returned
# @param $para        when "", every tag becomes a space; otherwise each tag
#                     is handed to process_tags() together with $para
# @return             the cleaned text (empty return when $strip_html is false)
sub preprocess_text
{
    my ($self, $text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return if !$strip_html;

    my $cleaned = $text;

    # <pre> sections may contain bare < and > characters; those have to be
    # dealt with before the tags themselves are stripped
    $cleaned =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para ne "") {
        # each tag is passed to process_tags along with $para
        $cleaned =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }
    else {
        # no paragraph marker wanted: every tag simply becomes a space
        $cleaned =~ s/<[^>]*>/ /gs;
    }

    # Named entities must go, because otherwise the text passed to Lucene for
    # indexing may not be valid XML (eg. if HTML-only entities like &nbsp;
    # are used)
    $cleaned =~ s/&\w{1,10};//g;
    # Blank out stray '&' characters, except in &#nnnn; or &#xhhhh; entities
    # (which are valid XML)
    $cleaned =~ s/&([^\#])/ $1/g;

    return $cleaned;
}
489
490
491	1;
492

Note: See TracBrowser for help on using the repository browser.

Download in other formats: