Context Navigation

source: trunk/gsdl/perllib/lucenebuildproc.pm@ 12401

Last change on this file since 12401 was 12371, checked in by mdewsnip, 18 years ago
If sections_index_document_metadata is on, top level sections no longer inherit metadata from themselves (!?).
Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35
# Set up inheritance at compile time: lucenebuildproc is a subclass of
# mgppbuildproc, so any method not overridden here is resolved there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
39
40
# Constructor.
#
# Builds the object with the mgppbuildproc constructor, passing every
# remaining argument through untouched, then reblesses the result into
# $class so that the overrides in this package take effect.
#
# Parameters:
#   $class - the class name to bless the new object into
#   @_     - forwarded as-is to mgppbuildproc->new
#
# Returns the blessed object.
sub new {
    my $class = shift @_;

    # Explicit class-method call: the indirect object form
    # ("new mgppbuildproc (...)") depends on parser heuristics and can be
    # misparsed as a plain function call.
    my $self = mgppbuildproc->new(@_);

    return bless $self, $class;
}
47
48
# Reports whether this build processor can build incrementally.
# Always returns 1: unlike MG and MGPP, Lucene supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;

    return 1;
}
56
57
# Prepare a piece of text for output to Lucene.
#
# Parameters ($text, $strip_html, $para) are handed unchanged to the
# parent class's preprocess_text; the first value it returns is then
# stripped of entity-like sequences ("&" + 1 to 10 word characters + ";")
# and of any ampersands that remain.
#
# Returns the cleaned text.
sub preprocess_text {
    my ($self, $text, $strip_html, $para) = @_;

    # the mgpp method runs first; only its first return value is kept
    my ($cleaned) = $self->SUPER::preprocess_text($text, $strip_html, $para);

    for ($cleaned) {
        s/&\w{1,10};//g;    # remove entities
        s/&//g;             # remove any & left over
    }

    return $cleaned;
}
72
73
# Write one document, as tagged XML, to the handle stored in
# $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj - the document object; queried for its doc type, sections,
#              section text and metadata
#   $file    - value placed in the file="..." attribute of the document tag
#
# Nothing is written (and no counters change) unless the document's type is
# "indexed_doc". Otherwise the method updates $self->{'num_docs'},
# {'num_sections'}, {'num_bytes'} and {'num_processed_bytes'}, and may add
# entries to {'indexfieldmap'} and {'indexfields'}.
#
# The index specification in $self->{'index'} is read as
# "field;field;...[:anything]" -- only the part before the first ':' is used.
# Each field may be "allfields", "metadata", or a comma separated list of
# metadata names and/or "text"; a leading "top" restricts the field to the
# first section of the document.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    # The document tag only gets a gs2:id when document level is enabled.
    # The id is the document count when gdbm_level is 'document', otherwise
    # the number that the document's first section is about to receive.
    my $docid = "";
    if ($ldoc_level) {
        if ($self->{'gdbm_level'} eq 'document') {
            my $doc_sec_num = $self->{'num_docs'};
            $docid = "gs2:id=\"$doc_sec_num\"";
        }
        else {
            # default is section level
            my $doc_sec_num = $self->{'num_sections'} + 1;
            $docid = "gs2:id=\"$doc_sec_num\"";
        }
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    # An empty $sectiontag means "do not emit section tags".
    my ($sectiontag) = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    # Paragraph tags are only produced when HTML is being stripped.
    my ($parastarttag) = "";
    my ($paraendtag) = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<" . $mgppbuildproc::level_map{'paragraph'} . ">";
            $paraendtag = "</" . $mgppbuildproc::level_map{'paragraph'} . ">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;
    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"" . $self->{'num_sections'} . "\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with gdbm db
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (!$indexed_doc || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            # (the substitution also strips the "top" prefix from $real_field)
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";
            my $tmp_text = "";

            # we get allfields by default - do nothing
            if ($real_field eq "allfields") {

            }
            # metadata - output all metadata we know about except gsdl stuff
            elsif ($real_field eq "metadata") {
                my $shortname = "";
                my $metadata = $doc_obj->get_all_metadata ($section);
                # loop variable is lexical so it does not leak into a package global
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {

                        # reuse the short tag name for this field, or create
                        # and remember one the first time the field is seen
                        if (defined $self->{'indexfieldmap'}->{$mfield}) {
                            $shortname = $self->{'indexfieldmap'}->{$mfield};
                        }
                        else {
                            $shortname = $self->create_shortname($mfield);
                            $self->{'indexfieldmap'}->{$mfield} = $shortname;
                            $self->{'indexfieldmap'}->{$shortname} = 1;
                        }
                        $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }

            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                # (one short tag name is used for the whole list)
                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$real_field}) {
                    $shortname = $self->{'indexfieldmap'}->{$real_field};
                }
                else {
                    $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                my @metadata_list = ();
                # loop variable is lexical so it does not leak into a package global
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            $tmp_text .= $doc_obj->get_text ($section);
                            if ($parastarttag ne "") {
                                # the third argument closes and reopens the field and paragraph tags
                                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
                            }
                            $new_text .= "$tmp_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
                            $tmp_text .= $doc_obj->get_text ($section);
                            # NOTE(review): return value is ignored, so htmlsafe
                            # presumably escapes its argument in place -- confirm in ghtml
                            &ghtml::htmlsafe($tmp_text);
                            $new_text .= $tmp_text;

                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # Sections other than the top one may also pick up the
                        # top section's values for this metadata, depending on
                        # the sections_index_document_metadata setting.
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || (scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }

                # remove entities
                $new_text =~ s/&\w{1,10};//g;
                # remove &
                $new_text =~ s/&//g;
            }

            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section
    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
264
265	1;
266

Note: See TracBrowser for help on using the repository browser.

Download in other formats: