Context Navigation

source: trunk/gsdl/perllib/lucenebuildproc.pm@ 12581

Last change on this file since 12581 was 12426, checked in by mdewsnip, 18 years ago
Deleted the code for removing entities, since it seemed to be negatively helpful (and done twice in many situations). When compressing the text, htmlsafe is called on the section text, so the XML will be valid in this case. When indexing the text, the HTML tags are stripped out ('strip_html' is always set for Lucene), so there is no problem in this case either.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.3 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use the same basic XML structure set up by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Establish the inheritance chain at compile time: lucenebuildproc is a
# specialisation of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
42
43
# Constructor.
#
# Builds the object with the mgppbuildproc constructor, forwarding every
# remaining argument untouched, then re-blesses the result into the
# requested class so that the method overrides in this package take effect.
#
# Arguments: the class name, followed by whatever mgppbuildproc's
#            constructor expects.
# Returns:   the object, blessed into $class.
sub new {
    my $class = shift @_;

    # Explicit class-method call rather than indirect object syntax
    # ("new mgppbuildproc (@_)"), whose parse depends on which package and
    # sub names the compiler has already seen.
    my $self = mgppbuildproc->new(@_);

    return bless $self, $class;
}
50
51
# Capability query: reports whether this build processor can build
# incrementally.  The invocant is not consulted; the answer is always true.
sub is_incremental_capable {
    # Unlike MG and MGPP, Lucene supports incremental building
    return 1;
}
59
60
# Private helper: return the index short name for $field.
#
# Uses the name already recorded in $self->{'indexfieldmap'} when there is
# one; otherwise asks create_shortname() for a new one and records both the
# field => shortname mapping and a shortname => 1 marker.
sub _shortname_for_field {
    my ($self, $field) = @_;

    my $shortname = $self->{'indexfieldmap'}->{$field};
    return $shortname if (defined $shortname);

    $shortname = $self->create_shortname($field);
    $self->{'indexfieldmap'}->{$field} = $shortname;
    $self->{'indexfieldmap'}->{$shortname} = 1;
    return $shortname;
}

# Write the XML representation of one document to the output handle.
#
# Arguments:
#   $doc_obj - the document object; only documents whose get_doc_type() is
#              "indexed_doc" produce any output
#   $file    - value written into the file="..." attribute of the document tag
#
# Reads from $self: output_handle, outhandle, index, levels, gdbm_level,
# strip_html, indexing_text, sections_index_document_metadata, indexfieldmap.
# Updates in $self: num_docs, num_sections, num_bytes, num_processed_bytes,
# indexfieldmap, indexfields.
#
# The document is wrapped in the 'document' level tag; each section is
# wrapped in the 'section' level tag when section level is enabled.  The
# part of $self->{'index'} before the first ':' is a ';'-separated list of
# field specifications, each of which is one of:
#   allfields        - nothing extra is output
#   metadata         - every metadata value except Identifier, gsdl*,
#                      classifytype and assocfilepath
#   a ','-separated list of metadata names and/or "text"
# A leading "top" on a specification restricts it to the first section.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle    = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    # the field list is the same for every section, so split it only once
    my @field_list = split (/;/, $fields);

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    my $docid = "";
    if ($ldoc_level) {
        if ($self->{'gdbm_level'} eq 'document') {
            my $doc_sec_num = $self->{'num_docs'};
            $docid = "gs2:id=\"$doc_sec_num\"";
        }
        else {
            # default is section level: the id is the number the document's
            # first section is about to receive
            my $doc_sec_num = $self->{'num_sections'} + 1;
            $docid = "gs2:id=\"$doc_sec_num\"";
        }
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my $sectiontag = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<" . $mgppbuildproc::level_map{'paragraph'} . ">";
            $paraendtag = "</" . $mgppbuildproc::level_map{'paragraph'} . ">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = $documenttag;

    # get the text for this document
    my $top_section = $doc_obj->get_top_section();
    my $section = $top_section;
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"" . $self->{'num_sections'} . "\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with gdbm db
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        my $skip_section = (!$indexed_doc
                            || ($indexed_section ne "indexed_section"
                                && $indexed_section ne "indexed_doc"));

        if (!$skip_section) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

            foreach my $field (@field_list) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                next if (($real_field =~ s/^top//) && ($doc_section != 1));

                my $new_text = "";

                if ($real_field eq "allfields") {
                    # we get allfields by default - do nothing
                }
                elsif ($real_field eq "metadata") {
                    # metadata - output all metadata we know about except gsdl stuff
                    my $metadata = $doc_obj->get_all_metadata ($section);
                    foreach my $pair (@$metadata) {
                        my ($mfield, $mvalue) = (@$pair);
                        # check fields here, maybe others dont want - change to use dontindex!!
                        next unless ($mfield ne "Identifier"
                                     && $mfield !~ /^gsdl/
                                     && $mfield ne "classifytype"
                                     && $mfield ne "assocfilepath"
                                     && defined $mvalue && $mvalue ne "");

                        my $shortname = _shortname_for_field($self, $mfield);
                        $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
                else {
                    # individual metadata and or text specified - could be a
                    # comma separated list; the whole list shares one shortname
                    my $shortname = _shortname_for_field($self, $real_field);

                    my @metadata_list = ();
                    foreach my $submeta (split /,/, $real_field) {
                        if ($submeta eq "text") {
                            my $section_text = $doc_obj->get_text($section);
                            if ($self->{'indexing_text'}) {
                                # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                                $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                                if ($parastarttag ne "") {
                                    $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                                }
                                else {
                                    # we don't want to individually tag each paragraph if not doing para indexing
                                    $section_text = $self->preprocess_text($section_text, 1, "");
                                }
                                $new_text .= "$section_text</$shortname>$paraendtag\n";
                            }
                            else {
                                # leave html stuff in, but escape the tags, and don't add Paragraph tags - never retrieve paras at the moment
                                # NOTE(review): the return value is discarded, so htmlsafe
                                # presumably escapes $section_text in place - confirm in ghtml
                                &ghtml::htmlsafe($section_text);
                                $new_text .= $section_text;
                            }
                        }
                        else {
                            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                            # optionally let subsections inherit the top section's metadata
                            if ($section ne $top_section && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                                if ($self->{'sections_index_document_metadata'} eq "always" || (scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                    push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $submeta)});
                                }
                            }
                            push (@metadata_list, @section_metadata);
                        }
                    }
                    foreach my $item (@metadata_list) {
                        $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                    }
                }

                # filter the text
                $self->filter_text ($field, $new_text);
                $self->{'num_processed_bytes'} += length ($new_text);
                $text .= $new_text;
            } # foreach field
        }

        # close the section (also for skipped sections, which are kept as
        # empty placeholders) and move on
        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print {$handle} "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
245
246	1;
247

Note: See TracBrowser for help on using the repository browser.

Download in other formats: