Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 16300

Last change on this file since 16300 was 16300, checked in by mdewsnip, 16 years ago
Fixed another case where '<' and '>' characters in metadata weren't being escaped when producing the XML for indexing.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.3 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Establish inheritance at compile time: lucenebuildproc derives from
# mgppbuildproc, so any method not defined in this package is looked up there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the underlying mgppbuildproc object from the caller's arguments
# (passed through untouched), initialises the incremental document counter
# and re-blesses the object into the requested class.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Use an explicit class-method call rather than the indirect-object form
    # ("new mgppbuildproc (...)"), which Perl has to parse heuristically.
    my $self = mgppbuildproc->new(@_);

    # Next docnum to assign in "incinfodb" mode; process() treats 0 as
    # "not initialised yet" and seeds it from 'starting_num_sections'.
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Report whether this build processor supports incremental building.
# Always returns 1: unlike MG and MGPP, Lucene supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;

    return 1;
}
63
64
# Private helper: decide whether a metadata (field, value) pair should be
# written to the index. Identifier, classifytype, assocfilepath and any field
# whose name starts with "gsdl" are skipped, as are undefined or empty values.
# Returns 1 when the pair should be indexed, 0 otherwise.
sub _is_indexable_metadata {
    my ($mfield, $mvalue) = @_;

    return 0 if $mfield eq "Identifier";
    return 0 if $mfield =~ /^gsdl/;
    return 0 if $mfield eq "classifytype";
    return 0 if $mfield eq "assocfilepath";
    return (defined $mvalue && $mvalue ne "") ? 1 : 0;
}

# Write one document to the output handle as XML for Lucene to index.
#
# Parameters:
#   $doc_obj - document object; only processed when get_doc_type() returns
#              "indexed_doc"
#   $file    - file name, written as the 'file' attribute of the document tag
#
# Side effects: updates the 'num_docs', 'num_sections', 'num_bytes' and
# 'num_processed_bytes' counters, records fields in 'indexfields' and
# 'indexfieldmap', and prints the generated XML to $self->{'output_handle'}.
#
# NOTE(review): $file is interpolated into an XML attribute without escaping;
# confirm callers never pass names containing '"', '<' or '&'.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    # The document id is the document count when the database is built at
    # document level; otherwise (section level is the default) it is the
    # number of the first section of this document.
    my $docid = "";
    if ($ldoc_level) {
        my $doc_sec_num = ($self->{'db_level'} eq 'document')
            ? $self->{'num_docs'}
            : $self->{'num_sections'} + 1;
        $docid = "gs2:id=\"$doc_sec_num\"";
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my $sectiontag = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            my $para_level = $mgppbuildproc::level_map{'paragraph'};
            $parastarttag = "<" . $para_level . ">";
            $paraendtag = "</" . $para_level . ">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;
    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"" . $self->{'num_sections'} . "\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            # If allfields is requested add all metadata fields and text as
            # belonging to the ZZ field
            if ($real_field eq "allfields") {
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                my $all_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                ghtml::htmlsafe($all_text);
                $new_text .= "$all_text</ZZ>$paraendtag\n";
                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if (_is_indexable_metadata($mfield, $mvalue)) {
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    # every field seen is recorded here, indexed or not
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            # metadata - output all metadata we know about except gsdl stuff
            # ("allfields" is fully handled by the branch above)
            elsif ($real_field eq "metadata") {
                my $shortname = "";
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    next unless _is_indexable_metadata($mfield, $mvalue);

                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
                        $shortname = $self->{'indexfieldmap'}->{$mfield};
                    }
                    else {
                        $shortname = $self->create_shortname($mfield);
                        $self->{'indexfieldmap'}->{$mfield} = $shortname;
                        $self->{'indexfieldmap'}->{$shortname} = 1;
                    }
                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$real_field}) {
                    $shortname = $self->{'indexfieldmap'}->{$real_field};
                }
                else {
                    $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add
                            # Paragraph tags - never retrieve paras at the moment.
                            # A fresh copy is escaped each time so a repeated "text"
                            # entry is never escaped twice.
                            my $escaped_text = $section_text;
                            ghtml::htmlsafe($escaped_text);
                            $new_text .= $escaped_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || (scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    ghtml::htmlsafe($item);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }
            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section
    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
286
287	# /** We make this builder pretend to be a document processor so we can get
288	# * information back from the plugins.
289	# *
290	# * @param $self A reference to this Lucene builder
291	# * @param $doc_obj A reference to a document object representing what was
292	# * parsed by the GAPlug
293	# * @param $file The name of the file parsed as a string
294	# *
295	# * @author John Thompson, DL Consulting Ltd
296	# */
# Handle one parsed document.
#
# In "incinfodb" mode the document's sections are annotated with the database
# metadata (doctype, hastxt, archivedir, thistype, childtype, contains,
# docnum) and the document is handed to IncrementalBuildUtils::addDocument.
# In every other mode the call is passed straight to mgppbuildproc::process.
#
# Parameters:
#   $doc_obj - document object produced by the plugin
#   $file    - name of the parsed file; its directory part becomes 'archivedir'
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        print STDERR "* Processing a document added using INCINFODB *\n";

        # Capture everything before the last path separator ('/' or '\'),
        # then normalise to forward slashes with no leading/trailing slash.
        my ($archivedir) = $file =~ m{^(.*?)[/\\][^/\\]*$};
        $archivedir = "" unless defined $archivedir;
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files. If get_assoc_files() hands back an array reference,
        # report its element count rather than printing the reference itself.
        my $assoc_files = $doc_obj->get_assoc_files();
        my $num_assoc_files = (ref($assoc_files) eq 'ARRAY') ? scalar(@$assoc_files) : $assoc_files;
        print STDERR "There are " . $num_assoc_files . " associated documents...\n";

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n";
        my $doc_OID = $doc_obj->get_OID();
        my $url = "";
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n";
            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                # $dtype is undefined or blank here, so writing it back would
                # record nothing useful; store the default document type.
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }

            # output whether this node contains text
            my $has_text = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
            $doc_obj->add_utf8_metadata($section, "hastxt", $has_text);

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # each entry is '".' followed by the child's trailing
                    # number, or by the whole child id when it has none
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }

            # output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only add the docs
            last if ($self->{'db_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
        IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
# /** process() **/
400
401
402	# Following methods seem to be no different to those defined in basebuildproc.pm
403	# From inspection, it looks like these ones can be removed
404
405
# Accessor for the 'num_docs' counter stored on the object.
sub get_num_docs {
    my ($self) = @_;

    #rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
    my $num_docs = $self->{'num_docs'};
    return $num_docs;
}
411
# Accessor for the 'num_sections' counter stored on the object.
sub get_num_sections {
    my ($self) = @_;

    #rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
    my $num_sections = $self->{'num_sections'};
    return $num_sections;
}
417
418	# num_bytes is the actual number of bytes in the collection
419	# this is normally the same as what's processed during text compression
# Accessor for the 'num_bytes' counter stored on the object.
sub get_num_bytes {
    my ($self) = @_;

    #rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
    my $num_bytes = $self->{'num_bytes'};
    return $num_bytes;
}
425
426
427	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
428	# Otherwise the removal of tags below might lead to Lucene turning
429	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
430	# (example from demo collection b20cre)
431	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
# Strip HTML markup from a piece of text so it can be embedded in the XML
# handed to Lucene.
#
# Parameters:
#   $text       - the text to clean
#   $strip_html - must be true; when false nothing is returned
#   $para       - replacement passed to process_tags() for tags when
#                 paragraph indexing is on; "" means replace every tag
#                 with a single space
#
# Returns the cleaned text, or nothing when $strip_html is false.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags (replaced by a space so adjacent words stay apart)
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;
    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A lookahead is used so that the character after the '&' is not consumed:
    # this also catches a '&' at the very end of the text and each '&' in a run
    # such as "&&".
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
462
463
464	1;
465

Note: See TracBrowser for help on using the repository browser.

Download in other formats: