Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 16506

Last change on this file since 16506 was 16506, checked in by mdewsnip, 16 years ago
Now adds gs2:docOID attributes into "<Sec>" tags as well, to prevent errors when indexing at section level.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.3 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Establish the inheritance chain at compile time: lucenebuildproc is a
# specialisation of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object through the mgppbuildproc constructor (all arguments
# are forwarded untouched), initialises the counter used to number
# incrementally added documents, and re-blesses the result into $class.
sub new {
    my $class = shift @_;

    # Direct method call rather than the ambiguous indirect-object form
    # "new mgppbuildproc (...)".
    my $self = mgppbuildproc->new(@_);

    # Set to a real starting value by process() on first incremental use.
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Reports whether this build processor can add to an existing index.
# Always true here: unlike MG and MGPP, Lucene supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;
    return 1;
}
63
64
# Returns true when a metadata pair should be written to the index:
# the value must be defined and non-empty, and the field must not be one of
# the internal ones (Identifier, gsdl*, classifytype, assocfilepath).
sub _lucene_is_indexable_metadata {
    my ($mfield, $mvalue) = @_;

    return 0 if (!defined $mvalue || $mvalue eq "");
    return 0 if ($mfield eq "Identifier");
    return 0 if ($mfield =~ /^gsdl/);
    return 0 if ($mfield eq "classifytype");
    return 0 if ($mfield eq "assocfilepath");
    return 1;
}

# Returns the short index tag name for $field, creating and recording one in
# $self->{'indexfieldmap'} (both directions) the first time it is seen.
sub _lucene_shortname_for {
    my ($self, $field) = @_;

    my $fieldmap = $self->{'indexfieldmap'};
    if (defined $fieldmap->{$field}) {
        return $fieldmap->{$field};
    }

    my $shortname = $self->create_shortname($field);
    $self->{'indexfieldmap'}->{$field} = $shortname;
    $self->{'indexfieldmap'}->{$shortname} = 1;
    return $shortname;
}

# Returns an escaped copy of $value (via ghtml::htmlsafe, which edits its
# argument in place) so it can be embedded in the generated XML.
sub _lucene_xml_escaped {
    my ($value) = @_;

    $value = "" unless defined $value;
    ghtml::htmlsafe($value);
    return $value;
}

# Writes one document to the output handle as the XML that the Lucene
# indexer consumes.
#
# Parameters:
#   $doc_obj - the document object to output
#   $file    - the file name recorded in the document tag's "file" attribute
#
# Nothing is written unless the document's type is "indexed_doc".
# Updates the num_docs, num_sections, num_bytes and num_processed_bytes
# counters, and registers fields in indexfields / indexfieldmap as they
# are encountered.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle    = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};
    my $levels       = $self->{'levels'};

    my $gs2_id = "";
    if ($levels->{'document'}) {
        if ($self->{'db_level'} eq 'document') {
            $gs2_id = $self->{'num_docs'};
        }
        else {
            # default is section level
            $gs2_id = $self->{'num_sections'} + 1;
        }
    }
    my $gs2_docOID = $doc_obj->get_OID();

    # Attribute values are escaped so that a file name containing e.g. '&'
    # or '"' cannot make the generated XML malformed.
    my $file_attr = _lucene_xml_escaped($file);
    my $oid_attr  = _lucene_xml_escaped($gs2_docOID);
    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file_attr\" gs2:id=\"$gs2_id\" gs2:docOID=\"$oid_attr\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $sec_tag_name = "";
    if ($levels->{'section'}) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my ($parastarttag, $paraendtag) = ("", "");
    if ($levels->{'paragraph'}) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<"  . $mgppbuildproc::level_map{'paragraph'} . ">";
            $paraendtag   = "</" . $mgppbuildproc::level_map{'paragraph'} . ">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sec_tag_name ne "") {
            my $sec_gs2_id = $self->{'num_sections'};
            # Only append the section suffix when there is one, so an empty
            # section id does not yield an OID with a trailing ".".
            my $sec_gs2_docOID = $gs2_docOID;
            $sec_gs2_docOID .= "." . $section if ($section ne "");
            $sec_gs2_docOID = _lucene_xml_escaped($sec_gs2_docOID);
            $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        my $skip_section = ($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc");

        if (!$skip_section) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

            foreach my $field (split (/;/, $fields)) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                next if (($real_field =~ s/^top//) && ($doc_section != 1));

                my $new_text = "";

                if ($real_field eq "allfields") {
                    # If allfields is requested add all metadata fields and
                    # text as belonging to the ZZ field.
                    # Text first - no html nor paragraph tags
                    $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                    my $plain_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                    ghtml::htmlsafe($plain_text);
                    $new_text .= "$plain_text</ZZ>$paraendtag\n";

                    # Then Metadata
                    my $metadata = $doc_obj->get_all_metadata ($section);
                    foreach my $pair (@$metadata) {
                        my ($mfield, $mvalue) = (@$pair);
                        # check fields here, maybe others dont want - change to use dontindex!!
                        if (_lucene_is_indexable_metadata($mfield, $mvalue)) {
                            ghtml::htmlsafe($mvalue);
                            $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                        }
                        # every field seen is registered, indexable or not
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
                elsif ($real_field eq "metadata") {
                    # metadata - output all metadata we know about except gsdl stuff
                    my $metadata = $doc_obj->get_all_metadata ($section);
                    foreach my $pair (@$metadata) {
                        my ($mfield, $mvalue) = (@$pair);
                        # check fields here, maybe others dont want - change to use dontindex!!
                        next unless _lucene_is_indexable_metadata($mfield, $mvalue);

                        ghtml::htmlsafe($mvalue);
                        my $shortname = _lucene_shortname_for($self, $mfield);
                        $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
                else {
                    # individual metadata and or text specified - could be a
                    # comma separated list; the whole list shares one shortname
                    my $shortname = _lucene_shortname_for($self, $real_field);

                    my @metadata_list = ();
                    foreach my $submeta (split /,/, $real_field) {
                        if ($submeta eq "text") {
                            my $section_text = $doc_obj->get_text($section);
                            if ($self->{'indexing_text'}) {
                                # tag the text, add the paragraph tags and always strip out HTML
                                $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                                # we don't want to individually tag each paragraph
                                # if not doing para indexing
                                my $para_break = "";
                                if ($parastarttag ne "") {
                                    $para_break = "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">";
                                }
                                $section_text = $self->preprocess_text($section_text, 1, $para_break);
                                $new_text .= "$section_text</$shortname>$paraendtag\n";
                            }
                            else {
                                # leave html stuff in, but escape the tags, and dont add
                                # Paragraph tags - never retrieve paras at the moment.
                                # Escape only this section's text (no accumulation
                                # across repeated "text" entries).
                                ghtml::htmlsafe($section_text);
                                $new_text .= $section_text;
                            }
                        }
                        else {
                            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                            if ($section ne $doc_obj->get_top_section()
                                && $self->{'indexing_text'}
                                && defined ($self->{'sections_index_document_metadata'})) {
                                my $policy = $self->{'sections_index_document_metadata'};
                                if ($policy eq "always"
                                    || (scalar(@section_metadata) == 0 && $policy eq "unless_section_metadata_exists")) {
                                    # also index the document-level values for this section
                                    push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                                }
                            }
                            push (@metadata_list, @section_metadata);
                        }
                    }
                    foreach my $item (@metadata_list) {
                        ghtml::htmlsafe($item);
                        $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                    }
                }

                # filter the text
                $self->filter_text ($field, $new_text);
                $self->{'num_processed_bytes'} += length ($new_text);

                $text .= "$new_text";
            } # foreach field
        }

        # the section tag is closed even for skipped sections, so the
        # placeholders stay in step with the database
        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
285
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In "incinfodb" mode the document is annotated section by section with
#  *  the metadata written below (doctype, hastxt, archivedir, thistype,
#  *  childtype, contains, docnum) and then handed to
#  *  IncrementalBuildUtils::addDocument. In every other mode the call is
#  *  passed straight through to mgppbuildproc's process().
#  *
#  *  @param $self A reference to this Lucene builder
#  *  @param $doc_obj A reference to a document object representing what was
#  *         parsed by the GAPlug
#  *  @param $file The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb") {
        return $self->mgppbuildproc::process(@_);
    }

    print STDERR "* Processing a document added using INCINFODB *\n";

    # Everything before the last path separator ('/' or '\') is the archive
    # directory; normalise to forward slashes and trim leading/trailing ones.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    # NOTE(review): if get_assoc_files() returns an array reference this prints
    # the reference rather than a count - confirm against doc.pm.
    print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n";

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0) {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n";
    my $doc_OID = $doc_obj->get_OID();
    my $url = "";
    while (defined $section) {
        print STDERR "+ processing section: '$section'\n";

        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin).
        # The previous code re-added the undefined/blank $dtype itself, which
        # recorded nothing useful; "doc" is the value meant by the comment
        # above - presumably matching the other build processors, verify.
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $has_text = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $has_text);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children) {
                # each entry is a literal '"' followed by ".<n>", where <n> is
                # the trailing numeric component of the child id when present
                if ($child =~ /^.*?\.(\d+)$/) {
                    push (@contains, "\".$1");
                }
                else {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
    IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
399
400
401	# Following methods seem to be no different to those defined in basebuildproc.pm
402	# From inspection, it looks like these ones can be removed
403
404
# Accessor: the number of documents counted so far ('num_docs').
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
410
# Accessor: the number of sections counted so far ('num_sections').
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
416
# Accessor for 'num_bytes': the actual number of bytes in the collection,
# which is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
424
425
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
# "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# Parameters:
#   $text       - the raw section text
#   $strip_html - must be true; when false nothing is returned
#   $para       - replacement for paragraph tags, or "" to drop every tag
#
# Returns the cleaned text, or nothing (undef in scalar context) when
# $strip_html is false.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    # treat a missing paragraph replacement the same as "no paragraph tagging"
    $para = "" unless defined $para;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags (replaced by a space so words don't run together)
        $new_text =~ s/<[^>]*>/ /gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A lookahead is used so that the character after the '&' is not consumed:
    # this also catches an '&' at the very end of the text and every '&' in a
    # run such as "&&".
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
461
462
463	1;
464

Note: See TracBrowser for help on using the repository browser.

Download in other formats: