Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 15687

Last change on this file since 15687 was 15687, checked in by mdewsnip, 16 years ago
Removed a couple of references to gdbm.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.2 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Establish the inheritance chain at compile time: this build processor
# extends mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# passed through unchanged), initialises the 'numincdocs' counter used by
# process() to 0, and reblesses the result into $class.
sub new {
    my $class = shift @_;

    # Direct method call rather than the indirect-object form
    # "new mgppbuildproc (...)", which is ambiguous to the parser inside a
    # package that defines its own new().
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Capability flag: always true for this build processor, because
# (unlike MG and MGPP) Lucene supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;
    return 1;
}
63
64
# Write the XML that Lucene will index for one document to
# $self->{'output_handle'}.
#
# Arguments:
#   $doc_obj - the document object to output
#   $file    - placed verbatim in the file="..." attribute of the document tag
#
# Documents whose doc type is not "indexed_doc" are ignored.  Otherwise the
# document is wrapped in a document-level tag and every section is visited.
# For each field named in $self->{'index'} (";"-separated, anything after
# the first ":" is ignored) the section's text and/or metadata is emitted
# inside short-named field tags.  Sections that are not to be indexed still
# get an empty section tag so that section numbering stays aligned with the
# database.
#
# Updates the 'num_docs', 'num_sections', 'num_bytes' and
# 'num_processed_bytes' counters and the 'indexfieldmap' / 'indexfields'
# tables on $self.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    # The document id is the document count when the database is built at
    # document level; otherwise (section level is the default) it is the
    # number the document's first section is about to receive.
    my $docid = "";
    if ($ldoc_level) {
        my $doc_sec_num = ($self->{'db_level'} eq 'document')
            ? $self->{'num_docs'}
            : $self->{'num_sections'} + 1;
        $docid = "gs2:id=\"$doc_sec_num\"";
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my $sectiontag = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    # True for a metadata pair that should be indexed: internal fields are
    # skipped, as are undefined or empty values.
    # check fields here, maybe others dont want - change to use dontindex!!
    my $is_indexable = sub {
        my ($mfield, $mvalue) = @_;
        return ($mfield ne "Identifier"
                && $mfield !~ /^gsdl/
                && $mfield ne "classifytype"
                && $mfield ne "assocfilepath"
                && defined $mvalue && $mvalue ne "");
    };

    # Look up the short tag name for a field, creating and recording one
    # (in both directions) the first time the field is seen.
    my $shortname_for = sub {
        my ($name) = @_;
        my $shortname = $self->{'indexfieldmap'}->{$name};
        if (!defined $shortname) {
            $shortname = $self->create_shortname($name);
            $self->{'indexfieldmap'}->{$name} = $shortname;
            $self->{'indexfieldmap'}->{$shortname} = 1;
        }
        return $shortname;
    };

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;
    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            # If allfields is requested add all metadata fields and text as
            # belonging to the ZZ field
            if ($real_field eq "allfields") {
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                my $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                &ghtml::htmlsafe($tmp_text);
                $new_text .= "$tmp_text</ZZ>$paraendtag\n";
                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    if ($is_indexable->($mfield, $mvalue)) {
                        &ghtml::htmlsafe($mvalue);
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    # every field seen is recorded, even when its value
                    # was filtered out above
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            # metadata - output all metadata we know about except gsdl stuff
            elsif ($real_field eq "metadata") {
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    next unless $is_indexable->($mfield, $mvalue);

                    &ghtml::htmlsafe($mvalue);
                    my $shortname = $shortname_for->($mfield);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                my $shortname = $shortname_for->($real_field);

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
                            # (escaped in a fresh variable so a repeated "text"
                            # entry cannot re-escape earlier output)
                            &ghtml::htmlsafe($section_text);
                            $new_text .= $section_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    # escape the value, as the metadata/allfields branches
                    # do, so that '&' or '<' cannot break the XML
                    &ghtml::htmlsafe($item);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }
            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
}
285
# We make this builder pretend to be a document processor so we can get
# information back from the plugins.
#
# Arguments:
#   $doc_obj - the document object to process
#   $file    - the name of the file the document came from, as a string
#
# In any mode other than "incinfodb" the call is passed straight through to
# mgppbuildproc::process.  In "incinfodb" mode each section is given the
# metadata needed for the database (doctype, hastxt, archivedir, thistype,
# childtype, contains, docnum) and the document is then handed to
# IncrementalBuildUtils::addDocument.
#
# Original author: John Thompson, DL Consulting Ltd
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we
    # want to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb") {
        return $self->mgppbuildproc::process(@_);
    }

    print STDERR "* Processing a document added using INCINFODB *\n";

    # The archive directory is everything before the last path separator
    # (either '/' or '\'), normalised to forward slashes with leading and
    # trailing slashes removed.  Empty if $file contains no separator.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    # NOTE(review): this prints get_assoc_files() evaluated in scalar
    # context; whether that is a count depends on what that method
    # returns - confirm against the document class.
    print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n";

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0) {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n";
    my $doc_OID = $doc_obj->get_OID(); # currently unused
    while (defined $section) {
        print STDERR "+ processing section: '$section'\n";
        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            # record the default type, not the missing/blank value
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $hastxt = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $hastxt);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children; each entry is a '"' followed by
        # ".N", where N is the child's trailing number when it has one
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children) {
                if ($child =~ /^.*?\.(\d+)$/) {
                    push (@contains, "\".$1");
                }
                else {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
    return &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
}
398	# / process() /
399
400
401	# Following methods seem to be no different to those defined in basebuildproc.pm
402	# From inspection, it looks like these ones can be removed
403
404
# Accessor for the 'num_docs' counter held on the object.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
410
# Accessor for the 'num_sections' counter held on the object.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
416
# Accessor for the 'num_bytes' counter held on the object.
# num_bytes is the actual number of bytes in the collection;
# this is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
424
425
426	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
427	# Otherwise the removal of tags below might lead to Lucene turning
428	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
429	# (example from demo collection b20cre)
430	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
# Strip HTML markup from a piece of text so it can be embedded in the XML
# passed to Lucene.
#
# Arguments:
#   $text       - the text to clean
#   $strip_html - must be true; when false nothing is returned
#   $para       - replacement handed to process_tags() for tag handling;
#                 when it is the empty string every tag simply becomes a
#                 single space
#
# Returns the cleaned text, or nothing (undef in scalar context) when
# $strip_html is false.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags (replaced by a space so that adjacent
        # words are not run together)
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text
    # passed to Lucene for indexing may not be valid XML (eg. if HTML-only
    # entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities
    # (which are valid XML).  A lookahead is used so that the character
    # after the '&' is not consumed: this way "&&" and an '&' at the very
    # end of the text are removed as well.
    $new_text =~ s/&(?!\#(?:[0-9]+|x[0-9a-fA-F]+);)/ /g;

    return $new_text;
}
461
462
463	1;
464

Note: See TracBrowser for help on using the repository browser.

Download in other formats: