Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 16431

Last change on this file since 16431 was 16431, checked in by mdewsnip, 16 years ago
Now passes the Greenstone document OID into GS2LuceneIndexer, to help support incremental building.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.2 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Register mgppbuildproc as the parent class at compile time, so that
# inherited methods resolve before any code in this package runs.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# passed through unchanged), initialises the incremental document counter
# and re-blesses the result into the requested class.
#
# Parameters:
#   $class - class name to bless the new object into
#   @_     - remaining arguments, forwarded to mgppbuildproc's constructor
#
# Returns: the new object, blessed into $class.
sub new {
    my $class = shift @_;

    # Explicit class-method call: the indirect-object form
    # ("new mgppbuildproc (...)") is parsed by heuristic, which is fragile
    # inside a sub that is itself called "new".
    my $self = 'mgppbuildproc'->new(@_);

    # Next docnum to hand out during incremental building; 0 means
    # "not yet initialised" (see process()).
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Report whether this build processor supports incremental building.
# Always returns 1: unlike MG and MGPP, Lucene supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;
    return 1;
}
63
64
# Write one document, as XML, to the indexer's input handle.
#
# Parameters:
#   $doc_obj - document object; nothing is output unless its
#              get_doc_type() is "indexed_doc"
#   $file    - file name recorded in the "file" attribute of the document tag
#
# Side effects:
#   - increments the num_docs, num_sections, num_bytes and
#     num_processed_bytes counters on $self
#   - records newly seen fields in $self->{'indexfields'} and
#     $self->{'indexfieldmap'}
#   - prints the generated XML to $self->{'output_handle'}
#
# NOTE(review): $file and the document OID are interpolated into XML
# attributes without escaping -- confirm they can never contain '&', '<'
# or '"'.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    # The gs2:id of the document tag follows the database level: the
    # document count, or the number of the document's first section.
    my $gs2_id = "";
    if ($ldoc_level) {
        if ($self->{'db_level'} eq 'document') {
            $gs2_id = $self->{'num_docs'};
        }
        else {
            # default is section level
            $gs2_id = $self->{'num_sections'} + 1;
        }
    }
    my $gs2_docOID = $doc_obj->get_OID();
    my $documenttag = "<$doc_level xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    my $documentendtag = "\n</$doc_level>\n";

    my $sectiontag = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            # If allfields is requested add all metadata fields and text as
            # belonging to the ZZ field
            if ($real_field eq "allfields") {
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                my $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                &ghtml::htmlsafe($tmp_text);
                $new_text .= "$tmp_text</ZZ>$paraendtag\n";

                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    # only escape values that exist; undefined ones are
                    # rejected by the test below anyway
                    &ghtml::htmlsafe($mvalue) if (defined $mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            # metadata - output all metadata we know about except gsdl stuff
            # ("allfields" can never reach this branch, so it is not tested)
            elsif ($real_field eq "metadata") {
                my $shortname = "";
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue) if (defined $mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {

                        if (defined $self->{'indexfieldmap'}->{$mfield}) {
                            $shortname = $self->{'indexfieldmap'}->{$mfield};
                        }
                        else {
                            $shortname = $self->create_shortname($mfield);
                            $self->{'indexfieldmap'}->{$mfield} = $shortname;
                            $self->{'indexfieldmap'}->{$shortname} = 1;
                        }
                        $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$real_field}) {
                    $shortname = $self->{'indexfieldmap'}->{$real_field};
                }
                else {
                    $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment.
                            # A fresh copy is escaped each time, so nothing is
                            # escaped twice if "text" is listed more than once.
                            my $escaped_text = $section_text;
                            &ghtml::htmlsafe($escaped_text);
                            $new_text .= $escaped_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # optionally let subsections inherit the top section's metadata
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }

            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
}
284
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  When the current mode (get_mode()) is "incinfodb", per-section
#  *  metadata (doctype, hastxt, archivedir, thistype, childtype, contains,
#  *  docnum) is attached to the document and the document is handed to
#  *  IncrementalBuildUtils::addDocument(). In every other mode the call is
#  *  passed straight through to mgppbuildproc::process().
#  *
#  *  @param  $self     A reference to this Lucene builder
#  *  @param  $doc_obj  A reference to a document object representing what was
#  *                    parsed by the GAPlug
#  *  @param  $file     The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb")
    {
        return $self->mgppbuildproc::process(@_);
    }

    print STDERR "* Processing a document added using INCINFODB *\n";

    # The archive directory is everything before the last path separator
    # (either / or \), normalised to forward slashes with no leading or
    # trailing separators.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files.
    # NOTE(review): get_assoc_files() presumably returns an array reference;
    # count its elements if so, otherwise report the scalar it returned.
    my $assoc_files = $doc_obj->get_assoc_files();
    my $num_assoc_files = (ref($assoc_files) eq 'ARRAY') ? scalar(@$assoc_files) : $assoc_files;
    $num_assoc_files = 0 unless defined $num_assoc_files;
    print STDERR "There are " . $num_assoc_files . " associated documents...\n";

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
    {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n";
    while (defined $section)
    {
        print STDERR "+ processing section: '$section'\n";

        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
        {
            # default to "doc"; re-adding the missing/blank value would
            # leave the doctype unset
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        if ($doc_obj->get_text_length($section) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 1);
        }
        else
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 0);
        }

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
        {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children; each entry is a double quote followed
        # by ".N" (the child's trailing number) or by "." and the child id
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children)
            {
                if ($child =~ /^.*?\.(\d+)$/)
                {
                    push (@contains, "\".$1");
                }
                else
                {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
    return &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
398
399
400	# Following methods seem to be no different to those defined in basebuildproc.pm
401	# From inspection, it looks like these ones can be removed
402
403
# Accessor: returns the value stored under 'num_docs' on this object.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
409
# Accessor: returns the value stored under 'num_sections' on this object.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
415
# Accessor: returns the value stored under 'num_bytes' on this object.
# num_bytes is the actual number of bytes in the collection; this is
# normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
423
424
# Strip HTML markup from a piece of text so that it can be embedded in the
# XML passed to Lucene.
#
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces.
# Otherwise the removal of tags below might lead to Lucene turning
# "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# Parameters:
#   $text       - the text to clean
#   $strip_html - must be true; when false nothing is returned
#   $para       - replacement passed to process_tags() for paragraph tags,
#                 or "" to turn every tag into a single space
#
# Returns: the cleaned text, or nothing (undef in scalar context) when
# $strip_html is false.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove named entities because otherwise the text
    # passed to Lucene for indexing may not be valid XML (eg. if HTML-only
    # entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Replace stray '&' characters with a space, except where they start a
    # well-formed &#nnnn; or &#xhhhh; reference (which is valid XML). The
    # lookahead consumes nothing, so doubled ("&&") and trailing ampersands
    # are caught as well.
    $new_text =~ s/&(?!\#(?:[0-9]+|x[0-9a-fA-F]+);)/ /g;

    return $new_text;
}
460
461
462	1;
463

Note: See TracBrowser for help on using the repository browser.

Download in other formats: