Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago
changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc
Property svn:keywords set to `Author Date Id Revision`
File size: 16.3 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Declare the inheritance chain at compile time: this builder specialises
# mgppbuildproc, so any method not overridden here resolves there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments after
# the class name are forwarded unchanged), initialises the counter used for
# numbering incrementally added documents, and re-blesses the result into
# the requested class.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Direct method call instead of indirect-object syntax ("new Class (...)"),
    # which Perl can mis-parse depending on what names are in scope.
    my $self = mgppbuildproc->new(@_);

    # 0 means "no document has been added incrementally yet"; process()
    # replaces it with a real starting number on first use.
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Capability flag: always returns 1, because (unlike MG and MGPP) Lucene
# supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;

    my $incremental_supported = 1;
    return $incremental_supported;
}
63
64
# Emit the XML for one document to the handle in $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj - document object; only documents whose get_doc_type() is
#              "indexed_doc" are written, anything else returns immediately.
#   $file    - file name, copied into the file="" attribute of the document tag.
#
# The document is wrapped in the tag named by
# $mgppbuildproc::level_map{'document'}; when section level is enabled in
# $self->{'levels'} each section gets its own tag. For every section, each
# ';'-separated field of the first ':'-separated part of $self->{'index'} is
# turned into tagged text:
#   allfields - section text plus all indexable metadata, tagged <ZZ>
#   metadata  - each indexable metadata value, tagged with its own shortname
#   other     - a comma separated list of "text" and/or metadata names, all
#               tagged with one shortname derived from the whole list
# A field prefixed with "top" is only emitted for the first section.
#
# Side effects: updates num_docs, num_sections, num_bytes,
# num_processed_bytes, indexfields and indexfieldmap on $self.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};

    # The id written on the document tag follows the database level: the
    # running document count, or (default, section level) the number the
    # document's first section is about to receive.
    my $gs2_id = "";
    if ($ldoc_level) {
        if ($self->{'db_level'} eq 'document') {
            $gs2_id = $self->{'num_docs'};
        }
        else {
            # default is section level
            $gs2_id = $self->{'num_sections'} + 1;
        }
    }
    # NOTE(review): $file is interpolated into an XML attribute unescaped;
    # a name containing '"' or '&' would presumably yield malformed XML - confirm.
    my $gs2_docOID = $doc_obj->get_OID();
    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my ($parastarttag) = "";
    my ($paraendtag) = "";
    if ($levels->{'paragraph'}) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;
    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sec_tag_name ne "") {
            my $sec_gs2_id = $self->{'num_sections'};
            my $sec_gs2_docOID = $gs2_docOID . "." . $section;
            $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\">\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            if ($real_field eq "allfields") {
                # If allfields is requested add all metadata fields and text
                # as belonging to the ZZ field.
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                my $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                # '&' call form kept from the original so any prototype on
                # ghtml::htmlsafe stays bypassed exactly as before.
                &ghtml::htmlsafe($tmp_text);
                $new_text .= "$tmp_text</ZZ>$paraendtag\n";
                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if (_lucene_is_indexable_metadata($mfield, $mvalue)) {
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    # Every field seen is recorded here, filtered or not
                    # (unchanged from the original).
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            elsif ($real_field eq "metadata") {
                # metadata - output all metadata we know about except gsdl stuff
                # (the former '|| eq "allfields"' test here could never be
                # reached, as the branch above already handles allfields)
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    next unless _lucene_is_indexable_metadata($mfield, $mvalue);

                    my $shortname = $self->_lucene_shortname_for($mfield);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a
                # comma separated list; the whole list shares one shortname
                my $shortname = $self->_lucene_shortname_for($real_field);

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment.
                            # Escape a copy of the text fetched above: the
                            # original fetched it a second time and appended to
                            # an accumulator, so a repeated "text" entry
                            # re-escaped and duplicated earlier output.
                            my $escaped_text = $section_text;
                            &ghtml::htmlsafe($escaped_text);
                            $new_text .= $escaped_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # For non-top sections, optionally inherit the
                        # document-level values of this metadata field.
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }

            # filter the text
            $new_text = $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
}

# Private helper for text(): true when a metadata pair should be written to
# the index, i.e. the field is not Identifier, classifytype, assocfilepath
# or a gsdl* field, and the value is defined and non-empty.
sub _lucene_is_indexable_metadata {
    my ($mfield, $mvalue) = @_;

    return ($mfield ne "Identifier"
            && $mfield !~ /^gsdl/
            && $mfield ne "classifytype"
            && $mfield ne "assocfilepath"
            && defined $mvalue && $mvalue ne "");
}

# Private helper for text(): returns the shortname recorded for $field in
# $self->{'indexfieldmap'}; if none is recorded yet, creates one with
# create_shortname() and records both the field->shortname mapping and the
# shortname itself.
sub _lucene_shortname_for {
    my ($self, $field) = @_;

    my $shortname = $self->{'indexfieldmap'}->{$field};
    return $shortname if defined $shortname;

    $shortname = $self->create_shortname($field);
    $self->{'indexfieldmap'}->{$field} = $shortname;
    $self->{'indexfieldmap'}->{$shortname} = 1;
    return $shortname;
}
285
286	# /** We make this builder pretend to be a document processor so we can get
287	# * information back from the plugins.
288	# *
289	# * @param $self A reference to this Lucene builder
290	# * @param $doc_obj A reference to a document object representing what was
291	# * parsed by the GAPlug
292	# * @param $file The name of the file parsed as a string
293	# *
294	# * @author John Thompson, DL Consulting Ltd
295	# */
# Process one document handed over by the plugins.
#
# Parameters:
#   $doc_obj - the document object
#   $file    - the name of the parsed file, as a string
#
# When get_mode() is "incinfodb" the document is being added incrementally:
# doctype, hastxt, archivedir, thistype, childtype, contains and docnum
# metadata are attached section by section and the document is passed to
# IncrementalBuildUtils::addDocument(). In every other mode the call is
# passed straight to mgppbuildproc::process().
#
# Declared without a prototype: the former empty prototype "()" was ignored
# on method calls and wrongly suggested the sub takes no arguments.
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        print STDERR "* Processing a document added using INCINFODB *\n";

        # Directory part of $file: everything before the last '/' or '\'.
        # (Previously the pattern read /^(.?)(?:\/\|\\)[^\/\\]$/, which can
        # only match a three or four character string containing a literal
        # '|', so the archive directory was effectively always empty.)
        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = "" unless defined $archivedir;
        # normalise to forward slashes with no leading or trailing slash
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files
        # NOTE(review): if get_assoc_files() returns an array reference this
        # prints the reference rather than a count - confirm against doc.pm.
        print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n";

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n";
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n";

            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin)
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                # Write the default "doc"; the original wrote $dtype back,
                # i.e. the very undefined/blank value that triggered this branch.
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }

            # output whether this node contains text
            if ($doc_obj->get_text_length($section) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
            }
            else
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
            }

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # each entry is a '"' followed by '.' and the child's
                    # trailing number (or the whole child id if it has none)
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }

            # output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only add the docs
            last if ($self->{'db_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
398	# /** process() **/
399
400
401	# Following methods seem to be no different to those defined in basebuildproc.pm
402	# From inspection, it looks like these ones can be removed
403
404
# Accessor: the value stored under 'num_docs' on this object (text() adds
# one to it for every document it writes).
sub get_num_docs {
    my ($self) = @_;

    my $doc_count = $self->{'num_docs'};
    return $doc_count;
}
410
# Accessor: the value stored under 'num_sections' on this object (text()
# increments it once per section visited).
sub get_num_sections {
    my ($self) = @_;

    my $section_count = $self->{'num_sections'};
    return $section_count;
}
416
417	# num_bytes is the actual number of bytes in the collection
418	# this is normally the same as what's processed during text compression
# Accessor: the value stored under 'num_bytes' on this object (text() adds
# the text length of every indexed section to it).
sub get_num_bytes {
    my ($self) = @_;

    my $byte_count = $self->{'num_bytes'};
    return $byte_count;
}
424
425
426	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
427	# Otherwise the removal of tags below might lead to Lucene turning
428	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
429	# (example from demo collection b20cre)
430	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
# Strip HTML markup from a piece of text so it can be embedded in the XML
# passed to Lucene.
#
# Parameters:
#   $text       - the text to clean
#   $strip_html - must be true; when false nothing is returned (paragraph
#                 tagging without stripping would produce non-XML)
#   $para       - replacement handed to process_tags() for tags; when empty
#                 (or undefined) every tag simply becomes a single space
#
# Tags are replaced by a space rather than removed so that words on either
# side of a tag do not run together ("...farming</p>\n<p>EDWARD..." must not
# become "farmingedward").
#
# Returns the cleaned text.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    # treat a missing paragraph replacement like the empty one
    $para = "" unless defined $para;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove named entities because otherwise the text
    # passed to Lucene for indexing may not be valid XML (eg. if HTML-only
    # entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Replace stray '&' characters with a space, except where they start a
    # numeric &#nnnn; or &#xhhhh; entity (which is valid XML). A lookahead is
    # used so that a '&' at the very end of the text and the second '&' of
    # "&&" are caught too; the previous pattern needed to consume a following
    # character and so missed both.
    $new_text =~ s/&(?!\#(?:\d+|x[0-9a-fA-F]+);)/ /g;

    return $new_text;
}
461
462
463	1;
464

Note: See TracBrowser for help on using the repository browser.

Download in other formats: