Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 14934

Last change on this file since 14934 was 14934, checked in by davidb, 16 years ago
Changes to allow statistics to be calculated for metadata coverage, i.e. for each document, which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then aggregated over all the documents and the summary is stored as collection-level metadata.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.2 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Establish the inheritance chain at compile time: lucenebuildproc is a
# subclass of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# passed straight through), adds the Lucene-specific state and re-blesses
# the result into the requested class.
#
# @param $class  the class name to bless the new object into
# @param @_      remaining arguments, forwarded unchanged to mgppbuildproc
# @return        the new build processor object
sub new {
    my $class = shift @_;

    # Direct method-call syntax rather than the indirect-object form
    # ("new mgppbuildproc (...)"), whose parsing depends on which names
    # happen to be in scope.
    my $self = mgppbuildproc->new(@_);

    # Next document number to hand out to incrementally added sections.
    # Zero means "not initialised yet"; process() seeds it from
    # 'starting_num_sections' the first time it is needed.
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Report whether this build processor can build incrementally.
# Lucene can (in contrast to MG and MGPP), so the answer is always true.
sub is_incremental_capable
{
    my ($self) = @_;

    my $supports_incremental = 1;
    return $supports_incremental;
}
63
64
# Write the XML representation of one document to the output handle so that
# Lucene can index it.
#
# The document is wrapped in a document-level tag; each section is optionally
# wrapped in a section-level tag (when section-level indexing is on), and
# every indexed field is emitted as <SHORTNAME index="1">...</SHORTNAME>,
# optionally wrapped in paragraph-level tags.
#
# Side effects on $self: increments 'num_docs' and 'num_sections', adds to
# 'num_bytes' and 'num_processed_bytes', and records the fields seen in
# 'indexfields' / 'indexfieldmap'.
#
# @param $doc_obj  the document object to output
# @param $file     the file name, written into the document tag's file attribute
# @return          nothing useful; returns early (writing nothing) unless the
#                  document's type is "indexed_doc"
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    # The document tag only carries a gs2:id when document-level indexing is
    # on. The id is the document count when the database is keyed by
    # document; otherwise (section level is the default) it is the number of
    # the document's first section.
    my $docid = "";
    if ($ldoc_level) {
        my $doc_sec_num = ($self->{'gdbm_level'} eq 'document')
            ? $self->{'num_docs'}
            : $self->{'num_sections'} + 1;
        $docid = "gs2:id=\"$doc_sec_num\"";
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my ($sectiontag) = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my ($parastarttag) = "";
    my ($paraendtag) = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with gdbm db
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";
            my $tmp_text = "";

            # If allfields is requested add all metadata fields and text as
            # belonging to the ZZ field
            if ($real_field eq "allfields") {
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                &ghtml::htmlsafe($tmp_text);
                $new_text .= "$tmp_text</ZZ>$paraendtag\n";

                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    # every field seen is recorded, even the ones not output
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            # metadata - output all metadata we know about except gsdl stuff
            # ("allfields" has already been handled by the branch above)
            elsif ($real_field eq "metadata") {
                my $shortname = "";
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {

                        # look up (or allocate) the short tag name for this field
                        if (defined $self->{'indexfieldmap'}->{$mfield}) {
                            $shortname = $self->{'indexfieldmap'}->{$mfield};
                        }
                        else {
                            $shortname = $self->create_shortname($mfield);
                            $self->{'indexfieldmap'}->{$mfield} = $shortname;
                            $self->{'indexfieldmap'}->{$shortname} = 1;
                        }
                        $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$real_field}) {
                    $shortname = $self->{'indexfieldmap'}->{$real_field};
                }
                else {
                    $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
                            $tmp_text .= $doc_obj->get_text ($section);
                            &ghtml::htmlsafe($tmp_text);
                            $new_text .= $tmp_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # Sub-sections may also inherit the top section's
                        # values for this field, depending on the
                        # 'sections_index_document_metadata' setting.
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }

            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
285
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In "incinfodb" mode every section of the document is decorated with the
#  *  info-database metadata (doctype, hastxt, archivedir, thistype,
#  *  childtype, contains, docnum) and the document is then handed to
#  *  IncrementalBuildUtils::addDocument. In any other mode the call is
#  *  passed straight through to mgppbuildproc::process.
#  *
#  *  @param  $self     A reference to this Lucene builder
#  *  @param  $doc_obj  A reference to a document object representing what was
#  *                    parsed by the GAPlug
#  *  @param  $file     The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        print STDERR "* Processing a document added using INCINFODB *\n";

        # The archive directory is everything before the last path separator
        # (either / or \) of the file name, normalised to forward slashes and
        # with leading/trailing slashes removed.
        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = "" unless defined $archivedir;
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files
        # NOTE(review): if get_assoc_files() returns an array reference this
        # prints the reference rather than a count - confirm against doc.pm.
        print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n";

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n";
        my $doc_OID = $doc_obj->get_OID();
        my $url = "";
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n";
            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                # Writing $dtype back here would store the undefined/blank
                # value that was just rejected, so store a real default.
                # NOTE(review): "doc" is assumed to be the default doctype
                # used by the other Greenstone build processors - confirm.
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }

            # output whether this node contains text
            if ($doc_obj->get_text_length($section) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
            }
            else
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
            }

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # each entry is written as ".<n>" prefixed by a double
                    # quote; <n> is the trailing number of the child id when
                    # it has one, otherwise the whole child id
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }

            # output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only gdbm the docs
            last if ($self->{'gdbm_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
398	# /** process() **/
399
400
401	# Following methods seem to be no different to those defined in basebuildproc.pm
402	# From inspection, it looks like these ones can be removed
403
404
# Accessor: the number of documents counted so far (the 'num_docs' field).
sub get_num_docs {
    my ($self) = @_;
    my $doc_count = $self->{'num_docs'};
    return $doc_count;
}
410
# Accessor: the number of sections counted so far (the 'num_sections' field).
sub get_num_sections {
    my ($self) = @_;
    my $section_count = $self->{'num_sections'};
    return $section_count;
}
416
# Accessor for 'num_bytes': the actual number of bytes in the collection.
# Normally this matches what is processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    my $byte_count = $self->{'num_bytes'};
    return $byte_count;
}
424
425
# Strip HTML markup from a piece of text before it is handed to Lucene.
#
# Modelled on mgppbuildproc's preprocess_text, except that removed tags are
# replaced by a space. Without that, Lucene could glue neighbouring words
# together, e.g. "...farming</p>\n<p>EDWARD.." becoming "farmingedward"
# (example from demo collection b20cre).
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# @param $text        the raw section text
# @param $strip_html  must be true; when false nothing is returned, because
#                     paragraph tagging without stripping HTML would produce
#                     a huge mess of non-XML
# @param $para        replacement for <p> tags; "" means "drop every tag"
# @return             the cleaned text, or nothing when $strip_html is false
sub preprocess_text
{
    my ($self, $text, $strip_html, $para) = @_;

    if (!$strip_html) {
        return;
    }

    my $cleaned = $text;

    # <pre> sections may contain bare < and > characters; those have to be
    # dealt with before the generic tag stripping below
    $cleaned =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para ne "") {
        # every tag goes through process_tags, which turns <p> into $para
        $cleaned =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }
    else {
        # no paragraph marker wanted: blank out every tag
        $cleaned =~ s/<[^>]*>/ /gs;
    }

    # Named entities have to go, otherwise the text passed to Lucene may not
    # be valid XML (e.g. HTML-only entities such as &nbsp;)
    $cleaned =~ s/&\w{1,10};//g;

    # Replace stray '&' characters with a space, but leave numeric entities
    # (&#nnnn; / &#xhhhh;) alone since they are valid XML
    $cleaned =~ s/&([^\#])/ $1/g;

    return $cleaned;
}
461
462
463	1;
464

Note: See TracBrowser for help on using the repository browser.

Download in other formats: