Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 16247

Last change on this file since 16247 was 15687, checked in by mdewsnip, 16 years ago
Removed a couple of references to gdbm.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.2 KB

Rev	Line
[8072]	1	###########################################################################
	2	#
	3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	package lucenebuildproc;
	27
	28	# This document processor outputs a document
	29	# for lucene to process
	30
	31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
	32
[12844]	33	use mgppbuildproc;
[9186]	34	use ghtml;
[12424]	35	use strict;
	36	no strict 'refs'; # allow filehandles to be variables and viceversa
[8072]	37
[12424]	38
[12844]	39	use IncrementalBuildUtils;
	40
[8072]	41	sub BEGIN {
[8716]	42	@lucenebuildproc::ISA = ('mgppbuildproc');
[8072]	43	}
	44
	45
	46	sub new {
	47	my $class = shift @_;
	48	my $self = new mgppbuildproc (@_);
	49
[12844]	50	$self->{'numincdocs'} = 0;
	51
[8072]	52	return bless $self, $class;
	53	}
	54
[10304]	55
[10419]	56	sub is_incremental_capable
[10304]	57	{
	58	my $self = shift (@_);
	59
	60	# Unlike MG and MGPP, Lucene supports incremental building
	61	return 1;
	62	}
	63
	64
[8072]	65	sub text {
	66	my $self = shift (@_);
	67	my ($doc_obj,$file) = @_;
	68	my $handle = $self->{'output_handle'};
	69	my $outhandle = $self->{'outhandle'};
	70
	71	# only output this document if it is one to be indexed
	72	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	73
[10961]	74	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
[8072]	75
	76	# this is another document
	77	$self->{'num_docs'} += 1;
	78
	79	# get the parameters for the output
	80	# split on : just in case there is subcoll and lang stuff
	81	my ($fields) = split (/:/, $self->{'index'});
	82
	83	my $doc_level = $mgppbuildproc::level_map{'document'};
	84	my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';
	85
	86	my $levels = $self->{'levels'};
	87	my $ldoc_level = $levels->{'document'};
	88	my $lsec_level = $levels->{'section'};
	89	my $lpar_level = $levels->{'paragraph'};
	90
	91	my $docid="";
[12844]	92	if ($ldoc_level)
	93	{
[15685]	94	if ($self->{'db_level'} eq 'document')
[12844]	95	{
	96	my $doc_sec_num = $self->{'num_docs'};
[9214]	97	$docid = "gs2:id=\"$doc_sec_num\"";
[12844]	98	}
	99	else
	100	{
[9214]	101	# default is section level
[12844]	102	my $doc_sec_num = $self->{'num_sections'} + 1;
[9186]	103	$docid = "gs2:id=\"$doc_sec_num\"";
[12844]	104	}
	105	}
[8072]	106	my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
	107	my $documentendtag = "\n</$doc_level>\n";
[12844]	108
	109	my ($sectiontag) = "";
	110	if ($lsec_level)
	111	{
[8072]	112	$sectiontag = $mgppbuildproc::level_map{'section'};
[12844]	113	}
[9178]	114	my ($parastarttag) = "";
	115	my ($paraendtag) = "";
[12844]	116	if ($self->{'levels'}->{'paragraph'})
	117	{
	118	if ($self->{'strip_html'})
	119	{
[9178]	120	$parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
	121	$paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
[12844]	122	}
	123	else
	124	{
[8072]	125	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
[12844]	126	}
	127	}
	128
[8072]	129	my $doc_section = 0; # just for this document
	130
	131	my $text = "";
	132	$text .= $documenttag;
	133	# get the text for this document
	134	my $section = $doc_obj->get_top_section();
[12844]	135	while (defined $section)
	136	{
[8072]	137	# update a few statistics
	138	$doc_section++;
[12844]	139	$self->{'num_sections'}++;
	140
	141	if ($sectiontag ne "")
	142	{
	143	my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
[8072]	144	$text .= "\n<$sectiontag $secid >\n";
[12844]	145	}
[8072]	146
[9178]	147	# if we are doing subcollections, then some docs shouldn't be indexed.
[12844]	148	# but we need to put the section tag placeholders in there so the
[15687]	149	# sections match up with database
[12274]	150	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
[12951]	151	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
[10961]	152	$text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
[12844]	153	$section = $doc_obj->get_next_section($section);
[10961]	154	next;
[12844]	155	}
	156
[10961]	157	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
[12844]	158	foreach my $field (split (/;/, $fields))
	159	{
[10961]	160	# only deal with this field if it doesn't start with top or
	161	# this is the first section
	162	my $real_field = $field;
	163	next if (($real_field =~ s/^top//) && ($doc_section != 1));
	164
[12844]	165	my $new_text = "";
	166	my $tmp_text = "";
[10961]	167
[12844]	168	# If allfields is requested add all metadata fields and text as
	169	# belonging to the ZZ field
	170	if ($real_field eq "allfields") {
	171	# Text first - no html nor paragraph tags
	172	$new_text .= "$parastarttag<ZZ index=\"1\">\n";
	173	$tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
	174	&ghtml::htmlsafe($tmp_text);
	175	$new_text .= "$tmp_text</ZZ>$paraendtag\n";
	176	# Then Metadata
	177	my $metadata = $doc_obj->get_all_metadata ($section);
	178	foreach my $pair (@$metadata) {
	179	my ($mfield, $mvalue) = (@$pair);
	180	&ghtml::htmlsafe($mvalue);
	181	# check fields here, maybe others dont want - change to use dontindex!!
	182	if ($mfield ne "Identifier"
	183	&& $mfield !~ /^gsdl/
	184	&& $mfield ne "classifytype"
	185	&& $mfield ne "assocfilepath"
	186	&& defined $mvalue && $mvalue ne "") {
	187	$new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
	188	}
	189	if (!defined $self->{'indexfields'}->{$mfield}) {
	190	$self->{'indexfields'}->{$mfield} = 1;
	191	}
	192	}
	193	}
[10961]	194	# metadata - output all metadata we know about except gsdl stuff
[12844]	195	elsif ($real_field eq "metadata" \|\| $real_field eq "allfields") {
[10961]	196	my $shortname = "";
	197	my $metadata = $doc_obj->get_all_metadata ($section);
[12424]	198	foreach my $pair (@$metadata) {
[10961]	199	my ($mfield, $mvalue) = (@$pair);
[12844]	200	&ghtml::htmlsafe($mvalue);
[10961]	201	# check fields here, maybe others dont want - change to use dontindex!!
	202	if ($mfield ne "Identifier"
	203	&& $mfield !~ /^gsdl/
	204	&& $mfield ne "classifytype"
	205	&& $mfield ne "assocfilepath"
	206	&& defined $mvalue && $mvalue ne "") {
[12844]	207
[10961]	208	if (defined $self->{'indexfieldmap'}->{$mfield}) {
	209	$shortname = $self->{'indexfieldmap'}->{$mfield};
	210	}
	211	else {
	212	$shortname = $self->create_shortname($mfield);
	213	$self->{'indexfieldmap'}->{$mfield} = $shortname;
	214	$self->{'indexfieldmap'}->{$shortname} = 1;
[12844]	215	}
[10961]	216	$new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
	217	if (!defined $self->{'indexfields'}->{$mfield}) {
	218	$self->{'indexfields'}->{$mfield} = 1;
[12844]	219	}
[10961]	220	}
	221	}
	222	}
[12844]	223	else {
[10961]	224	#individual metadata and or text specified - could be a comma separated list
	225	my $shortname="";
	226	if (defined $self->{'indexfieldmap'}->{$real_field}) {
	227	$shortname = $self->{'indexfieldmap'}->{$real_field};
	228	}
	229	else {
	230	$shortname = $self->create_shortname($real_field);
	231	$self->{'indexfieldmap'}->{$real_field} = $shortname;
	232	$self->{'indexfieldmap'}->{$shortname} = 1;
	233	}
[12844]	234
[10961]	235	my @metadata_list = ();
[12424]	236	foreach my $submeta (split /,/, $real_field) {
[10961]	237	if ($submeta eq "text") {
[12424]	238	my $section_text = $doc_obj->get_text($section);
	239	if ($self->{'indexing_text'}) {
[12426]	240	# tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
[10961]	241	$new_text .= "$parastarttag<$shortname index=\"1\">\n";
	242	if ($parastarttag ne "") {
[12426]	243	$section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
[12424]	244	}
	245	else {
[9178]	246	# we don't want to individually tag each paragraph if not doing para indexing
[12426]	247	$section_text = $self->preprocess_text($section_text, 1, "");
[12424]	248	}
	249	$new_text .= "$section_text</$shortname>$paraendtag\n";
[8072]	250	}
[12844]	251	else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
	252	$tmp_text .= $doc_obj->get_text ($section);
	253	&ghtml::htmlsafe($tmp_text);
	254	$new_text .= $tmp_text;
[8072]	255	}
[10961]	256	}
	257	else {
	258	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
[12371]	259	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
[10961]	260	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
	261	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
[8072]	262	}
	263	}
[10961]	264	push (@metadata_list, @section_metadata);
[8072]	265	}
	266	}
[10961]	267	foreach my $item (@metadata_list) {
	268	$new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
	269	}
[8072]	270	}
[10961]	271	# filter the text
	272	$self->filter_text ($field, $new_text);
	273	$self->{'num_processed_bytes'} += length ($new_text);
[12844]	274
[10961]	275	$text .= "$new_text";
	276	} # foreach field
[12844]	277
[8072]	278	$text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
	279
[12844]	280	$section = $doc_obj->get_next_section($section);
[8072]	281	} #while defined section
[12844]	282	print $handle "$text\n$documentendtag";
[10961]	283	#print STDOUT "$text\n$documentendtag";
[8072]	284	}
	285
[12844]	286	# /** We make this builder pretend to be a document processor so we can get
	287	# * information back from the plugins.
	288	# *
	289	# * @param $self A reference to this Lucene builder
	290	# * @param $doc_obj A reference to a document object representing what was
	291	# * parsed by the GAPlug
	292	# * @param $file The name of the file parsed as a string
	293	# *
	294	# * @author John Thompson, DL Consulting Ltd
	295	# */
	296	sub process()
	297	{
	298	my $self = shift (@_);
	299	my ($doc_obj, $file) = @_;
	300
	301	# If this is called from any stage other than an incremental infodb we want
	302	# to pass through to the superclass of build
	303	if ($self->get_mode() eq "incinfodb")
	304	{
	305	print STDERR "* Processing a document added using INCINFODB *\n";
	306	my ($archivedir) = $file =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
	307	$archivedir = "" unless defined $archivedir;
	308	$archivedir =~ s/\\/\//g;
	309	$archivedir =~ s/^\/+//;
	310	$archivedir =~ s/\/+$//;
	311
	312	# Number of files
	313	print STDERR "There are " . scalar($doc_obj->get_assoc_files()) . " associated documents...\n";
	314
	315	# resolve the final filenames of the files associated with this document
	316	$self->assoc_files ($doc_obj, $archivedir);
	317
	318	# is this a paged or a hierarchical document
	319	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
	320
	321	# Determine the actual docnum by checking if we've processed any
	322	# previous incrementally added documents. If so, carry on from there.
	323	# Otherwise we set the counter to be the same as the number of
	324	# sections encountered during the previous build
	325	if ($self->{'numincdocs'} == 0)
	326	{
	327	$self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
	328	}
	329
	330	my $section = $doc_obj->get_top_section ();
	331	print STDERR "+ top section: '$section'\n";
	332	my $doc_OID = $doc_obj->get_OID();
	333	my $url = "";
	334	while (defined $section)
	335	{
	336	print STDERR "+ processing section: '$section'\n";
	337	# Attach all the other metadata to this document
	338	# output the fact that this document is a document (unless doctype
	339	# has been set to something else from within a plugin
	340	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
	341	if (!defined $dtype \|\| $dtype !~ /\w/)
	342	{
	343	$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
	344	}
	345	# output whether this node contains text
	346	if ($doc_obj->get_text_length($section) > 0)
	347	{
	348	$doc_obj->add_utf8_metadata($section, "hastxt", 1);
	349	}
	350	else
	351	{
	352	$doc_obj->add_utf8_metadata($section, "hastxt", 0);
	353	}
	354
	355	# output archivedir if at top level
	356	if ($section eq $doc_obj->get_top_section())
	357	{
	358	$doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
	359	$doc_obj->add_utf8_metadata($section, "thistype", $thistype);
	360	}
	361
	362	# output a list of children
	363	my $children = $doc_obj->get_children ($section);
	364	if (scalar(@$children) > 0)
	365	{
	366	$doc_obj->add_utf8_metadata($section, "childtype", $childtype);
	367	my @contains = ();
	368	foreach my $child (@$children)
	369	{
	370	if ($child =~ /^.*?\.(\d+)$/)
	371	{
	372	push (@contains, "\".$1");
	373	}
	374	else
	375	{
	376	push (@contains, "\".$child");
	377	}
	378	}
	379	$doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
	380	}
	381	#output the matching doc number
	382	print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
	383	$doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
	384
	385	$self->{'numincdocs'}++;
	386	$section = $doc_obj->get_next_section($section);
[15687]	387	# if no sections wanted, only add the docs
[15685]	388	last if ($self->{'db_level'} eq "document");
[12844]	389	}
	390	print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
	391	&IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
	392	}
	393	else
	394	{
	395	$self->mgppbuildproc::process(@_);
	396	}
	397	}
	398	# / process() /
	399
[14934]	400
	401	# Following methods seem to be no different to those defined in basebuildproc.pm
	402	# From inspection, it looks like these ones can be removed
	403
	404
[12844]	405	sub get_num_docs {
	406	my $self = shift (@_);
	407	#rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
	408	return $self->{'num_docs'};
	409	}
	410
	411	sub get_num_sections {
	412	my $self = shift (@_);
	413	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
	414	return $self->{'num_sections'};
	415	}
	416
	417	# num_bytes is the actual number of bytes in the collection
	418	# this is normally the same as what's processed during text compression
	419	sub get_num_bytes {
	420	my $self = shift (@_);
	421	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
	422	return $self->{'num_bytes'};
	423	}
	424
[14068]	425
	426	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
	427	# Otherwise the removal of tags below might lead to Lucene turning
	428	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
	429	# (example from demo collection b20cre)
	430	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
	431	sub preprocess_text
	432	{
	433	my $self = shift (@_);
	434	my ($text, $strip_html, $para) = @_;
	435	# at this stage, we do not do paragraph tags unless have strip_html -
	436	# it will result in a huge mess of non-xml
	437	return unless $strip_html;
	438
	439	my $new_text = $text;
	440
	441	# if we have <pre> tags, we can have < > inside them, need to delete
	442	# the <> before stripping tags
	443	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
	444
	445	if ($para eq "") {
	446	# just remove all tags
	447	$new_text =~ s/<[^>]*>/ /gs;
	448	} else {
	449	# strip all tags except <p> tags which get turned into $para
	450	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
	451	}
	452
[14923]	453	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
	454	# may not be valid XML (eg. if HTML-only entities like   are used)
	455	$new_text =~ s/&\w{1,10};//g;
	456	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
	457	$new_text =~ s/&([^\#])/ $1/g;
	458
[14068]	459	return $new_text;
	460	}
	461
	462
[8072]	463	1;
	464

Note: See TracBrowser for help on using the repository browser.

Download in other formats: