Context Navigation

source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 23182

Last change on this file since 23182 was 23182, checked in by kjdon, 14 years ago
fixed up bug with deleting assoc files. Was fine for a delete, but for an update, need to delete the old ones before adding the new ones. And also, don't delete the lucene text doc when updating. Presumably it has already been overwritten with the correct version
Property svn:keywords set to `Author Date Id Revision`
File size: 18.5 KB

Rev	Line
[8072]	1	###########################################################################
	2	#
	3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	package lucenebuildproc;
	27
	28	# This document processor outputs a document
	29	# for lucene to process
	30
	31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
	32
[12844]	33	use mgppbuildproc;
[9186]	34	use ghtml;
[12424]	35	use strict;
	36	no strict 'refs'; # allow filehandles to be variables and viceversa
[8072]	37
[12424]	38
[12844]	39	use IncrementalBuildUtils;
	40
# Establish the inheritance chain at compile time: this builder is a
# specialisation of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
	44
	45
	46	sub new {
	47	my $class = shift @_;
	48	my $self = new mgppbuildproc (@_);
	49
[12844]	50	$self->{'numincdocs'} = 0;
	51
[8072]	52	return bless $self, $class;
	53	}
	54
[10304]	55
# Capability query: always answers true (1), because Lucene -- unlike MG and
# MGPP -- supports incremental building.
sub is_incremental_capable
{
    my ($self) = @_;
    return 1;
}
	63
	64
# Serialise one document as XML onto the Lucene output handle
# ($self->{'output_handle'}) and keep the builder's statistics up to date.
#
# @param $doc_obj    the document object to serialise
# @param $file       filename written into the document tag's file attribute
# @param $edit_mode  "add", "update" or "delete"; any value other than
#                    add/update is accounted for as a deletion
#
# Nothing is written when the document's type is not "indexed_doc", or when
# get_indexing_text() is false and the mode is "delete".
#
# For add/update the counters num_docs, num_bytes and num_processed_bytes
# grow; otherwise they shrink. num_sections is incremented for every section
# visited regardless of mode. In delete mode the document and section tags
# are still written, but the per-field text is only built so its length can
# be subtracted -- it is not emitted.
#
# The old gs2:id attribute (already commented out of the emitted tags) and
# the locals that only fed it have been removed.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # add and update both contribute text; everything else removes it
    my $is_addition = (($edit_mode eq "add") || ($edit_mode eq "update")) ? 1 : 0;

    # this is another document
    if ($is_addition) {
        $self->{'num_docs'} += 1;
    }
    else {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = defined ($fields) ? split (/;/, $fields) : ();

    # The index specification is the same for every section, so work out
    # once whether the special 'allfields' and 'metadata' indexes are wanted.
    my $allfields_index = 0;
    my $all_metadata_specified = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    # a section tag is only written when a section level has been configured
    my $sec_tag_name = "";
    if ($self->{'levels'}->{'section'}) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $gs2_docOID = $doc_obj->get_OID();
    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $doc_section = 0; # just for this document

    my $text = $documenttag;

    # Account for one chunk of processed text: when adding, count its length
    # and append it to the output; when deleting, only subtract its length.
    my $record_processed = sub {
        my ($chunk) = @_;
        if ($is_addition) {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= "$chunk";
        }
        else {
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section)
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n";
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        if ($sec_tag_name ne "") {
            $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        if ($is_addition) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};
        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        # optionally let sub-sections inherit the top section's metadata
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing to add for this index
            next unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";
            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }
            foreach my $item (@metadata_list) {
                &ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
            }
            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $record_processed->($new_text);

            if ($self->{'indexing_text'} && $new_field) {
                # we need to add to the list in indexfields
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }

        } # foreach field

        # has the user added a 'metadata' index?
        if ($all_metadata_specified) {
            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($specified_fields->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $record_processed->($new_text);
        }

        # do we have an allfields index??
        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $record_processed->($new_text);
        }

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print {$lucenehandle} "$text\n$documentendtag";
}
	366
# Add a document to the index: delegates to textedit() in "add" mode.
#
# @param $doc_obj  the document object to write out
# @param $file     the filename associated with the document
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
	373
	374	sub textreindex
	375	{
	376	my $self = shift (@_);
	377	my ($doc_obj,$file) = @_;
	378
[18471]	379	$self->textedit($doc_obj,$file,"update");
[18456]	380	}
	381
	382	sub textdelete
	383	{
	384	my $self = shift (@_);
	385	my ($doc_obj,$file) = @_;
	386
	387	$self->textedit($doc_obj,$file,"delete");
	388	}
	389
	390
	391
	392
	393
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In "incinfodb" mode (as reported by get_mode()) every section of the
#  *  document is decorated with the bookkeeping metadata doctype, hastxt,
#  *  childtype, contains and docnum (plus archivedir and thistype on the top
#  *  section), and the document is then handed to
#  *  IncrementalBuildUtils::addDocument(). In any other mode the call is
#  *  passed straight through to mgppbuildproc::process().
#  *
#  *  @param $self    A reference to this Lucene builder
#  *  @param $doc_obj A reference to a document object representing what was
#  *                  parsed by the GAPlug
#  *  @param $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb")
    {
        return $self->mgppbuildproc::process(@_);
    }

    my $verbose = ($self->{'verbosity'} > 3);

    print STDERR "* Processing a document added using INCINFODB *\n" if ($verbose);

    # The archive directory is everything before the last path separator
    # (either / or \) of $file, normalised to forward slashes with no
    # leading or trailing separators.
    my ($archivedir) = $file =~ m{^(.*?)[/\\][^/\\]*$};
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($verbose);

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
    {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if ($verbose);
    while (defined $section)
    {
        print STDERR "+ processing section: '$section'\n" if ($verbose);

        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
        {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        if ($doc_obj->get_text_length($section) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 1);
        }
        else
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 0);
        }

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
        {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children)
            {
                # each entry is written as ". followed by the last numeric
                # component of the child id, or by the whole id if it has none
                if ($child =~ /^.*?\.(\d+)$/)
                {
                    push (@contains, "\".$1");
                }
                else
                {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($verbose);
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($verbose);
    &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
	508
[14934]	509
	510	# Following methods seem to be no different to those defined in basebuildproc.pm
	511	# From inspection, it looks like these ones can be removed
	512
	513
# Accessor for the running document count kept in $self->{'num_docs'}.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
	519
	520	sub get_num_sections {
	521	my $self = shift (@_);
	522	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
	523	return $self->{'num_sections'};
	524	}
	525
	526	# num_bytes is the actual number of bytes in the collection
	527	# this is normally the same as what's processed during text compression
	528	sub get_num_bytes {
	529	my $self = shift (@_);
	530	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
	531	return $self->{'num_bytes'};
	532	}
	533
[14068]	534
	535	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
	536	# Otherwise the removal of tags below might lead to Lucene turning
	537	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
	538	# (example from demo collection b20cre)
	539	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
	540	sub preprocess_text
	541	{
	542	my $self = shift (@_);
	543	my ($text, $strip_html, $para) = @_;
	544	# at this stage, we do not do paragraph tags unless have strip_html -
	545	# it will result in a huge mess of non-xml
	546	return unless $strip_html;
	547
	548	my $new_text = $text;
	549
	550	# if we have <pre> tags, we can have < > inside them, need to delete
	551	# the <> before stripping tags
	552	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
	553
	554	if ($para eq "") {
	555	# just remove all tags
	556	$new_text =~ s/<[^>]*>/ /gs;
	557	} else {
	558	# strip all tags except <p> tags which get turned into $para
	559	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
	560	}
	561
[14923]	562	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
	563	# may not be valid XML (eg. if HTML-only entities like   are used)
	564	$new_text =~ s/&\w{1,10};//g;
	565	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
	566	$new_text =~ s/&([^\#])/ $1/g;
	567
[14068]	568	return $new_text;
	569	}
	570
# Remove a document's associated files by delegating to basebuildproc's
# implementation, and -- only when the document itself is being deleted --
# also remove its Lucene text directory, <build_dir>/text/<archivedir>,
# if that directory exists.
#
# @param $archivedir  the document's archive directory
# @param $edit_mode   the edit mode; only "delete" removes the text version
sub delete_assoc_files
{
    my ($self, @args) = @_;
    my ($archivedir, $edit_mode) = @args;

    $self->basebuildproc::delete_assoc_files(@args);

    if ($edit_mode eq "delete") {
        # if we are deleting the doc, then also delete the lucene text version
        my $lucene_text_dir = &util::filename_cat($self->{'build_dir'}, "text", $archivedir);
        &util::rm_r($lucene_text_dir) if (-d $lucene_text_dir);
    }
}
[23176]	586
[8072]	587	1;
	588
[18456]	589

Note: See TracBrowser for help on using the repository browser.

Download in other formats: