Context Navigation

source: gsdl/trunk/perllib/lucenebuildproc.pm@ 20426

Last change on this file since 20426 was 20419, checked in by kjdon, 15 years ago
strip off ex. before retrieving metadata for indexing. ex. now valid in collect.cfg
Property svn:keywords set to `Author Date Id Revision`
File size: 17.8 KB

Rev	Line
[8072]	1	###########################################################################
	2	#
	3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	package lucenebuildproc;
	27
	28	# This document processor outputs a document
	29	# for lucene to process
	30
	31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
	32
[12844]	33	use mgppbuildproc;
[9186]	34	use ghtml;
[12424]	35	use strict;
	36	no strict 'refs'; # allow filehandles to be variables and viceversa
[8072]	37
[12424]	38
[12844]	39	use IncrementalBuildUtils;
	40
# Establish the inheritance chain at compile time: this build processor is
# a specialisation of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
	44
	45
	46	sub new {
	47	my $class = shift @_;
	48	my $self = new mgppbuildproc (@_);
	49
[12844]	50	$self->{'numincdocs'} = 0;
	51
[8072]	52	return bless $self, $class;
	53	}
	54
[10304]	55
# Report whether this build processor supports incremental building.
#
# Always returns 1: unlike MG and MGPP, Lucene can be built incrementally.
sub is_incremental_capable
{
    return 1;
}
	63
	64
# Serialise one document as an XML fragment and print it to the Lucene
# indexer's input handle ($self->{'output_handle'}).
#
# Parameters:
#   $doc_obj   - the document object to serialise
#   $file      - source file name, recorded in the document tag's 'file'
#                attribute (XML-escaped)
#   $edit_mode - "add", "update" or "delete"; any value other than "add" or
#                "update" is accounted for as a removal
#
# Nothing is written when the document's type is not "indexed_doc", or when
# text is not being indexed and the mode is "delete".
#
# Side effects: adjusts the num_docs, num_sections, num_bytes and
# num_processed_bytes counters on $self, and may add entries to
# $self->{'indexfieldmap'} and $self->{'indexfields'}.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # "add" and "update" grow the statistics; any other mode shrinks them
    my $is_addition = (($edit_mode eq "add") || ($edit_mode eq "update"));

    # this is another document
    if ($is_addition) {
        $self->{'num_docs'} += 1;
    }
    else {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};

    # gs2_id should be deprecated #####
    my $gs2_id = "";
    if ($ldoc_level) {
        if ($self->{'db_level'} eq 'document') {
            $gs2_id = $self->{'num_docs'};
        }
        else {
            # default is section level
            $gs2_id = $self->{'num_sections'} + 1;
        }
    }
    my $gs2_docOID = $doc_obj->get_OID();

    # The file name ends up inside a double-quoted XML attribute, so the
    # characters that are special there must be escaped or the output is
    # not well-formed.
    my $file_attr = defined $file ? $file : "";
    $file_attr =~ s/&/&amp;/g;
    $file_attr =~ s/</&lt;/g;
    $file_attr =~ s/>/&gt;/g;
    $file_attr =~ s/"/&quot;/g;

    my $documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file_attr\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    # section tags are only written when a section level is configured
    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    # The index specification does not change from section to section, so
    # work out once whether it asks for the two special indexes.
    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;
    # do we have an allfields index??
    my $allfields_index = 0;
    foreach my $field (split (/;/, $fields)) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # Account for a chunk of processed text: when adding/updating it is
    # counted and appended to the output; when deleting it is only
    # subtracted from the running total and not written.
    my $emit = sub {
        my ($chunk) = @_;
        if ($is_addition) {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= "$chunk";
        }
        else {
            # delete
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_id = $self->{'num_sections'};
        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n"
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        if ($sec_tag_name ne "") {
            $text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        if ($is_addition) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (split (/;/, $fields)) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        # optionally let subsections inherit the top section's values
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $emit->($new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($specified_fields->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }

            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $lucenehandle "$text\n$documentendtag";

    ## if ($edit_mode eq "delete") {
    ##     print STDERR "$text\n$documentendtag";
    ## }

}
	363
# Output a document for indexing: delegates to textedit() in "add" mode.
#
#   $doc_obj - the document object to output
#   $file    - the name of the file the document came from
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
	370
	371	sub textreindex
	372	{
	373	my $self = shift (@_);
	374	my ($doc_obj,$file) = @_;
	375
[18471]	376	$self->textedit($doc_obj,$file,"update");
[18456]	377	}
	378
	379	sub textdelete
	380	{
	381	my $self = shift (@_);
	382	my ($doc_obj,$file) = @_;
	383
	384	$self->textedit($doc_obj,$file,"delete");
	385	}
	386
	387
	388
	389
	390
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  When the current mode (get_mode()) is "incinfodb", each section of the
#  *  document is annotated with doctype, hastxt, archivedir/thistype (top
#  *  section only), childtype/contains (sections with children) and docnum
#  *  metadata, and the document is then handed to
#  *  IncrementalBuildUtils::addDocument(). In any other mode the call is
#  *  passed straight through to mgppbuildproc::process().
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the GAPlug
#  *  @param  $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        my $verbose = ($self->{'verbosity'} > 3);

        print STDERR "* Processing a document added using INCINFODB *\n" if $verbose;

        # The archive directory is everything before the last path separator
        # (either / or \); it is then normalised to forward slashes with no
        # leading or trailing separator.
        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = "" unless defined $archivedir;
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files
        print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if $verbose;

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n" if $verbose;
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n" if $verbose;
            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }

            # output whether this node contains text
            if ($doc_obj->get_text_length($section) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
            }
            else
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
            }

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # each entry is a double quote followed by ".<id>",
                    # where <id> is the child's trailing number if it has one
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }

            # output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if $verbose;
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only add the docs
            last if ($self->{'db_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if $verbose;
        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
# /** process() **/
	505
[14934]	506
	507	# Following methods seem to be no different to those defined in basebuildproc.pm
	508	# From inspection, it looks like these ones can be removed
	509
	510
# Accessor: returns the current value of the 'num_docs' counter.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
	516
	517	sub get_num_sections {
	518	my $self = shift (@_);
	519	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
	520	return $self->{'num_sections'};
	521	}
	522
	523	# num_bytes is the actual number of bytes in the collection
	524	# this is normally the same as what's processed during text compression
	525	sub get_num_bytes {
	526	my $self = shift (@_);
	527	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
	528	return $self->{'num_bytes'};
	529	}
	530
[14068]	531
	532	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
	533	# Otherwise the removal of tags below might lead to Lucene turning
	534	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
	535	# (example from demo collection b20cre)
	536	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
	537	sub preprocess_text
	538	{
	539	my $self = shift (@_);
	540	my ($text, $strip_html, $para) = @_;
	541	# at this stage, we do not do paragraph tags unless have strip_html -
	542	# it will result in a huge mess of non-xml
	543	return unless $strip_html;
	544
	545	my $new_text = $text;
	546
	547	# if we have <pre> tags, we can have < > inside them, need to delete
	548	# the <> before stripping tags
	549	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
	550
	551	if ($para eq "") {
	552	# just remove all tags
	553	$new_text =~ s/<[^>]*>/ /gs;
	554	} else {
	555	# strip all tags except <p> tags which get turned into $para
	556	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
	557	}
	558
[14923]	559	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
	560	# may not be valid XML (eg. if HTML-only entities like   are used)
	561	$new_text =~ s/&\w{1,10};//g;
	562	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
	563	$new_text =~ s/&([^\#])/ $1/g;
	564
[14068]	565	return $new_text;
	566	}
	567
	568
[8072]	569	1;
	570
[18456]	571

Note: See TracBrowser for help on using the repository browser.

Download in other formats: