Context Navigation

source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 30602

Last change on this file since 30602 was 28566, checked in by kjdon, 11 years ago
changed some util:: to FIleUtils:: methods to avoid warnings
Property svn:keywords set to `Author Date Id Revision`
File size: 21.4 KB

Rev	Line
[8072]	1	###########################################################################
	2	#
	3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	package lucenebuildproc;
	27
	28	# This document processor outputs a document
	29	# for lucene to process
	30
	31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
	32
[12844]	33	use mgppbuildproc;
[9186]	34	use ghtml;
[12424]	35	use strict;
	36	no strict 'refs'; # allow filehandles to be variables and viceversa
[8072]	37
[12424]	38
[12844]	39	use IncrementalBuildUtils;
[28566]	40	use FileUtils;
[12844]	41
[8072]	42	sub BEGIN {
[8716]	43	@lucenebuildproc::ISA = ('mgppbuildproc');
[8072]	44	}
	45
	46
	47	sub new {
	48	my $class = shift @_;
	49	my $self = new mgppbuildproc (@_);
	50
[12844]	51	$self->{'numincdocs'} = 0;
[27358]	52	$self->{'specified_fields'} = (); # list of fields actually specified in the index, in a map
	53	$self->{'allfields_index'} = 0; # do we need allfields index?
	54	$self->{'all_metadata_specified'} = 0; # are we indexing all metadata?
	55	$self->{'actualsortfields'} = {}; # sort fields that have actually been used
	56	$self->{'sortfieldnamemap'} = {}; # mapping between field name and field shortname, eg dc.Title->byTI
[8072]	57	return bless $self, $class;
	58	}
	59
[27358]	60	sub set_index {
	61	my $self = shift (@_);
	62	my ($index, $indexexparr) = @_;
[10304]	63
[27358]	64	$self->mgppbuildproc::set_index($index, $indexexparr);
	65
	66	# just get the list of index fields without any subcoll stuff
	67	my ($fields) = split (/:/, $self->{'index'});
	68
	69	foreach my $field (split (/;/, $fields)) {
	70	if ($field eq "allfields") {
	71	$self->{'allfields_index'} = 1;
	72	} elsif ($field eq "metadata") {
	73	$self->{'all_metadata_specified'} = 1;
	74	} else {
	75	$field =~ s/^top//;
	76	$self->{'specified_fields'} ->{$field} = 1;
	77	}
	78	}
	79	}
	80
[27563]	81	sub set_sections_sort_on_document_metadata {
	82	my $self= shift (@_);
	83	my ($index_type) = @_;
	84
	85	$self->{'sections_sort_on_document_metadata'} = $index_type;
	86	}
	87
[27358]	88	sub set_sortfields {
	89	my $self = shift (@_);
	90
[27565]	91	my ($sortfields) = @_;
	92	$self->{'sortfields'} = ();
	93	# lets just go through and check for text, allfields, metadata which are only valid for indexes, not for sortfields
	94	foreach my $s (@$sortfields) {
	95	if ($s !~ /^(text\|allfields\|metadata)$/) {
	96	push (@{$self->{'sortfields'}}, $s);
	97	}
	98	}
[27358]	99	}
	100
[10419]	101	sub is_incremental_capable
[10304]	102	{
	103	my $self = shift (@_);
	104
	105	# Unlike MG and MGPP, Lucene supports incremental building
	106	return 1;
	107	}
	108
	109
[18456]	110	sub textedit {
[8072]	111	my $self = shift (@_);
[18456]	112	my ($doc_obj,$file,$edit_mode) = @_;
	113
	114	my $lucenehandle = $self->{'output_handle'};
[8072]	115	my $outhandle = $self->{'outhandle'};
	116
	117	# only output this document if it is one to be indexed
	118	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	119
[18456]	120	# skip this document if in "compress-text" mode and asked to delete it
	121	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
	122
[27358]	123	# 0/1 to indicate whether this doc is part of the specified subcollection
[10961]	124	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
[8072]	125
	126	# this is another document
[18471]	127	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
[18456]	128	$self->{'num_docs'} += 1;
	129	}
	130	else {
	131	$self->{'num_docs'} -= 1;
	132	}
[8072]	133
[27358]	134
[8072]	135	# get the parameters for the output
	136	# split on : just in case there is subcoll and lang stuff
	137	my ($fields) = split (/:/, $self->{'index'});
	138
[16504]	139	my $doc_tag_name = $mgppbuildproc::level_map{'document'};
[8072]	140
	141	my $levels = $self->{'levels'};
	142	my $ldoc_level = $levels->{'document'};
	143	my $lsec_level = $levels->{'section'};
	144
[16431]	145	my $gs2_docOID = $doc_obj->get_OID();
[18456]	146	my $documenttag = undef;
	147	my $documentendtag = undef;
[12844]	148
[20732]	149	$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
[18456]	150	$documentendtag = "\n</$doc_tag_name>\n";
	151
[16504]	152	my $sec_tag_name = "";
[12844]	153	if ($lsec_level)
[17568]	154	{
[16504]	155	$sec_tag_name = $mgppbuildproc::level_map{'section'};
[17568]	156	}
[12844]	157
[8072]	158	my $doc_section = 0; # just for this document
	159
	160	my $text = "";
	161	$text .= $documenttag;
	162	# get the text for this document
	163	my $section = $doc_obj->get_top_section();
[12844]	164	while (defined $section)
[17568]	165	{
[8072]	166	# update a few statistics
	167	$doc_section++;
[12844]	168	$self->{'num_sections'}++;
	169
[18456]	170	my $sec_gs2_id = $self->{'num_sections'};
	171	my $sec_gs2_docOID = $gs2_docOID;
	172	$sec_gs2_docOID .= ".$section" if ($section ne "");
[8072]	173
[9178]	174	# if we are doing subcollections, then some docs shouldn't be indexed.
[12844]	175	# but we need to put the section tag placeholders in there so the
[15687]	176	# sections match up with database
[12274]	177	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
[12951]	178	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
[18456]	179	if ($sec_tag_name ne "") {
[20732]	180	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
[18456]	181	$text .= "\n</$sec_tag_name>\n"
	182	}
[12844]	183	$section = $doc_obj->get_next_section($section);
[10961]	184	next;
[12844]	185	}
	186
[18456]	187	if ($sec_tag_name ne "")
	188	{
[20732]	189	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
[18456]	190	}
[17568]	191
[18471]	192	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
[18456]	193	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
	194	}
	195	else {
	196	# delete
	197	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
	198	}
	199
	200
[27358]	201	# collect up all the text for allfields index in here (if there is one)
[17568]	202	my $allfields_text = "";
[27358]	203
[17568]	204	foreach my $field (split (/;/, $fields)) {
	205
[10961]	206	# only deal with this field if it doesn't start with top or
	207	# this is the first section
	208	my $real_field = $field;
	209	next if (($real_field =~ s/^top//) && ($doc_section != 1));
[17568]	210
	211	# process these two later
	212	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
	213
	214	#individual metadata and or text specified - could be a comma separated list
[27358]	215	#$specified_fields->{$real_field} = 1;
[17568]	216	my $shortname="";
	217	my $new_field = 0; # have we found a new field name?
[27329]	218	if (defined $self->{'fieldnamemap'}->{$real_field}) {
	219	$shortname = $self->{'fieldnamemap'}->{$real_field};
	220	} else {
[17568]	221	$shortname = $self->create_shortname($real_field);
[27329]	222	$self->{'fieldnamemap'}->{$real_field} = $shortname;
	223	$self->{'fieldnamemap'}->{$shortname} = 1;
[17568]	224	}
	225	my @metadata_list = (); # put any metadata values in here
	226	my $section_text = ""; # put the text in here
	227	foreach my $submeta (split /,/, $real_field) {
	228	if ($submeta eq "text") {
	229	# no point in indexing text more than once
	230	if ($section_text eq "") {
	231	$section_text = $doc_obj->get_text($section);
	232	if ($self->{'indexing_text'}) {
	233	# we always strip html
	234	$section_text = $self->preprocess_text($section_text, 1, "");
[10961]	235	}
[17568]	236	else {
	237	# leave html stuff in, but escape the tags
	238	&ghtml::htmlsafe($section_text);
[12844]	239	}
[17568]	240	}
	241	}
	242	else {
[24404]	243	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
[20419]	244
[17568]	245	# its a metadata element
	246	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
	247	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
	248	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
	249	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
[12844]	250	}
[10961]	251	}
[17568]	252	push (@metadata_list, @section_metadata);
[10961]	253	}
[17568]	254	} # for each field in this one index
	255
[24460]	256
[17568]	257	# now we add the text and/or metadata into new_text
	258	if ($section_text ne "" \|\| scalar(@metadata_list)) {
	259	my $new_text = "";
	260
	261	if ($section_text ne "") {
	262	$new_text .= "$section_text ";
[10961]	263	}
[17568]	264
	265	foreach my $item (@metadata_list) {
	266	&ghtml::htmlsafe($item);
	267	$new_text .= "$item ";
	268	}
	269
[27358]	270	if ($self->{'allfields_index'}) {
[17568]	271	$allfields_text .= $new_text;
	272	}
	273
[17797]	274	if ($self->{'indexing_text'}) {
	275	# add the tag
	276	$new_text = "<$shortname index=\"1\">$new_text</$shortname>";
[27329]	277	$self->{'allindexfields'}->{$real_field} = 1;
[17797]	278	}
[17568]	279	# filter the text
	280	$new_text = $self->filter_text ($field, $new_text);
[18456]	281
[18471]	282	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
[18456]	283	$self->{'num_processed_bytes'} += length ($new_text);
	284	$text .= "$new_text";
	285	}
	286	else {
	287	# delete
	288	$self->{'num_processed_bytes'} -= length ($new_text);
[27329]	289	}
[17568]	290	}
	291
	292	} # foreach field
[12844]	293
[27358]	294	if ($self->{'all_metadata_specified'}) {
[17568]	295
	296	my $new_text = "";
	297	my $shortname = "";
	298	my $metadata = $doc_obj->get_all_metadata ($section);
	299	foreach my $pair (@$metadata) {
	300	my ($mfield, $mvalue) = (@$pair);
	301	# no value
	302	next unless defined $mvalue && $mvalue ne "";
	303	# we have already indexed this
[27358]	304	next if defined ($self->{'specified_fields'}->{$mfield});
[17568]	305	# check fields here, maybe others dont want - change to use dontindex!!
	306	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
	307	next if ($mfield =~ /^gsdl/);
	308
	309	&ghtml::htmlsafe($mvalue);
	310
[27329]	311	if (defined $self->{'fieldnamemap'}->{$mfield}) {
	312	$shortname = $self->{'fieldnamemap'}->{$mfield};
[8072]	313	}
[17568]	314	else {
	315	$shortname = $self->create_shortname($mfield);
[27329]	316	$self->{'fieldnamemap'}->{$mfield} = $shortname;
	317	$self->{'fieldnamemap'}->{$shortname} = 1;
	318	}
	319	$self->{'allindexfields'}->{$mfield} = 1;
[17568]	320	$new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
[27358]	321	if ($self->{'allfields_index'}) {
[17568]	322	$allfields_text .= "$mvalue ";
[10961]	323	}
[17568]	324
[27329]	325	if (!defined $self->{'extraindexfields'}->{$mfield}) {
	326	$self->{'extraindexfields'}->{$mfield} = 1;
[17568]	327	}
	328
[8072]	329	}
[10961]	330	# filter the text
[17568]	331	$new_text = $self->filter_text ("metadata", $new_text);
	332
[18471]	333	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
[18456]	334	$self->{'num_processed_bytes'} += length ($new_text);
	335	$text .= "$new_text";
	336	}
	337	else {
	338	# delete
	339	$self->{'num_processed_bytes'} -= length ($new_text);
	340	}
[17568]	341	}
	342
[27358]	343	if ($self->{'allfields_index'}) {
[17568]	344
	345	my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
	346	# filter the text
	347	$new_text = $self->filter_text ("allfields", $new_text);
	348
[18471]	349	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
[18456]	350	$self->{'num_processed_bytes'} += length ($new_text);
	351	$text .= "$new_text";
	352	}
	353	else {
	354	# delete
	355	$self->{'num_processed_bytes'} -= length ($new_text);
	356	}
[17568]	357	}
[27358]	358	# only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
	359	if ($self->{'indexing_text'} && ($sec_tag_name ne "" \|\| $doc_section == 1 )) {
	360	# add sort fields if there are any
[18456]	361
[27358]	362	foreach my $sfield (@{$self->{'sortfields'}}) {
[27565]	363	# ignore special field rank
[28035]	364	next if ($sfield eq "rank" \|\| $sfield eq "none");
[27358]	365	my $sf_shortname;
	366	if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
	367	$sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
	368	}
	369	else {
	370	$sf_shortname = $self->create_sortfield_shortname($sfield);
	371	$self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
	372	$self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
	373	}
	374	my @metadata_list = (); # put any metadata values in here
	375	foreach my $submeta (split /,/, $sfield) {
	376	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
	377
	378	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
[27563]	379	if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
	380	if ($self->{'sections_sort_on_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
	381	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
	382	}
	383	}
[27358]	384	push (@metadata_list, @section_metadata);
	385	}
	386	my $new_text = "";
	387	foreach my $item (@metadata_list) {
	388	&ghtml::htmlsafe($item);
[27563]	389	$new_text .= "$item";
[27358]	390	}
	391	if ($new_text =~ /\S/) {
	392	$new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
	393	# filter the text???
	394	$text .= "$new_text"; # add it to the main text block
	395	$self->{'actualsortfields'}->{$sfield} = 1;
	396	}
	397	}
	398	}
[16504]	399	$text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
[8072]	400
[12844]	401	$section = $doc_obj->get_next_section($section);
[27358]	402	} # for each section
	403
	404	#open (TEXTOUT, ">text.out");
	405	#print TEXTOUT "$text\n$documentendtag";
	406	#close TEXTOUT;
[18456]	407
	408	print $lucenehandle "$text\n$documentendtag";
	409
	410	## if ($edit_mode eq "delete") {
	411	## print STDERR "$text\n$documentendtag";
	412	## }
	413
[8072]	414	}
	415
[18456]	416	sub text {
	417	my $self = shift (@_);
	418	my ($doc_obj,$file) = @_;
	419
	420	$self->textedit($doc_obj,$file,"add");
	421	}
	422
	423	sub textreindex
	424	{
	425	my $self = shift (@_);
	426	my ($doc_obj,$file) = @_;
	427
[18471]	428	$self->textedit($doc_obj,$file,"update");
[18456]	429	}
	430
	431	sub textdelete
	432	{
	433	my $self = shift (@_);
	434	my ($doc_obj,$file) = @_;
	435
	436	$self->textedit($doc_obj,$file,"delete");
	437	}
	438
	439
	440
	441
	442
[12844]	443	# /** We make this builder pretend to be a document processor so we can get
	444	# * information back from the plugins.
	445	# *
	446	# * @param $self A reference to this Lucene builder
	447	# * @param $doc_obj A reference to a document object representing what was
	448	# * parsed by the GAPlug
	449	# * @param $file The name of the file parsed as a string
	450	# *
	451	# * @author John Thompson, DL Consulting Ltd
	452	# */
	453	sub process()
	454	{
	455	my $self = shift (@_);
	456	my ($doc_obj, $file) = @_;
	457
	458	# If this is called from any stage other than an incremental infodb we want
	459	# to pass through to the superclass of build
	460	if ($self->get_mode() eq "incinfodb")
	461	{
[17287]	462	print STDERR "* Processing a document added using INCINFODB *\n" if ($self->{'verbosity'} > 3);
[12844]	463	my ($archivedir) = $file =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
	464	$archivedir = "" unless defined $archivedir;
	465	$archivedir =~ s/\\/\//g;
	466	$archivedir =~ s/^\/+//;
	467	$archivedir =~ s/\/+$//;
	468
	469	# Number of files
[17287]	470	print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
[12844]	471
	472	# resolve the final filenames of the files associated with this document
	473	$self->assoc_files ($doc_obj, $archivedir);
	474
	475	# is this a paged or a hierarchical document
	476	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
	477
	478	# Determine the actual docnum by checking if we've processed any
	479	# previous incrementally added documents. If so, carry on from there.
	480	# Otherwise we set the counter to be the same as the number of
	481	# sections encountered during the previous build
	482	if ($self->{'numincdocs'} == 0)
	483	{
	484	$self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
	485	}
	486
	487	my $section = $doc_obj->get_top_section ();
[17287]	488	print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
[12844]	489	my $doc_OID = $doc_obj->get_OID();
	490	my $url = "";
	491	while (defined $section)
	492	{
[17287]	493	print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
[12844]	494	# Attach all the other metadata to this document
	495	# output the fact that this document is a document (unless doctype
	496	# has been set to something else from within a plugin
	497	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
	498	if (!defined $dtype \|\| $dtype !~ /\w/)
	499	{
[17287]	500	#$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
	501	$doc_obj->add_utf8_metadata($section, "doctype", "doc");
[12844]	502	}
	503	# output whether this node contains text
	504	if ($doc_obj->get_text_length($section) > 0)
	505	{
	506	$doc_obj->add_utf8_metadata($section, "hastxt", 1);
	507	}
	508	else
	509	{
	510	$doc_obj->add_utf8_metadata($section, "hastxt", 0);
	511	}
	512
	513	# output archivedir if at top level
	514	if ($section eq $doc_obj->get_top_section())
	515	{
	516	$doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
	517	$doc_obj->add_utf8_metadata($section, "thistype", $thistype);
	518	}
	519
	520	# output a list of children
	521	my $children = $doc_obj->get_children ($section);
	522	if (scalar(@$children) > 0)
	523	{
	524	$doc_obj->add_utf8_metadata($section, "childtype", $childtype);
	525	my @contains = ();
	526	foreach my $child (@$children)
	527	{
	528	if ($child =~ /^.*?\.(\d+)$/)
	529	{
	530	push (@contains, "\".$1");
	531	}
	532	else
	533	{
	534	push (@contains, "\".$child");
	535	}
	536	}
	537	$doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
	538	}
	539	#output the matching doc number
[17287]	540	print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
[12844]	541	$doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
	542
	543	$self->{'numincdocs'}++;
	544	$section = $doc_obj->get_next_section($section);
[15687]	545	# if no sections wanted, only add the docs
[15685]	546	last if ($self->{'db_level'} eq "document");
[12844]	547	}
[17287]	548	print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
[21643]	549	&IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
[12844]	550	}
	551	else
	552	{
	553	$self->mgppbuildproc::process(@_);
	554	}
	555	}
	556	# / process() /
	557
[14934]	558
	559	# Following methods seem to be no different to those defined in basebuildproc.pm
	560	# From inspection, it looks like these ones can be removed
	561
	562
[12844]	563	sub get_num_docs {
	564	my $self = shift (@_);
	565	#rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
	566	return $self->{'num_docs'};
	567	}
	568
	569	sub get_num_sections {
	570	my $self = shift (@_);
	571	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
	572	return $self->{'num_sections'};
	573	}
	574
	575	# num_bytes is the actual number of bytes in the collection
	576	# this is normally the same as what's processed during text compression
	577	sub get_num_bytes {
	578	my $self = shift (@_);
	579	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
	580	return $self->{'num_bytes'};
	581	}
	582
[14068]	583
	584	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
	585	# Otherwise the removal of tags below might lead to Lucene turning
	586	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
	587	# (example from demo collection b20cre)
	588	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
	589	sub preprocess_text
	590	{
	591	my $self = shift (@_);
	592	my ($text, $strip_html, $para) = @_;
	593	# at this stage, we do not do paragraph tags unless have strip_html -
	594	# it will result in a huge mess of non-xml
	595	return unless $strip_html;
	596
	597	my $new_text = $text;
	598
	599	# if we have <pre> tags, we can have < > inside them, need to delete
	600	# the <> before stripping tags
	601	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
	602
	603	if ($para eq "") {
	604	# just remove all tags
	605	$new_text =~ s/<[^>]*>/ /gs;
	606	} else {
	607	# strip all tags except <p> tags which get turned into $para
	608	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
	609	}
	610
[14923]	611	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
	612	# may not be valid XML (eg. if HTML-only entities like   are used)
	613	$new_text =~ s/&\w{1,10};//g;
	614	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
	615	$new_text =~ s/&([^\#])/ $1/g;
	616
[14068]	617	return $new_text;
	618	}
	619
[23176]	620	sub delete_assoc_files
	621	{
	622	my $self = shift (@_);
[23182]	623	my ($archivedir, $edit_mode) = @_;
[14068]	624
[23176]	625	$self->basebuildproc::delete_assoc_files(@_);
	626
[23182]	627	if ($edit_mode eq "delete") {
	628	# if we are deleting the doc, then also delete the lucene text version
[28566]	629	my $assoc_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"text", $archivedir);
[23182]	630	if (-d $assoc_dir) {
[28566]	631	&FileUtils::removeFilesRecursive($assoc_dir);
[23182]	632	}
	633	}
[23181]	634	}
[27358]	635
	636	sub create_sortfield_shortname {
	637	my $self = shift(@_);
	638
	639	my ($realname) = @_;
	640
	641	my $index_shortname;
	642	# if we have created a shortname for an index on this field, then use it.
	643	if (defined $self->{'fieldnamemap'}->{$realname}) {
	644	$index_shortname = $self->{'fieldnamemap'}->{$realname};
	645	} else {
	646	$index_shortname = $self->create_shortname($realname);
	647	}
	648	return "by".$index_shortname;
	649	}
	650
	651
[8072]	652	1;
	653
[18456]	654

Note: See TracBrowser for help on using the repository browser.

Download in other formats: