Context Navigation

source: trunk/gsdl/perllib/mgppbuildproc.pm@ 7645

Last change on this file since 7645 was 7090, checked in by sjboddie, 20 years ago
Fixed some minor perl syntax problems that were causing warning messages when building mgpp collections.
Property svn:keywords set to `Author Date Id Revision`
File size: 19.4 KB

Rev	Line
[932]	1	###########################################################################
	2	#
[1852]	3	# mgppbuildproc.pm --
[932]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# This document processor outputs a document
[1852]	27	# for mgpp to process
[932]	28
	29
	30	package mgppbuildproc;
	31
[3834]	32	eval {require bytes};
[3767]	33
[932]	34	use classify;
	35	use doc;
	36	use docproc;
	37	use util;
	38
	39
	40	BEGIN {
	41	@ISA = ('docproc');
	42	}
	43
[4811]	44	#this must be the same as in mgppbuilder
	45	%level_map = ('document'=>'Doc',
	46	'section'=>'Sec',
	47	'paragraph'=>'Para');
[932]	48
	49	sub new {
[1694]	50	my ($class, $collection, $source_dir, $build_dir,
	51	$verbosity, $outhandle) = @_;
[932]	52	my $self = new docproc ();
	53
[1694]	54	# outhandle is where all the debugging info goes
	55	# output_handle is where the output of the plugins is piped
	56	# to (i.e. mg, gdbm etc.)
	57	$outhandle = STDERR unless defined $outhandle;
	58
[932]	59	$self->{'collection'} = $collection;
	60	$self->{'source_dir'} = $source_dir;
	61	$self->{'build_dir'} = $build_dir;
	62	$self->{'verbosity'} = $verbosity;
	63	$self->{'classifiers'} = [];
	64	$self->{'mode'} = "text";
	65	$self->{'assocdir'} = $build_dir;
	66	$self->{'dontgdbm'} = {};
	67	$self->{'index'} = "text";
	68	$self->{'indexexparr'} = [];
	69	$self->{'output_handle'} = "STDOUT";
	70	$self->{'num_docs'} = 0;
	71	$self->{'num_sections'} = 0;
	72	$self->{'num_bytes'} = 0;
[1694]	73	$self->{'num_processed_bytes'} = 0;
[2771]	74	$self->{'store_text'} = 1;
[1694]	75	$self->{'outhandle'} = $outhandle;
[2771]	76
	77	#used by browse interface
	78	$self->{'doclist'} = [];
	79
	80	$self->{'indexing_text'} = 0;
	81
	82	#new ones for mgpp
[1852]	83	$self->{'dontindex'} = {};
	84	$self->{'indexfieldmap'} = {};
[4769]	85	$self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
[1852]	86	$self->{'strip_html'}=1;
[932]	87
[1852]	88
[932]	89	return bless $self, $class;
	90	}
	91
	92	sub reset {
	93	my $self = shift (@_);
	94
	95	$self->{'num_docs'} = 0;
	96	$self->{'num_sections'} = 0;
[1694]	97	$self->{'num_processed_bytes'} = 0;
[932]	98	$self->{'num_bytes'} = 0;
	99	}
	100
	101	sub get_num_docs {
	102	my $self = shift (@_);
	103
	104	return $self->{'num_docs'};
	105	}
	106
	107	sub get_num_sections {
	108	my $self = shift (@_);
	109
	110	return $self->{'num_sections'};
	111	}
	112
[1694]	113	# num_bytes is the actual number of bytes in the collection
	114	# this is normally the same as what's processed during text compression
[932]	115	sub get_num_bytes {
	116	my $self = shift (@_);
	117
	118	return $self->{'num_bytes'};
	119	}
	120
[1694]	121	# num_processed_bytes is the number of bytes actually passed
	122	# to mgpp for the current index
	123	sub get_num_processed_bytes {
	124	my $self = shift (@_);
	125
	126	return $self->{'num_processed_bytes'};
	127	}
	128
[932]	129	sub set_output_handle {
	130	my $self = shift (@_);
	131	my ($handle) = @_;
	132
	133	$self->{'output_handle'} = $handle;
	134	}
	135
	136	sub set_mode {
	137	my $self = shift (@_);
	138	my ($mode) = @_;
	139
	140	$self->{'mode'} = $mode;
	141	}
	142
	143	sub set_assocdir {
	144	my $self = shift (@_);
	145	my ($assocdir) = @_;
	146
	147	$self->{'assocdir'} = $assocdir;
	148	}
	149
	150	sub set_dontgdbm {
	151	my $self = shift (@_);
	152	my ($dontgdbm) = @_;
	153
	154	$self->{'dontgdbm'} = $dontgdbm;
	155	}
	156
	157	sub set_index {
	158	my $self = shift (@_);
	159	my ($index, $indexexparr) = @_;
	160
	161	$self->{'index'} = $index;
	162	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
	163	}
	164
[1694]	165	sub get_index {
	166	my $self = shift (@_);
	167
	168	return $self->{'index'};
	169	}
	170
[932]	171	sub set_classifiers {
	172	my $self = shift (@_);
	173	my ($classifiers) = @_;
	174
	175	$self->{'classifiers'} = $classifiers;
	176	}
	177
	178	sub set_indexing_text {
	179	my $self = shift (@_);
	180	my ($indexing_text) = @_;
	181
	182	$self->{'indexing_text'} = $indexing_text;
	183	}
	184
[1694]	185	sub get_indexing_text {
	186	my $self = shift (@_);
	187
	188	return $self->{'indexing_text'};
	189	}
	190
[2480]	191	sub set_store_text {
	192	my $self = shift (@_);
	193	my ($store_text) = @_;
	194
	195	$self->{'store_text'} = $store_text;
	196	}
	197
[2771]	198	sub get_doc_list {
	199	my $self = shift(@_);
	200
	201	return @{$self->{'doclist'}};
	202	}
	203
[1852]	204	sub set_indexfieldmap {
	205	my $self = shift (@_);
	206	my ($indexmap) = @_;
	207
	208	$self->{'indexfieldmap'} = $indexmap;
	209	}
	210
	211	sub get_indexfieldmap {
	212	my $self = shift (@_);
	213
	214	return $self->{'indexfieldmap'};
	215	}
	216
	217	sub set_levels {
	218	my $self = shift (@_);
	219	my ($levels) = @_;
	220
	221	$self->{'levels'} = $levels;
	222	}
	223
	224	sub set_strip_html {
	225	my $self = shift (@_);
	226	my ($strip) = @_;
	227	$self->{'strip_html'}=$strip;
	228	}
	229
[932]	230	sub process {
	231	my $self = shift (@_);
	232	my $method = $self->{'mode'};
	233
	234	$self->$method(@_);
	235	}
	236
	237	# use 'Paged' if document has no more than 2 levels
	238	# and each section at second level has a number for
	239	# Title metadata
[6553]	240	# also use Paged if gsdlthistype metadata is set to Paged
[932]	241	sub get_document_type {
	242	my $self = shift (@_);
	243	my ($doc_obj) = @_;
	244
	245	my $thistype = "VList";
	246	my $childtype = "VList";
	247	my $title;
	248	my @tmp = ();
	249
	250	my $section = $doc_obj->get_top_section ();
[6553]	251
	252	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
[6767]	253	if (defined $gsdlthistype) {
	254	if ($gsdlthistype eq "Paged") {
	255	$thistype = "Paged";
	256	$childtype = "Paged";
	257	return ($thistype, $childtype);
	258	} elsif ($gsdlthistype eq "Hierarchy") {
	259	return ($thistype, $childtype); # use VList, VList
	260	}
[6553]	261	}
[932]	262	my $first = 1;
	263	while (defined $section) {
	264	@tmp = split /\./, $section;
	265	if (scalar(@tmp) > 1) {
	266	return ($thistype, $childtype);
	267	}
	268	if (!$first) {
	269	$title = $doc_obj->get_metadata_element ($section, "Title");
	270	if (!defined $title \|\| $title !~ /^\d+$/) {
	271	return ($thistype, $childtype);
	272	}
	273	}
	274	$first = 0;
	275	$section = $doc_obj->get_next_section($section);
	276	}
	277	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
	278	$thistype = "Paged";
	279	} else {
	280	$thistype = "Invisible";
	281	}
	282	$childtype = "Paged";
	283	return ($thistype, $childtype);
	284	}
	285
	286	sub assoc_files {
	287	my $self = shift (@_);
	288	my ($doc_obj, $archivedir) = @_;
	289	my ($afile);
	290
	291	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
[3148]	292	# if assoc file starts with a slash, we put it relative to the assoc
	293	# dir, otherwise it is relative to the HASH... directory
	294	if ($assoc_file->[1] =~ m@^[/\\]@) {
	295	$afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
[932]	296	} else {
	297	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
	298	}
	299	&util::hard_link ($assoc_file->[0], $afile);
	300	}
	301	}
	302
	303	sub infodb {
	304	my $self = shift (@_);
	305	my ($doc_obj, $filename) = @_;
	306	my $handle = $self->{'output_handle'};
	307
	308	my $doctype = $doc_obj->get_doc_type();
	309
	310	# only output this document if it is one to be indexed
	311	return if ($doctype ne "indexed_doc");
	312
[1852]	313	#if a Section level index is not built, the gdbm file should be at doc
	314	#level not Section
	315	my $docs_only = 1;
[4811]	316	if ($self->{'levels'}->{'section'}) {
[1852]	317	$docs_only = 0;
	318	}
	319
[932]	320	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
	321	$archivedir = "" unless defined $archivedir;
	322	$archivedir =~ s/\\/\//g;
	323	$archivedir =~ s/^\/+//;
	324	$archivedir =~ s/\/+$//;
	325
	326	$self->assoc_files ($doc_obj, $archivedir);
	327
	328	# this is another document
	329	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
	330
	331	# is this a paged or a hierarchical document
	332	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
	333
	334	my $section = $doc_obj->get_top_section ();
	335	my $doc_OID = $doc_obj->get_OID();
	336	my $first = 1;
	337	my $url = "";
	338	while (defined $section) {
	339	# update a few statistics
	340	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
	341	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
	342
	343	# output the section name
	344	if ($section eq "") { print $handle "[$doc_OID]\n"; }
	345	else { print $handle "[$doc_OID.$section]\n"; }
	346
[1694]	347	# output the fact that this document is a document (unless doctype
	348	# has been set to something else from within a plugin
	349	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
	350	if (!defined $dtype \|\| $dtype !~ /\w/) {
	351	print $handle "<doctype>doc\n";
	352	}
[932]	353
	354	# output whether this node contains text
	355	if ($doc_obj->get_text_length($section) > 0) {
	356	print $handle "<hastxt>1\n";
	357	} else {
	358	print $handle "<hastxt>0\n";
	359	}
	360
	361	# output all the section metadata
	362	my $metadata = $doc_obj->get_all_metadata ($section);
[1917]	363	foreach my $pair (@$metadata) {
[932]	364	my ($field, $value) = (@$pair);
	365
[6546]	366	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
	367	defined $value && $value ne "") {
[932]	368
	369	# escape problematic stuff
	370	$value =~ s/\\/\\\\/g;
	371	$value =~ s/\n/\\n/g;
	372	$value =~ s/\r/\\r/g;
	373
	374	# special case for URL metadata
	375	if ($field =~ /^URL$/i) {
	376	$url .= "[$value]\n";
	377	if ($section eq "") {$url .= "<section>$doc_OID\n";}
	378	else {$url .= "<section>$doc_OID.$section\n";}
	379	$url .= '-' x 70 . "\n";
	380	}
	381
	382	if (!defined $self->{'dontgdbm'}->{$field}) {
	383	print $handle "<$field>$value\n";
	384	}
	385	}
	386	}
	387
	388	# output archivedir if at top level
	389	if ($section eq $doc_obj->get_top_section()) {
	390	print $handle "<archivedir>$archivedir\n";
	391	}
	392
	393	# output document display type
	394	if ($first) {
	395	print $handle "<thistype>$thistype\n";
	396	}
	397
[1852]	398	if (!$docs_only) {
	399	# output a list of children
	400	my $children = $doc_obj->get_children ($section);
	401	if (scalar(@$children) > 0) {
	402	print $handle "<childtype>$childtype\n";
	403	print $handle "<contains>";
	404	my $firstchild = 1;
	405	foreach $child (@$children) {
	406	print $handle ";" unless $firstchild;
	407	$firstchild = 0;
	408	if ($child =~ /^.*?\.(\d+)$/) {
	409	print $handle "\".$1";
	410	} else {
	411	print $handle "\".$child";
	412	}
[932]	413	# if ($child eq "") { print $handle "$doc_OID"; }
	414	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
	415	# else { print $handle "$doc_OID.$section.$child"; }
[1852]	416	}
	417	print $handle "\n";
[932]	418	}
[1852]	419	#output the matching doc number
	420	print $handle "<docnum>$self->{'num_sections'}\n";
	421
	422	} # if (!$docs_only)
	423	else { #docs only, doc num is num_docs not num_sections
	424	# output the matching document number
	425	print $handle "<docnum>$self->{'num_docs'}\n";
[932]	426	}
[1852]	427
[932]	428	print $handle '-' x 70, "\n";
	429
	430
	431	# output a database entry for the document number
[1852]	432	if ($docs_only) {
	433	print $handle "[$self->{'num_docs'}]\n";
	434	print $handle "<section>$doc_OID\n";
	435	}
	436	else {
	437	print $handle "[$self->{'num_sections'}]\n";
	438	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
	439	else { print $handle "<section>$doc_OID.$section\n"; }
	440	}
[932]	441	print $handle '-' x 70, "\n";
	442
	443	# output entry for url
	444	if ($url ne "") {
	445	print $handle $url;
	446	}
	447
	448	$first = 0;
	449	$section = $doc_obj->get_next_section($section);
[1852]	450	last if ($docs_only); # if no sections wanted, only gdbm the docs
[932]	451	}
	452
[2771]	453	#add this document to the browse structure
	454	push(@{$self->{'doclist'}},$doc_obj->get_OID())
	455	unless ($doctype eq "classification");
	456
[932]	457	# classify this document
	458	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
	459
	460	}
	461
[4769]	462	#sub find_paragraphs {
	463	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
	464	#}
[932]	465
[1852]	466	#this function strips the html tags from the doc if ($strip_html) and
	467	# if ($para) replaces <p> with <Paragraph> tags.
	468	# if both are false, the original text is returned
	469	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
	470	#these tags
	471	sub preprocess_text {
	472	my $self = shift (@_);
	473	my ($text, $strip_html, $para) = @_;
	474	my ($outtext) = "";
	475	if ($strip_html) {
	476	while ($text =~ /<([^>]*)>/ && $text ne "") {
	477
	478	$tag = $1;
	479	$outtext .= $`." "; #add everything before the matched tag
	480	$text = $'; #everything after the matched tag
[4769]	481	if ($para && $tag =~ /^\s*p\s/i) {
	482	$outtext .= $para;
[1852]	483	}
	484	elsif ($tag =~ /^pre$/) { # a pre tag
	485	$text =~ /<\/pre>/; # find the closing pre tag
	486	my $tmp_text = $`; #everything before the closing pre tag
	487	$text = $'; #everything after the </pre>
	488	$tmp_text =~ s/[<>]//g; # remove all < and >
	489	$outtext.= $tmp_text . " ";
	490	}
	491	}
	492
	493	$outtext .= $text; # add any remaining text
	494	return $outtext;
	495	} #if strip_html
	496
[4769]	497	#if ($para) {
	498	#$text =~ s/(<p\b)/$para$1/gi;
	499	#return $text;
	500	# }
[1852]	501	return $text;
	502	}
	503
	504
	505
[932]	506	sub filter_text {
	507	# $self->filter_text ($field, $new_text);
	508	# don't want to do anything for this version, however,
	509	# in a particular collection you might want to override
	510	# this method to post-process certain fields depending on
	511	# the field, or whether we are outputting it for indexing
	512	}
	513
	514	sub text {
	515	my $self = shift (@_);
	516	my ($doc_obj) = @_;
	517	my $handle = $self->{'output_handle'};
[4769]	518	my $outhandle = $self->{'outhandle'};
[932]	519	my $indexed_doc = 1;
	520
	521	# only output this document if it is one to be indexed
	522	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	523
	524	# see if this document belongs to this subcollection
[1917]	525	foreach my $indexexp (@{$self->{'indexexparr'}}) {
[932]	526	$indexed_doc = 0;
	527	my ($field, $exp, $options) = split /\//, $indexexp;
	528	if (defined ($field) && defined ($exp)) {
	529	my ($bool) = $field =~ /^(.)/;
	530	$field =~ s/^.// if $bool eq '!';
	531	if ($field =~ /^filename$/i) {
	532	$field = $doc_obj->get_source_filename();
	533	} else {
	534	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
	535	}
	536	next unless defined $field;
	537	if ($bool eq '!') {
	538	if ($options =~ /^i$/i) {
	539	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
	540	} else {
	541	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
	542	}
	543	} else {
	544	if ($options =~ /^i$/i) {
	545	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
	546	} else {
	547	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
	548	}
	549	}
	550	}
	551	}
	552
	553	# this is another document
	554	$self->{'num_docs'} += 1;
	555
	556	# get the parameters for the output
[5642]	557	# split on : just in case there is subcoll and lang stuff
	558	my ($fields) = split (/:/, $self->{'index'});
[932]	559
[4769]	560	my ($documenttag) = "";
	561	my($documentendtag) = "";
[4811]	562	if ($self->{'levels'}->{'document'}) {
[7090]	563	$documenttag = "\n<". $level_map{'document'} . ">\n";
	564	$documentendtag = "\n</". $level_map{'document'} . ">\n";
[4811]	565	}
[1852]	566	my ($sectiontag) = "";
[4811]	567	if ($self->{'levels'}->{'section'}) {
[7090]	568	$sectiontag = "\n<". $level_map{'section'} . ">\n";
[1852]	569	}
	570	my ($paratag) = "";
[4811]	571	if ($self->{'levels'}->{'paragraph'}) {
[4769]	572	if ($self->{'strip_html'}) {
[7090]	573	$paratag = "<". $level_map{'paragraph'} . ">";
[4769]	574	} else {
	575	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
	576	}
[1852]	577	}
[4811]	578
[932]	579	my $doc_section = 0; # just for this document
[4769]	580
	581	my $text = $documenttag;
[1917]	582
[932]	583	# get the text for this document
	584	my $section = $doc_obj->get_top_section();
	585	while (defined $section) {
	586	# update a few statistics
	587	$doc_section++;
	588	$self->{'num_sections'} += 1;
[4769]	589	$text .= "$sectiontag";
[1852]	590
[932]	591	if ($indexed_doc) {
[4769]	592	if ($self->{'indexing_text'}) {
	593	$text .= "$paratag"; # only add para tags for indexing
	594	# note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
	595	}
[932]	596	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
[1917]	597	foreach my $field (split (/,/, $fields)) {
[932]	598	# only deal with this field if it doesn't start with top or
	599	# this is the first section
	600	my $real_field = $field;
	601	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
	602	my $new_text = "";
[1852]	603	my $tmp_text = "";
[932]	604	if ($real_field eq "text") {
[1852]	605	if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
[4769]	606	$new_text .= "$paratag<TX>\n";
	607	$tmp_text .= $doc_obj->get_text ($section);
	608	$tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
	609
[1852]	610	$new_text .= "$tmp_text</TX>\n";
[4769]	611	#if (!defined $self->{'indexfields'}->{'TextOnly'}) {
	612	#$self->{'indexfields'}->{'TextOnly'} = 1;
	613	#}
[1852]	614	}
	615	else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
[2480]	616	$new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
[1852]	617	}
[932]	618	} else { # metadata field
[4769]	619	if ($real_field eq "allfields") { #ignore
	620	}
	621	elsif ($real_field eq "metadata") { # insert all metadata
[1852]	622	#except gsdl stuff
	623	my $shortname = "";
[932]	624	my $metadata = $doc_obj->get_all_metadata ($section);
	625	foreach $pair (@$metadata) {
	626	my ($mfield, $mvalue) = (@$pair);
[1852]	627	# check fields here, maybe others dont want - change to use dontindex!!
[6546]	628	if ($mfield ne "Identifier"
	629	&& $mfield !~ /^gsdl/
[3226]	630	&& $mfield ne "classifytype"
	631	&& $mfield ne "assocfilepath"
	632	&& defined $mvalue && $mvalue ne "") {
[1852]	633
	634	if (defined $self->{'indexfieldmap'}->{$mfield}) {
	635	$shortname = $self->{'indexfieldmap'}->{$mfield};
	636	}
	637	else {
	638	$shortname = $self->create_shortname($mfield);
	639	$self->{'indexfieldmap'}->{$mfield} = $shortname;
	640	$self->{'indexfieldmap'}->{$shortname} = 1;
	641	}
	642	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
	643	if (!defined $self->{'indexfields'}->{$mfield}) {
	644	$self->{'indexfields'}->{$mfield} = 1;
	645	}
[932]	646	}
	647	}
[1852]	648
[932]	649	}
	650	else { #individual metadata specified
[1852]	651	my $shortname="";
[4769]	652	#if (!defined $self->{'indexfields'}->{$real_field}) {
	653	#$self->{'indexfields'}->{$real_field} = 1;
	654	#}
[1852]	655	if (defined $self->{'indexfieldmap'}->{$real_field}) {
	656	$shortname = $self->{'indexfieldmap'}->{$real_field};
	657	}
	658	else {
	659	$shortname = $self->create_shortname($real_field);
	660	$self->{'indexfieldmap'}->{$real_field} = $shortname;
	661	$self->{'indexfieldmap'}->{$shortname} = 1;
	662	}
[932]	663	foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
[1852]	664	$new_text .= "$paratag<$shortname>$item</$shortname>\n";
[932]	665	}
	666	}
	667
	668	}
	669
	670	# filter the text
	671	$self->filter_text ($field, $new_text);
	672
[1852]	673	$self->{'num_processed_bytes'} += length ($new_text);
[932]	674	$text .= "$new_text";
	675	}
	676	}
	677	} # if (indexed_doc)
	678
	679	$section = $doc_obj->get_next_section($section);
	680	} #while defined section
[4769]	681	print $handle "$text\n$documentendtag";
	682
[932]	683	}
	684
[3195]	685	#chooses the first two letters or digits for the shortname
	686	#now ignores non-letdig characters
[1852]	687	sub create_shortname {
	688	$self = shift(@_);
	689
	690	my ($realname) = @_;
	691	#take the first two chars
[3195]	692	my $shortname;
	693	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
	694	$shortname = "$1$2";
	695	} else {
	696	# there aren't two letdig's in the field - try arbitrary combinations
	697	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	698	$shortname = "AB";
	699	}
[1852]	700	$shortname =~ tr/a-z/A-Z/;
	701
[3195]	702	#if already used, take the first and third letdigs and so on
[1852]	703	$count = 1;
	704	while (defined $self->{'indexfieldmap'}->{$shortname}) {
[3195]	705	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
	706	$shortname = "$1$3";
	707	$count++;
	708	$shortname =~ tr/a-z/A-Z/;
[1852]	709
	710	}
	711	else {
[3195]	712	#remove up to and incl the first letdig
	713	$realname =~ s/^[^\w]*\w//;
[1852]	714	$count = 0;
	715	}
	716	}
	717
	718	return $shortname;
	719	}
	720
[932]	721	1;
	722

Note: See TracBrowser for help on using the repository browser.

Download in other formats: