Context Navigation

source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 30599

Last change on this file since 30599 was 27328, checked in by kjdon, 11 years ago
changed the way we store the list of fields that has been indexed, and the mapping between index and shortname. They are separated now, to avoid calculating a shortname for a field each time a new document is indexed - previously if there was no value, then the shortname was not remembered as it wasn't indexed, so each new document saw the shortname being calculated again. remove namespaces from meta fields before calculating shortnames, to make them more sensible. eg dc.Title->TI instead of DC.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.3 KB

Rev	Line
[14912]	1	###########################################################################
	2	#
	3	# mgppbuildproc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# This document processor outputs a document
	27	# for mgpp to process
	28
	29
	30	package mgppbuildproc;
	31
	32	use basebuildproc;
[17110]	33	use cnseg;
	34
[14912]	35	use strict;
	36	no strict 'refs'; # allow filehandles to be variables and viceversa
	37
	38
	39	BEGIN {
	40	@mgppbuildproc::ISA = ('basebuildproc');
	41	}
	42
	43	#this must be the same as in mgppbuilder
	44	our %level_map = ('document'=>'Doc',
	45	'section'=>'Sec',
	46	'paragraph'=>'Para');
	47
[17564]	48	# change this so a user can add their own ones in via a file or cfg
	49	#add AND, OR, NOT NEAR to this list - these cannot be used as field names
	50	#also add the level names (Doc, Sec, Para)
	51	our %static_indexfield_map = ('Title'=>'TI',
	52	'TI'=>1,
	53	'Subject'=>'SU',
	54	'SU'=>1,
	55	'Creator'=>'CR',
	56	'CR'=>1,
	57	'Organization'=>'ORG',
	58	'ORG'=>1,
	59	'Source'=>'SO',
	60	'SO'=>1,
	61	'Howto'=>'HT',
	62	'HT'=>1,
	63	'ItemTitle'=>'IT',
	64	'IT'=>1,
	65	'ProgNumber'=>'PN',
	66	'PN'=>1,
	67	'People'=>'PE',
	68	'PE'=>1,
	69	'Coverage'=>'CO',
	70	'CO'=>1,
	71	'allfields'=>'ZZ',
	72	'ZZ'=>1,
	73	'text'=>'TX',
	74	'TX'=>1,
	75	'AND'=>1,
	76	'OR'=>1,
	77	'NOT'=>1,
	78	'NEAR'=>1,
	79	'Doc'=>1,
	80	'Sec'=>1,
	81	'Para'=>1);
	82
	83
[14912]	84	sub new {
	85	my $class = shift @_;
	86	my $self = new basebuildproc (@_);
	87
	88	# use a different index specification to the default
	89	$self->{'index'} = "text";
	90
	91	$self->{'dontindex'} = {};
[27328]	92	$self->{'allindexfields'} = {}; # list of all actually indexed fields
	93	$self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified.
	94	$self->{'fieldnamemap'} = {'allfields'=>'ZZ',
	95	'ZZ'=>1,
	96	'text'=>'TX',
	97	'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not.
[14912]	98	$self->{'strip_html'}=1;
	99
	100	return bless $self, $class;
	101	}
	102
	103	sub set_levels {
	104	my $self = shift (@_);
	105	my ($levels) = @_;
	106
	107	$self->{'levels'} = $levels;
	108	}
	109
	110	sub set_strip_html {
	111	my $self = shift (@_);
	112	my ($strip) = @_;
	113	$self->{'strip_html'}=$strip;
	114	}
	115
	116	#sub find_paragraphs {
	117	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
	118	#}
	119
	120	sub remove_gtlt {
	121	my $self =shift(@_);
	122	my ($text, $para) = @_;
	123	$text =~s/[<>]//g;
	124	return "$para$text$para";
	125	}
	126
	127	sub process_tags {
	128	my $self = shift(@_);
	129	my ($text, $para) = @_;
	130	if ($text =~ /^p\b/i) {
	131	return $para;
	132	}
	133	return "";
	134	}
	135
	136	sub preprocess_text {
	137	my $self = shift (@_);
	138	my ($text, $strip_html, $para) = @_;
	139	# at this stage, we do not do paragraph tags unless have strip_html -
	140	# it will result in a huge mess of non-xml
	141	return unless $strip_html;
	142
	143	my $new_text = $text;
	144
	145	# if we have <pre> tags, we can have < > inside them, need to delete
	146	# the <> before stripping tags
	147	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
	148
	149	if ($para eq "") {
	150	# just remove all tags
	151	$new_text =~ s/<[^>]*>//gs;
	152	} else {
	153	# strip all tags except <p> tags which get turned into $para
	154	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
	155
	156	}
	157	return $new_text;
	158	}
	159	#this function strips the html tags from the doc if ($strip_html) and
	160	# if ($para) replaces <p> with <Paragraph> tags.
	161	# if both are false, the original text is returned
	162	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
	163	#these tags
	164	sub preprocess_text_old_and_slow {
	165	my $self = shift (@_);
	166	my ($text, $strip_html, $para) = @_;
	167	my ($outtext) = "";
	168	if ($strip_html) {
	169	while ($text =~ /<([^>]*)>/ && $text ne "") {
	170
	171	my $tag = $1;
	172	$outtext .= $`." "; #add everything before the matched tag
	173	$text = $'; #'everything after the matched tag
	174	if ($para && $tag =~ /^\s*p\s/i) {
	175	$outtext .= $para;
	176	}
	177	elsif ($tag =~ /^pre$/) { # a pre tag
	178	$text =~ /<\/pre>/; # find the closing pre tag
	179	my $tmp_text = $`; #everything before the closing pre tag
	180	$text = $'; #'everything after the </pre>
	181	$tmp_text =~ s/[<>]//g; # remove all < and >
	182	$outtext.= $tmp_text . " ";
	183	}
	184	}
	185
	186	$outtext .= $text; # add any remaining text
	187	return $outtext;
	188	} #if strip_html
	189
	190	#if ($para) {
	191	#$text =~ s/(<p\b)/$para$1/gi;
	192	#return $text;
	193	# }
	194	return $text;
	195	}
	196
	197	sub text {
	198	my $self = shift (@_);
	199	my ($doc_obj) = @_;
	200	my $handle = $self->{'output_handle'};
	201	my $outhandle = $self->{'outhandle'};
	202
	203	# only output this document if it is one to be indexed
	204	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	205
	206	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
	207
	208	# this is another document
	209	$self->{'num_docs'} += 1;
	210
	211	# get the parameters for the output
	212	# split on : just in case there is subcoll and lang stuff
	213	my ($fields) = split (/:/, $self->{'index'});
	214
	215	# we always do text and index on Doc and Sec levels
	216	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
	217	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
	218	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
	219	my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
	220
	221	my ($paratag) = "";
	222
	223	# paragraph tags will only be used for indexing (can't retrieve
	224	# paragraphs), and can ony be used if we are stripping HTML tags
	225	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
	226	if ($self->{'strip_html'}) {
	227	$paratag = "<". $level_map{'paragraph'} . ">";
	228	} else {
	229	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
	230	}
	231	}
	232
	233	my $doc_section = 0; # just for this document
	234
	235	my $text = $documenttag;
	236
	237	# get the text for this document
	238	my $section = $doc_obj->get_top_section();
	239
	240	while (defined $section) {
	241	# update a few statistics
	242	$doc_section++;
	243	$self->{'num_sections'} += 1;
	244	$text .= "$sectiontag";
	245
	246	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
	247	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
	248	# we are not actually indexing anything for this document,
	249	# but we want to keep the section numbers the same, so we just
	250	# output section tags for each section (which is done above)
	251	$text .= "$sectionendtag";
	252	$section = $doc_obj->get_next_section($section);
	253	next;
	254	}
	255
	256	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
[17567]	257
	258	# has the user added a 'metadata' index?
	259	my $all_metadata_specified = 0;
	260	# which fields have already been indexed? (same as fields, but in a map)
	261	my $specified_fields = {};
[14912]	262	foreach my $field (split (/;/, $fields)) {
	263	# only deal with this field if it doesn't start with top or
	264	# this is the first section
	265	my $real_field = $field;
	266	next if (($real_field =~ s/^top//) && ($doc_section != 1));
	267
	268	my $new_text = "";
	269
[27328]	270	# we get allfields by default
	271	next if ($real_field eq "allfields");
[14912]	272
	273	# metadata - output all metadata we know about except gsdl stuff
[17564]	274	# each metadata is in a separate index field
[17567]	275	if ($real_field eq "metadata") {
	276	# we will process this later, so we are not reindexing metadata already indexed
	277	$all_metadata_specified = 1;
[27328]	278	next;
[14912]	279	}
[17567]	280
[14912]	281	#individual metadata and or text specified - could be
	282	# a comma separated list
[17567]	283	$specified_fields->{$real_field} = 1;
[14912]	284	my $shortname="";
[17564]	285
[27328]	286	if (defined $self->{'fieldnamemap'}->{$real_field}) {
	287	$shortname = $self->{'fieldnamemap'}->{$real_field};
	288	} else {
	289	$shortname = $self->create_shortname($real_field);
	290	$self->{'fieldnamemap'}->{$real_field} = $shortname;
	291	$self->{'fieldnamemap'}->{$shortname} = 1;
	292	}
	293
[17564]	294	my @metadata_list = (); # put any meta values in here
	295	my $section_text = ""; # put any text in here
[14912]	296	foreach my $submeta (split /,/, $real_field) {
[17564]	297	if ($submeta eq "text") {
	298	# no point in indexing text more than once
	299	if ($section_text eq "") {
	300	$section_text = $doc_obj->get_text($section);
	301	if ($self->{'indexing_text'}) {
	302	if ($paratag ne "") {
	303	# we fiddle around with splitting text into paragraphs
	304	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
	305	}
	306	else {
	307	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
	308	}
[14912]	309	}
	310	}
	311	}
	312	else {
[24404]	313	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
[17564]	314	# its a metadata element
[14912]	315	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
	316	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
	317	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
	318	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
	319	}
	320	}
	321	push (@metadata_list, @section_metadata);
	322	}
[17564]	323	} # for each field in index
	324
	325
	326	# now we add the text and/or the metadata into new_text
	327	if ($section_text ne "" \|\| scalar(@metadata_list)) {
[17797]	328	if ($self->{'indexing_text'}) {
	329	# only add tags in if indexing
	330	$new_text .= "$paratag<$shortname>";
	331	}
[17564]	332	if ($section_text ne "") {
	333	$new_text .= "$section_text ";
[17797]	334	if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
[17564]	335	$new_text .= "</$shortname>$paratag<$shortname>";
	336	}
	337	}
	338	foreach my $item (@metadata_list) {
	339	$new_text .= "$item ";
	340	}
[17797]	341	if ($self->{'indexing_text'}) {
	342	# only add tags in if indexing
	343	$new_text .= "</$shortname>";
[27328]	344	$self->{'allindexfields'}->{$real_field} = 1;
[17797]	345	}
[14912]	346	}
[17564]	347
[14912]	348	# filter the text
[17110]	349	$new_text = $self->filter_text ($field, $new_text);
[14912]	350
	351	$self->{'num_processed_bytes'} += length ($new_text);
	352	$text .= "$new_text";
	353	} # foreach field
[17567]	354
	355	if ($all_metadata_specified) {
	356	my $new_text = "";
	357	my $shortname = "";
	358	my $metadata = $doc_obj->get_all_metadata ($section);
	359	foreach my $pair (@$metadata) {
	360	my ($mfield, $mvalue) = (@$pair);
	361	# no value
	362	next unless defined $mvalue && $mvalue ne "";
	363	# we have already indexed this
	364	next if defined ($specified_fields->{$mfield});
	365	# check fields here, maybe others dont want - change to use dontindex!!
	366	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
	367	next if ($mfield =~ /^gsdl/);
	368
[27328]	369	if (defined $self->{'fieldnamemap'}->{$mfield}) {
	370	$shortname = $self->{'fieldnamemap'}->{$mfield};
	371	} else {
	372	$shortname = $self->create_shortname($mfield);
	373	$self->{'fieldnamemap'}->{$mfield} = $shortname;
	374	$self->{'fieldnamemap'}->{$shortname} = 1;
[17567]	375	}
[27328]	376	$self->{'allindexfields'}->{$mfield} = 1;
[17567]	377	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
[27328]	378	if (!defined $self->{'extraindexfields'}->{$mfield}) {
	379	$self->{'extraindexfields'}->{$mfield} = 1;
[17567]	380	}
	381
	382	}
	383	# filter the text
	384	$new_text = $self->filter_text ("metadata", $new_text);
	385
	386	$self->{'num_processed_bytes'} += length ($new_text);
	387	$text .= "$new_text";
	388
	389
	390	}
	391
[14912]	392	$text .= "$sectionendtag";
	393	$section = $doc_obj->get_next_section($section);
	394	} # while defined section
	395	print $handle "$text\n$documentendtag";
[17567]	396	#print STDERR "*********\n$text\n*************\n";
[14912]	397
	398	}
	399
	400	#chooses the first two letters or digits for the shortname
	401	#now ignores non-letdig characters
	402	sub create_shortname {
	403	my $self = shift(@_);
	404
	405	my ($realname) = @_;
[27328]	406	my @realnamelist = split(",", $realname);
	407	map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces
	408	my ($singlename) = $realnamelist[0];
	409
[17564]	410	# try our predefined static mapping
[27328]	411	my $name;
	412	if (defined ($name = $static_indexfield_map{$singlename})) {
	413	if (! defined $self->{'fieldnamemap'}->{$name}) {
	414	# has this shortname already been used??
	415	return $static_indexfield_map{$singlename};
	416	}
[17564]	417	}
[27328]	418	# we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters.
	419	$realname = join ("", @realnamelist);
[17564]	420	#try the first two chars
[14912]	421	my $shortname;
	422	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
	423	$shortname = "$1$2";
	424	} else {
	425	# there aren't two letdig's in the field - try arbitrary combinations
	426	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	427	$shortname = "AB";
	428	}
	429	$shortname =~ tr/a-z/A-Z/;
	430
	431	#if already used, take the first and third letdigs and so on
	432	my $count = 1;
[27328]	433	while (defined $self->{'fieldnamemap'}->{$shortname} \|\| defined $static_indexfield_map{$shortname}) {
[14912]	434	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
	435	$shortname = "$1$3";
	436	$count++;
	437	$shortname =~ tr/a-z/A-Z/;
	438
	439	}
	440	else {
	441	#remove up to and incl the first letdig
	442	$realname =~ s/^[^\w]*\w//;
	443	$count = 0;
	444	}
	445	}
	446
	447	return $shortname;
	448	}
	449
	450	1;
	451

Note: See TracBrowser for help on using the repository browser.

Download in other formats: