Context Navigation

source: gsdl/trunk/perllib/mgppbuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago
Changed the way CJK separation is done: it is no longer done in the plugins, but is now an index option. cnseg is called from the filter_text method, and generate_index_options sets up the field in buildproc.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.4 KB

Rev	Line
[14912]	1	###########################################################################
	2	#
	3	# mgppbuildproc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# This document processor outputs a document
	27	# for mgpp to process
	28
	29
	30	package mgppbuildproc;
	31
	32	use basebuildproc;
[17110]	33	use cnseg;
	34
[14912]	35	use strict;
	36	no strict 'refs'; # allow filehandles to be variables and viceversa
	37
	38
	39	BEGIN {
	40	@mgppbuildproc::ISA = ('basebuildproc');
	41	}
	42
	43	#this must be the same as in mgppbuilder
	44	our %level_map = ('document'=>'Doc',
	45	'section'=>'Sec',
	46	'paragraph'=>'Para');
	47
	48	sub new {
	49	my $class = shift @_;
	50	my $self = new basebuildproc (@_);
	51
	52	# use a different index specification to the default
	53	$self->{'index'} = "text";
	54
	55	$self->{'dontindex'} = {};
	56	$self->{'indexfieldmap'} = {};
	57	$self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
	58	$self->{'strip_html'}=1;
	59
	60	return bless $self, $class;
	61	}
	62
	63
	64	sub set_indexfieldmap {
	65	my $self = shift (@_);
	66	my ($indexmap) = @_;
	67
	68	$self->{'indexfieldmap'} = $indexmap;
	69	}
	70
	71	sub get_indexfieldmap {
	72	my $self = shift (@_);
	73
	74	return $self->{'indexfieldmap'};
	75	}
	76
	77	sub set_levels {
	78	my $self = shift (@_);
	79	my ($levels) = @_;
	80
	81	$self->{'levels'} = $levels;
	82	}
	83
	84	sub set_strip_html {
	85	my $self = shift (@_);
	86	my ($strip) = @_;
	87	$self->{'strip_html'}=$strip;
	88	}
	89
	90	#sub find_paragraphs {
	91	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
	92	#}
	93
	94	sub remove_gtlt {
	95	my $self =shift(@_);
	96	my ($text, $para) = @_;
	97	$text =~s/[<>]//g;
	98	return "$para$text$para";
	99	}
	100
	101	sub process_tags {
	102	my $self = shift(@_);
	103	my ($text, $para) = @_;
	104	if ($text =~ /^p\b/i) {
	105	return $para;
	106	}
	107	return "";
	108	}
	109
	110	sub preprocess_text {
	111	my $self = shift (@_);
	112	my ($text, $strip_html, $para) = @_;
	113	# at this stage, we do not do paragraph tags unless have strip_html -
	114	# it will result in a huge mess of non-xml
	115	return unless $strip_html;
	116
	117	my $new_text = $text;
	118
	119	# if we have <pre> tags, we can have < > inside them, need to delete
	120	# the <> before stripping tags
	121	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
	122
	123	if ($para eq "") {
	124	# just remove all tags
	125	$new_text =~ s/<[^>]*>//gs;
	126	} else {
	127	# strip all tags except <p> tags which get turned into $para
	128	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
	129
	130	}
	131	return $new_text;
	132	}
	133	#this function strips the html tags from the doc if ($strip_html) and
	134	# if ($para) replaces <p> with <Paragraph> tags.
	135	# if both are false, the original text is returned
	136	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
	137	#these tags
	138	sub preprocess_text_old_and_slow {
	139	my $self = shift (@_);
	140	my ($text, $strip_html, $para) = @_;
	141	my ($outtext) = "";
	142	if ($strip_html) {
	143	while ($text =~ /<([^>]*)>/ && $text ne "") {
	144
	145	my $tag = $1;
	146	$outtext .= $`." "; #add everything before the matched tag
	147	$text = $'; #'everything after the matched tag
	148	if ($para && $tag =~ /^\s*p\s/i) {
	149	$outtext .= $para;
	150	}
	151	elsif ($tag =~ /^pre$/) { # a pre tag
	152	$text =~ /<\/pre>/; # find the closing pre tag
	153	my $tmp_text = $`; #everything before the closing pre tag
	154	$text = $'; #'everything after the </pre>
	155	$tmp_text =~ s/[<>]//g; # remove all < and >
	156	$outtext.= $tmp_text . " ";
	157	}
	158	}
	159
	160	$outtext .= $text; # add any remaining text
	161	return $outtext;
	162	} #if strip_html
	163
	164	#if ($para) {
	165	#$text =~ s/(<p\b)/$para$1/gi;
	166	#return $text;
	167	# }
	168	return $text;
	169	}
	170
	171	sub text {
	172	my $self = shift (@_);
	173	my ($doc_obj) = @_;
	174	my $handle = $self->{'output_handle'};
	175	my $outhandle = $self->{'outhandle'};
	176
	177	# only output this document if it is one to be indexed
	178	return if ($doc_obj->get_doc_type() ne "indexed_doc");
	179
	180	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
	181
	182	# this is another document
	183	$self->{'num_docs'} += 1;
	184
	185	# get the parameters for the output
	186	# split on : just in case there is subcoll and lang stuff
	187	my ($fields) = split (/:/, $self->{'index'});
	188
	189	# we always do text and index on Doc and Sec levels
	190	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
	191	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
	192	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
	193	my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
	194
	195	my ($paratag) = "";
	196
	197	# paragraph tags will only be used for indexing (can't retrieve
	198	# paragraphs), and can ony be used if we are stripping HTML tags
	199	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
	200	if ($self->{'strip_html'}) {
	201	$paratag = "<". $level_map{'paragraph'} . ">";
	202	} else {
	203	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
	204	}
	205	}
	206
	207	my $doc_section = 0; # just for this document
	208
	209	my $text = $documenttag;
	210
	211	# get the text for this document
	212	my $section = $doc_obj->get_top_section();
	213
	214	while (defined $section) {
	215	# update a few statistics
	216	$doc_section++;
	217	$self->{'num_sections'} += 1;
	218	$text .= "$sectiontag";
	219
	220	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
	221	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
	222	# we are not actually indexing anything for this document,
	223	# but we want to keep the section numbers the same, so we just
	224	# output section tags for each section (which is done above)
	225	$text .= "$sectionendtag";
	226	$section = $doc_obj->get_next_section($section);
	227	next;
	228	}
	229
	230	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
	231	foreach my $field (split (/;/, $fields)) {
	232	# only deal with this field if it doesn't start with top or
	233	# this is the first section
	234	my $real_field = $field;
	235	next if (($real_field =~ s/^top//) && ($doc_section != 1));
	236
	237	my $new_text = "";
	238
	239	# we get allfields by default - do nothing
	240	if ($real_field eq "allfields") {
	241
	242	}
	243
	244	# metadata - output all metadata we know about except gsdl stuff
	245	elsif ($real_field eq "metadata") {
	246	my $shortname = "";
	247	my $metadata = $doc_obj->get_all_metadata ($section);
	248	foreach my $pair (@$metadata) {
	249	my ($mfield, $mvalue) = (@$pair);
	250	# check fields here, maybe others dont want - change to use dontindex!!
	251	if ($mfield ne "Identifier"
	252	&& $mfield !~ /^gsdl/
	253	&& $mfield ne "classifytype"
	254	&& $mfield ne "assocfilepath"
	255	&& defined $mvalue && $mvalue ne "") {
	256
	257	if (defined $self->{'indexfieldmap'}->{$mfield}) {
	258	$shortname = $self->{'indexfieldmap'}->{$mfield};
	259	}
	260	else {
	261	$shortname = $self->create_shortname($mfield);
	262	$self->{'indexfieldmap'}->{$mfield} = $shortname;
	263	$self->{'indexfieldmap'}->{$shortname} = 1;
	264	}
	265	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
	266	if (!defined $self->{'indexfields'}->{$mfield}) {
	267	$self->{'indexfields'}->{$mfield} = 1;
	268	}
	269	}
	270	}
	271	}
	272	else {
	273	#individual metadata and or text specified - could be
	274	# a comma separated list
	275	my $shortname="";
	276	if (defined $self->{'indexfieldmap'}->{$real_field}) {
	277	$shortname = $self->{'indexfieldmap'}->{$real_field};
	278	}
	279	else {
	280	$shortname = $self->create_shortname($real_field);
	281	$self->{'indexfieldmap'}->{$real_field} = $shortname;
	282	$self->{'indexfieldmap'}->{$shortname} = 1;
	283	}
	284	my @metadata_list = ();
	285	foreach my $submeta (split /,/, $real_field) {
	286	if ($submeta eq "text") {
	287	my $section_text = $doc_obj->get_text($section);
	288	if ($self->{'indexing_text'}) {
	289	# tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
	290	$new_text .= "$paratag<$shortname>\n";
	291	if ($paratag ne "") {
	292	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
	293	}
	294	else {
	295	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
	296	}
	297	$new_text .= "$section_text</$shortname>\n";
	298	}
	299	else {
	300	# leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
	301	$new_text .= $section_text;
	302	}
	303	}
	304	else {
	305	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
	306	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
	307	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
	308	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
	309	}
	310	}
	311	push (@metadata_list, @section_metadata);
	312	}
	313	}
	314	foreach my $item (@metadata_list) {
	315	$new_text .= "$paratag<$shortname>$item</$shortname>\n";
	316	}
	317	}
	318
	319	# filter the text
[17110]	320	$new_text = $self->filter_text ($field, $new_text);
[14912]	321
	322	$self->{'num_processed_bytes'} += length ($new_text);
	323	$text .= "$new_text";
	324	} # foreach field
	325
	326	$text .= "$sectionendtag";
	327	$section = $doc_obj->get_next_section($section);
	328	} # while defined section
	329	print $handle "$text\n$documentendtag";
	330
	331	}
	332
	333	#chooses the first two letters or digits for the shortname
	334	#now ignores non-letdig characters
	335	sub create_shortname {
	336	my $self = shift(@_);
	337
	338	my ($realname) = @_;
	339	#take the first two chars
	340	my $shortname;
	341	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
	342	$shortname = "$1$2";
	343	} else {
	344	# there aren't two letdig's in the field - try arbitrary combinations
	345	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
	346	$shortname = "AB";
	347	}
	348	$shortname =~ tr/a-z/A-Z/;
	349
	350	#if already used, take the first and third letdigs and so on
	351	my $count = 1;
	352	while (defined $self->{'indexfieldmap'}->{$shortname}) {
	353	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
	354	$shortname = "$1$3";
	355	$count++;
	356	$shortname =~ tr/a-z/A-Z/;
	357
	358	}
	359	else {
	360	#remove up to and incl the first letdig
	361	$realname =~ s/^[^\w]*\w//;
	362	$count = 0;
	363	}
	364	}
	365
	366	return $shortname;
	367	}
	368
	369	1;
	370

Note: See TracBrowser for help on using the repository browser.

Download in other formats: