Context Navigation

mgbuildproc.pm@ 25758

Last change on this file since 25758 was 24404, checked in by ak19, 13 years ago
Changes to perl code to do with removing the ex. prefix: ex. is only removed if it is the sole prefix (i.e. ex.dc.* prefixes are not removed).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 4.7 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# mgbuildproc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[17]	26	# This document processor outputs a document
	27	# for mg to process
[4]	28
	29	package mgbuildproc;
	30
[15738]	31
[9919]	32	use basebuildproc;
[15738]	33	use strict;
[3767]	34
[15738]	35
[4]	36	BEGIN {
[9919]	37	@mgbuildproc::ISA = ('basebuildproc');
[4]	38	}
	39
	40	sub new {
[9919]	41	my $class = shift @_;
	42	my $self = new basebuildproc (@_);
[4]	43	return bless $self, $class;
	44	}
	45
	46
[289]	47	sub find_paragraphs {
	48	$_[1] =~ s/(<p\b)/\cC$1/gi;
	49	}
	50
[4]	51	sub text {
	52	my $self = shift (@_);
	53	my ($doc_obj) = @_;
	54	my $handle = $self->{'output_handle'};
[9919]	55
[4]	56	# only output this document if it is one to be indexed
	57	return if ($doc_obj->get_doc_type() ne "indexed_doc");
[9919]	58
[69]	59	# see if this document belongs to this subcollection
[9919]	60	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
[69]	61
[4]	62	# this is another document
	63	$self->{'num_docs'} += 1;
	64
	65	# get the parameters for the output
	66	my ($level, $fields) = split (/:/, $self->{'index'});
	67	$fields =~ s/\ball\b/Title,Creator,text/;
	68	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
	69
	70	my $doc_section = 0; # just for this document
	71	my $text = "";
	72	my $text_extra = "";
	73
	74	# get the text for this document
	75	my $section = $doc_obj->get_top_section();
	76	while (defined $section) {
	77	# update a few statistics
	78	$doc_section++;
	79	$self->{'num_sections'} += 1;
[69]	80
[12274]	81	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
[12356]	82	if (($indexed_doc) && ($indexed_section eq "indexed_section" \|\| $indexed_section eq "indexed_doc")) {
[69]	83	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
[900]	84	foreach my $field (split (/,/, $fields)) {
[69]	85	# only deal with this field if it doesn't start with top or
	86	# this is the first section
	87	my $real_field = $field;
	88	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
	89	my $new_text = "";
[4743]	90	if ($level eq "dummy") {
	91	# a dummy index is a special case used when no
	92	# indexes are specified (since there must always be
	93	# at least one index or we can't retrieve the
	94	# compressed text) - we add a small amount of text
	95	# to these dummy indexes which will never be seen
	96	# but will overcome mg's problems with building
	97	# empty indexes
	98	$new_text = "this is dummy text to stop mg barfing";
	99	$self->{'num_processed_bytes'} += length ($new_text);
	100
	101	} elsif ($real_field eq "text") {
[2336]	102	$new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
[1251]	103	$self->{'num_processed_bytes'} += length ($new_text);
[69]	104	$new_text =~ s/[\cB\cC]//g;
[289]	105	$self->find_paragraphs($new_text);
[69]	106
	107	} else {
[1072]	108	my $first = 1;
[24404]	109	$real_field =~ s/^ex\.([^.]+)$/$1/; # remove ex. namespace iff it's the only namespace prefix (will leave ex.dc.* intact)
[10474]	110	my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
[12371]	111	if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
[10474]	112	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
	113	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
	114	}
	115	}
	116	foreach my $meta (@section_metadata) {
[1072]	117	$meta =~ s/[\cB\cC]//g;
[1251]	118	$self->{'num_processed_bytes'} += length ($meta);
[1072]	119	$new_text .= "\cC" unless $first;
[2336]	120	$new_text .= $meta if $self->{'store_text'};
[1072]	121	$first = 0;
	122	}
[69]	123	}
[4]	124
[292]	125	# filter the text
[17110]	126	$new_text = $self->filter_text ($field, $new_text);
[292]	127
[69]	128	$text .= "$new_text\cC";
[4]	129	}
	130	}
	131	}
	132
	133	if ($level eq "document") { $text_extra .= "\cB"; }
	134	else { $text .= "\cB"; }
	135
	136	$section = $doc_obj->get_next_section($section);
	137	}
	138
	139	print $handle "$text$text_extra";
	140	}
	141
	142	1;
	143

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/mgbuildproc.pm@ 25758

Download in other formats: