Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 12270

Last change on this file since 12270 was 10474, checked in by kjdon, 19 years ago
implemented sections_index_document_metadata
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 4.7 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
package mgbuildproc;

# Catch undeclared variables and bareword mistakes at compile time.
use strict;
# text() prints to $self->{'output_handle'}; symbolic references stay
# enabled in case callers supply that handle as a filehandle *name*
# (a plain string) rather than a glob or reference -- TODO confirm
# against the callers and drop this line if they never do.
no strict 'refs';

use basebuildproc;

# mgbuildproc is a basebuildproc subclass.  @ISA is assigned inside BEGIN
# so that the inheritance is in place while the rest of this file compiles.
BEGIN {
    @mgbuildproc::ISA = ('basebuildproc');
}
37
# Constructor.
#
# Builds the object with the basebuildproc constructor, forwarding every
# argument after the class name untouched, then re-blesses the result into
# the class this constructor was invoked on, so that subclasses of
# mgbuildproc get an object of their own class.
sub new {
    my $class = shift @_;

    # Explicit arrow call rather than the indirect-object form
    # "new basebuildproc (...)": the indirect form is parsed by heuristic
    # and is especially fragile inside a sub that is itself called "new".
    my $self = basebuildproc->new(@_);

    return bless $self, $class;
}
43
44
# Mark paragraph starts in a piece of text.
#
# Called as $self->find_paragraphs($text).  The text is changed IN PLACE
# through the @_ alias (nothing is copied): a \cC character is inserted in
# front of every "<p" that ends on a word boundary, matched
# case-insensitively, so "<p>" and "<P class=...>" are marked while "<pre>"
# is left alone.  Returns the result of the substitution, i.e. the number
# of markers inserted, or an empty string when there were none.
sub find_paragraphs {
    # Hold a reference to the caller's own scalar so that the edit below
    # lands in the caller's variable.
    my $text_ref = \$_[1];

    return $$text_ref =~ s/(<p\b)/\cC$1/gi;
}
48
# Post-processing hook for the text of a single field.
#
# Called as $self->filter_text($field, $new_text) just before a field's
# text is appended to the output.  This implementation intentionally does
# nothing.  A particular collection can override the method to rework
# certain fields, choosing what to do by the field name or by whether the
# text is being output for indexing.
sub filter_text {
    return;
}
56
# Write the mg input text for one document to $self->{'output_handle'}.
#
# Called as $self->text($doc_obj).
#
# * Documents whose doc type is not "indexed_doc" are skipped completely:
#   nothing is counted and nothing is printed.
# * $self->{'index'} has the form "level:field,field,...".  In the field
#   list "all" expands to Title,Creator,text and "topall" to
#   topTitle,topCreator,toptext.  A field with a "top" prefix is only
#   output for the first section of the document.
# * Each field's text is followed by \cC and each section by \cB.  For the
#   "document" level the \cB terminators are gathered and printed after all
#   of the text instead of after each section.
# * Statistics are kept in $self: num_docs, num_sections, num_bytes and
#   num_processed_bytes.
#
# Returns whatever the final print returns.
sub text {
    my ($self, $doc_obj) = @_;
    my $out = $self->{'output_handle'};

    # only documents that are meant to be indexed produce output
    return unless $doc_obj->get_doc_type() eq "indexed_doc";

    # a document outside this subcollection is still counted and still gets
    # its section terminators, but none of its field text is output
    my $in_subcollection = $self->is_subcollection_doc($doc_obj);

    $self->{'num_docs'} += 1;

    # work out the level and the list of fields to output
    my ($level, $fields) = split (/:/, $self->{'index'});
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;

    my $section_count = 0;  # sections seen so far in this document
    my $body = "";          # field text (plus per-section terminators)
    my $trailer = "";       # terminators held back for a document-level index

    # decide once where the per-section \cB terminators are collected
    my $terminators = ($level eq "document") ? \$trailer : \$body;

    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section)) {

        # update a few statistics
        $section_count += 1;
        $self->{'num_sections'} += 1;

        if ($in_subcollection) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

          FIELD:
            foreach my $field (split (/,/, $fields)) {
                # "top" fields are only output for the first section
                my $real_field = $field;
                my $is_top_field = ($real_field =~ s/^top//);
                next FIELD if $is_top_field && $section_count != 1;

                my $new_text = "";
                if ($level eq "dummy") {
                    # a dummy index is used when no indexes are specified
                    # (there must always be at least one index or the
                    # compressed text can't be retrieved); this text is
                    # never seen but stops mg from having to build an
                    # empty index
                    $new_text = "this is dummy text to stop mg barfing";
                    $self->{'num_processed_bytes'} += length ($new_text);
                }
                elsif ($real_field eq "text") {
                    $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
                    # bytes are counted before the separator characters
                    # are stripped and the paragraph markers are added
                    $self->{'num_processed_bytes'} += length ($new_text);
                    $new_text =~ s/[\cB\cC]//g;
                    $self->find_paragraphs($new_text);
                }
                else {
                    # copy the values so that the clean-up below never
                    # touches the array handed back by the document
                    my @values = @{$doc_obj->get_metadata ($section, $real_field)};

                    # when building a section-level index, document (top
                    # section) metadata can be added to every section:
                    # either always, or only when the section has no value
                    # of its own for this field
                    my $policy = $self->{'sections_index_document_metadata'};
                    if ($level eq "section" && $self->{'indexing_text'} && defined ($policy)
                        && ($policy eq "always"
                            || (scalar(@values) == 0 && $policy eq "unless_section_metadata_exists"))) {
                        push (@values, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
                    }

                    # join the values with \cC; the separators are added
                    # even when store_text is off and the values are not
                    my $separator = "";
                    foreach my $value (@values) {
                        $value =~ s/[\cB\cC]//g;
                        $self->{'num_processed_bytes'} += length ($value);
                        $new_text .= $separator;
                        $new_text .= $value if $self->{'store_text'};
                        $separator = "\cC";
                    }
                }

                # give a collection-specific override the chance to
                # rework the text
                $self->filter_text ($field, $new_text);

                $body .= "$new_text\cC";
            }
        }

        $$terminators .= "\cB";
    }

    print {$out} $body . $trailer;
}
145
146	1;
147

Note: See TracBrowser for help on using the repository browser.

Download in other formats: