Context Navigation

source: gsdl/branches/protos-branch/perllib/mgppbuildproc.pm@ 18665

Last change on this file since 18665 was 13590, checked in by kjdon, 17 years ago
mgpp and lucene. made them always use doc and sec levels for the text regardless of index level specification. mgpp will always index at doc and sec level, but these options may not be presented to the user. this is to ensure that if we have sectioned documents, we don't need to turn on section indexing in order for the document display to use sections
Property svn:keywords set to `Author Date Id Revision`
File size: 11.2 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	use basebuildproc;
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
36
37	BEGIN {
38	@mgppbuildproc::ISA = ('basebuildproc');
39	}
40
41	#this must be the same as in mgppbuilder
42	our %level_map = ('document'=>'Doc',
43	'section'=>'Sec',
44	'paragraph'=>'Para');
45
46	sub new {
47	my $class = shift @_;
48	my $self = new basebuildproc (@_);
49
50	# use a different index specification to the default
51	$self->{'index'} = "text";
52
53	$self->{'dontindex'} = {};
54	$self->{'indexfieldmap'} = {};
55	$self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
56	$self->{'strip_html'}=1;
57
58	return bless $self, $class;
59	}
60
61
62	sub set_indexfieldmap {
63	my $self = shift (@_);
64	my ($indexmap) = @_;
65
66	$self->{'indexfieldmap'} = $indexmap;
67	}
68
69	sub get_indexfieldmap {
70	my $self = shift (@_);
71
72	return $self->{'indexfieldmap'};
73	}
74
75	sub set_levels {
76	my $self = shift (@_);
77	my ($levels) = @_;
78
79	$self->{'levels'} = $levels;
80	}
81
82	sub set_strip_html {
83	my $self = shift (@_);
84	my ($strip) = @_;
85	$self->{'strip_html'}=$strip;
86	}
87
88	#sub find_paragraphs {
89	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
90	#}
91
92	sub remove_gtlt {
93	my $self =shift(@_);
94	my ($text, $para) = @_;
95	$text =~s/[<>]//g;
96	return "$para$text$para";
97	}
98
99	sub process_tags {
100	my $self = shift(@_);
101	my ($text, $para) = @_;
102	if ($text =~ /^p\b/i) {
103	return $para;
104	}
105	return "";
106	}
107
108	sub preprocess_text {
109	my $self = shift (@_);
110	my ($text, $strip_html, $para) = @_;
111	# at this stage, we do not do paragraph tags unless have strip_html -
112	# it will result in a huge mess of non-xml
113	return unless $strip_html;
114
115	my $new_text = $text;
116
117	# if we have <pre> tags, we can have < > inside them, need to delete
118	# the <> before stripping tags
119	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
120
121	if ($para eq "") {
122	# just remove all tags
123	$new_text =~ s/<[^>]*>//gs;
124	} else {
125	# strip all tags except <p> tags which get turned into $para
126	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
127
128	}
129	return $new_text;
130	}
131	#this function strips the html tags from the doc if ($strip_html) and
132	# if ($para) replaces <p> with <Paragraph> tags.
133	# if both are false, the original text is returned
134	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
135	#these tags
136	sub preprocess_text_old_and_slow {
137	my $self = shift (@_);
138	my ($text, $strip_html, $para) = @_;
139	my ($outtext) = "";
140	if ($strip_html) {
141	while ($text =~ /<([^>]*)>/ && $text ne "") {
142
143	my $tag = $1;
144	$outtext .= $`." "; #add everything before the matched tag
145	$text = $'; #'everything after the matched tag
146	if ($para && $tag =~ /^\s*p\s/i) {
147	$outtext .= $para;
148	}
149	elsif ($tag =~ /^pre$/) { # a pre tag
150	$text =~ /<\/pre>/; # find the closing pre tag
151	my $tmp_text = $`; #everything before the closing pre tag
152	$text = $'; #'everything after the </pre>
153	$tmp_text =~ s/[<>]//g; # remove all < and >
154	$outtext.= $tmp_text . " ";
155	}
156	}
157
158	$outtext .= $text; # add any remaining text
159	return $outtext;
160	} #if strip_html
161
162	#if ($para) {
163	#$text =~ s/(<p\b)/$para$1/gi;
164	#return $text;
165	# }
166	return $text;
167	}
168
169
170
171	sub filter_text {
172	# $self->filter_text ($field, $new_text);
173	# don't want to do anything for this version, however,
174	# in a particular collection you might want to override
175	# this method to post-process certain fields depending on
176	# the field, or whether we are outputting it for indexing
177	}
178
179	sub text {
180	my $self = shift (@_);
181	my ($doc_obj) = @_;
182	my $handle = $self->{'output_handle'};
183	my $outhandle = $self->{'outhandle'};
184
185	# only output this document if it is one to be indexed
186	return if ($doc_obj->get_doc_type() ne "indexed_doc");
187
188	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
189
190	# this is another document
191	$self->{'num_docs'} += 1;
192
193	# get the parameters for the output
194	# split on : just in case there is subcoll and lang stuff
195	my ($fields) = split (/:/, $self->{'index'});
196
197	# we always do text and index on Doc and Sec levels
198	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
199	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
200	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
201	my ($paratag) = "";
202
203	# paragraph tags will only be used for indexing (can't retrieve
204	# paragraphs), and can ony be used if we are stripping HTML tags
205	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
206	if ($self->{'strip_html'}) {
207	$paratag = "<". $level_map{'paragraph'} . ">";
208	} else {
209	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
210	}
211	}
212
213	my $doc_section = 0; # just for this document
214
215	my $text = $documenttag;
216
217	# get the text for this document
218	my $section = $doc_obj->get_top_section();
219
220	while (defined $section) {
221	# update a few statistics
222	$doc_section++;
223	$self->{'num_sections'} += 1;
224	$text .= "$sectiontag";
225
226	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
227	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
228	# we are not actually indexing anything for this document,
229	# but we want to keep the section numbers the same, so we just
230	# output section tags for each section (which is done above)
231	$section = $doc_obj->get_next_section($section);
232	next;
233	}
234
235	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
236	foreach my $field (split (/;/, $fields)) {
237	# only deal with this field if it doesn't start with top or
238	# this is the first section
239	my $real_field = $field;
240	next if (($real_field =~ s/^top//) && ($doc_section != 1));
241
242	my $new_text = "";
243
244	# we get allfields by default - do nothing
245	if ($real_field eq "allfields") {
246
247	}
248
249	# metadata - output all metadata we know about except gsdl stuff
250	elsif ($real_field eq "metadata") {
251	my $shortname = "";
252	my $metadata = $doc_obj->get_all_metadata ($section);
253	foreach my $pair (@$metadata) {
254	my ($mfield, $mvalue) = (@$pair);
255	# check fields here, maybe others dont want - change to use dontindex!!
256	if ($mfield ne "Identifier"
257	&& $mfield !~ /^gsdl/
258	&& $mfield ne "classifytype"
259	&& $mfield ne "assocfilepath"
260	&& defined $mvalue && $mvalue ne "") {
261
262	if (defined $self->{'indexfieldmap'}->{$mfield}) {
263	$shortname = $self->{'indexfieldmap'}->{$mfield};
264	}
265	else {
266	$shortname = $self->create_shortname($mfield);
267	$self->{'indexfieldmap'}->{$mfield} = $shortname;
268	$self->{'indexfieldmap'}->{$shortname} = 1;
269	}
270	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
271	if (!defined $self->{'indexfields'}->{$mfield}) {
272	$self->{'indexfields'}->{$mfield} = 1;
273	}
274	}
275	}
276	}
277	else {
278	#individual metadata and or text specified - could be
279	# a comma separated list
280	my $shortname="";
281	if (defined $self->{'indexfieldmap'}->{$real_field}) {
282	$shortname = $self->{'indexfieldmap'}->{$real_field};
283	}
284	else {
285	$shortname = $self->create_shortname($real_field);
286	$self->{'indexfieldmap'}->{$real_field} = $shortname;
287	$self->{'indexfieldmap'}->{$shortname} = 1;
288	}
289	my @metadata_list = ();
290	foreach my $submeta (split /,/, $real_field) {
291	if ($submeta eq "text") {
292	my $section_text = $doc_obj->get_text($section);
293	if ($self->{'indexing_text'}) {
294	# tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
295	$new_text .= "$paratag<$shortname>\n";
296	if ($paratag ne "") {
297	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
298	}
299	else {
300	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
301	}
302	$new_text .= "$section_text</$shortname>\n";
303	}
304	else {
305	# leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
306	$new_text .= $section_text;
307	}
308	}
309	else {
310	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
311	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
312	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
313	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
314	}
315	}
316	push (@metadata_list, @section_metadata);
317	}
318	}
319	foreach my $item (@metadata_list) {
320	$new_text .= "$paratag<$shortname>$item</$shortname>\n";
321	}
322	}
323
324	# filter the text
325	$self->filter_text ($field, $new_text);
326
327	$self->{'num_processed_bytes'} += length ($new_text);
328	$text .= "$new_text";
329	} # foreach field
330
331	$section = $doc_obj->get_next_section($section);
332	} # while defined section
333	print $handle "$text\n$documentendtag";
334
335	}
336
337	#chooses the first two letters or digits for the shortname
338	#now ignores non-letdig characters
339	sub create_shortname {
340	my $self = shift(@_);
341
342	my ($realname) = @_;
343	#take the first two chars
344	my $shortname;
345	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
346	$shortname = "$1$2";
347	} else {
348	# there aren't two letdig's in the field - try arbitrary combinations
349	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
350	$shortname = "AB";
351	}
352	$shortname =~ tr/a-z/A-Z/;
353
354	#if already used, take the first and third letdigs and so on
355	my $count = 1;
356	while (defined $self->{'indexfieldmap'}->{$shortname}) {
357	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
358	$shortname = "$1$3";
359	$count++;
360	$shortname =~ tr/a-z/A-Z/;
361
362	}
363	else {
364	#remove up to and incl the first letdig
365	$realname =~ s/^[^\w]*\w//;
366	$count = 0;
367	}
368	}
369
370	return $shortname;
371	}
372
373	1;
374

Note: See TracBrowser for help on using the repository browser.

Download in other formats: