Context Navigation

source: gsdl/trunk/perllib/mgppbuildproc.pm@ 17144

Last change on this file since 17144 was 17117, checked in by kjdon, 16 years ago
When indexing a combined field, put the field tags around the whole content, not around each metadata value in the field; otherwise searching across fields doesn't work.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.5 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	use basebuildproc;
33	use cnseg;
34
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Set up inheritance at compile time: mgppbuildproc is a basebuildproc.
BEGIN {
    @mgppbuildproc::ISA = qw(basebuildproc);
}

# Maps the level names to the tag names written to the output.
# this must be the same as in mgppbuilder
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
);
47
# Constructor. Builds a basebuildproc object from the given arguments,
# overrides a few defaults for mgpp, and reblesses it into $class.
#
# Arguments: the class name followed by whatever basebuildproc's
# constructor expects (passed through untouched).
# Returns: the new object.
sub new {
    my $class = shift @_;

    # direct method call instead of the ambiguous indirect object
    # syntax ("new basebuildproc (...)")
    my $self = basebuildproc->new(@_);

    # use a different index specification to the default
    $self->{'index'} = "text";

    $self->{'dontindex'} = {};
    $self->{'indexfieldmap'} = {};
    $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
    $self->{'strip_html'} = 1;

    return bless $self, $class;
}
62
63
# Replaces the object's index field map with the given hash reference.
sub set_indexfieldmap {
    my ($self, $indexmap) = @_;
    $self->{'indexfieldmap'} = $indexmap;
}
70
# Returns the object's current index field map.
sub get_indexfieldmap {
    my ($self) = @_;
    return $self->{'indexfieldmap'};
}
76
# Stores the levels structure on the object (text() looks up the
# 'paragraph' key in it).
sub set_levels {
    my ($self, $levels) = @_;
    $self->{'levels'} = $levels;
}
83
# Sets the flag that controls whether HTML tags are stripped from text.
sub set_strip_html {
    my ($self, $strip) = @_;
    $self->{'strip_html'} = $strip;
}
89
90	#sub find_paragraphs {
91	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
92	#}
93
# Deletes every '<' and '>' from $text and returns the result wrapped
# in $para on both sides.
sub remove_gtlt {
    my ($self, $text, $para) = @_;
    $text =~ tr/<>//d;
    return $para . $text . $para;
}
100
# Given the inside of a tag (the text between '<' and '>'), returns
# $para if it is an opening p tag (any case), and "" for anything else.
sub process_tags {
    my ($self, $tag_content, $para) = @_;
    return ($tag_content =~ /^p\b/i) ? $para : "";
}
109
# Prepares section text for indexing.
#
# Arguments:
#   $text       - the raw section text
#   $strip_html - if false, the text is returned untouched
#   $para       - replacement string for <p> tags; "" (or undef) means
#                 no paragraph markers are wanted
# Returns: the processed text.
#
# When $strip_html is true, all tags are removed; if $para is non-empty,
# <p> tags are replaced by $para instead of being dropped. Any < or >
# inside <pre>...</pre> is deleted first so it is not mistaken for a tag.
sub preprocess_text {
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml.
    # Hand the text back unchanged: a bare return would give the caller
    # undef and the section text would silently be lost from the index.
    return $text unless $strip_html;

    $para = "" unless defined $para;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>//gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }
    return $new_text;
}
# Older, slower alternative to preprocess_text.
#
# Arguments:
#   $text       - the raw text
#   $strip_html - if false, the original text is returned unchanged
#   $para       - if true, <p> tags are replaced with this string
# Returns: the processed text.
#
# When stripping, each tag is replaced by a single space. The content of
# <pre>...</pre> has all < and > removed. Assumes that <pre> and </pre>
# have no spaces inside the tag. An unterminated <pre> is treated as
# running to the end of the text.
sub preprocess_text_old_and_slow {
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    return $text unless $strip_html;

    my $outtext = "";
    while ($text =~ /<([^>]*)>/) {
        my $tag = $1;

        # use the match offsets rather than $` and $', which must not be
        # read after a later match (successful or not)
        $outtext .= substr($text, 0, $-[0]) . " "; # everything before the matched tag
        $text = substr($text, $+[0]);              # everything after the matched tag

        if ($para && $tag =~ /^\s*p\b/i) {
            # \b rather than \s so that a bare <p> is recognised too
            $outtext .= $para;
        }
        elsif ($tag =~ /^pre$/) { # a pre tag
            my $pre_text;
            if ($text =~ /<\/pre>/) {
                $pre_text = substr($text, 0, $-[0]); # everything before the closing pre tag
                $text = substr($text, $+[0]);        # everything after the </pre>
            }
            else {
                # no closing tag: the rest of the text is preformatted
                $pre_text = $text;
                $text = "";
            }
            $pre_text =~ s/[<>]//g; # remove all < and >
            $outtext .= $pre_text . " ";
        }
    }

    $outtext .= $text; # add any remaining text
    return $outtext;
}
170
# Returns the short tag name registered for $field in the index field
# map, creating and registering a new one (via create_shortname) if the
# field has not been seen before.
sub _lookup_or_create_shortname {
    my ($self, $field) = @_;

    my $shortname = $self->{'indexfieldmap'}->{$field};
    return $shortname if defined $shortname;

    $shortname = $self->create_shortname($field);
    $self->{'indexfieldmap'}->{$field} = $shortname;
    # mark the shortname itself as taken so it is not handed out twice
    $self->{'indexfieldmap'}->{$shortname} = 1;
    return $shortname;
}

# Writes one document, marked up with document/section/field tags, to
# the output handle ($self->{'output_handle'}).
#
# Arguments:
#   $doc_obj - the document object to output
# Returns: nothing useful.
#
# Documents whose doc type is not "indexed_doc" are skipped entirely.
# For a document that is_subcollection_doc() rejects, or a section whose
# gsdldoctype is neither "indexed_section" nor "indexed_doc", only empty
# section tags are written so that section numbering stays the same.
# Updates the num_docs, num_sections, num_bytes and num_processed_bytes
# counters, and records newly seen fields in indexfieldmap / indexfields.
sub text {
    my $self = shift (@_);
    my ($doc_obj) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # we always do text and index on Doc and Sec levels
    my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
    my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
    my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
    my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";

    my ($paratag) = "";

    # paragraph tags will only be used for indexing (can't retrieve
    # paragraphs), and can only be used if we are stripping HTML tags
    if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
        if ($self->{'strip_html'}) {
            $paratag = "<". $level_map{'paragraph'} . ">";
        } else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;
        $text .= "$sectiontag";

        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            # we are not actually indexing anything for this document,
            # but we want to keep the section numbers the same, so we just
            # output section tags for each section (which is done above)
            $text .= "$sectionendtag";
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            if ($real_field eq "allfields") {
                # we get allfields by default - do nothing
            }
            elsif ($real_field eq "metadata") {
                # metadata - output all metadata we know about except gsdl stuff
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {

                        my $shortname = $self->_lookup_or_create_shortname($mfield);
                        $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
            }
            else {
                # individual metadata and/or text specified - could be
                # a comma separated list
                my $shortname = $self->_lookup_or_create_shortname($real_field);

                # we only want one tag around the index
                $new_text .= "$paratag<$shortname>";
                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            if ($paratag ne "") {
                                # we fiddle around with splitting text into paragraphs:
                                # every paragraph break closes and reopens the field tag
                                $new_text .= "</$shortname>$paratag<$shortname>\n";
                                $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
                            }
                            else {
                                $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
                            }
                            $new_text .= "$section_text</$shortname><$shortname>\n";
                        }
                        else {
                            # leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
                            $new_text .= $section_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # optionally let subsections also carry the top section's metadata
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                # all metadata values go inside the single field tag,
                # separated by spaces
                foreach my $item (@metadata_list) {
                    $new_text .= "$item ";
                }
                $new_text .= "</$shortname>";
            }

            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        } # foreach field

        $text .= "$sectionendtag";
        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
}
336
# Chooses a two character shortname for a field name and returns it in
# upper case.
#
# Arguments:
#   $realname - the full field name
# Returns: a shortname that is not yet a key of $self->{'indexfieldmap'}.
#
# The first two letters/digits (\w characters) of the name are used,
# ignoring any non-letdig characters. If that is already taken, the
# first and third are tried, then the first and fourth, and so on; then
# the same starting from the second letdig, etc. If the name has fewer
# than two letdigs, or every combination from it is already taken,
# combinations from the alphabet A-Z are used instead. Dies if those
# are all taken too.
sub create_shortname {
    my $self = shift(@_);
    my ($realname) = @_;

    my $alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    my $using_alphabet = 0;

    # take the first two letdigs
    my $shortname;
    if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
        $shortname = "$1$2";
    } else {
        # there aren't two letdig's in the field - try arbitrary combinations
        $realname = $alphabet;
        $using_alphabet = 1;
        $shortname = "AB";
    }
    $shortname =~ tr/a-z/A-Z/;

    # if already used, take the first and third letdigs and so on
    my $count = 1;
    while (defined $self->{'indexfieldmap'}->{$shortname}) {
        if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
            # first letdig plus the one $count places after the second
            $shortname = "$1$3";
            $count++;
            $shortname =~ tr/a-z/A-Z/;
        }
        else {
            # remove up to and incl the first letdig
            $realname =~ s/^[^\w]*\w//;
            $count = 0;

            # fewer than two letdigs left means no more candidates from
            # this name; without this check the loop would never end
            if ($realname !~ /\w[^\w]*\w/) {
                if ($using_alphabet) {
                    die "mgppbuildproc::create_shortname: no unused shortname available\n";
                }
                $realname = $alphabet;
                $using_alphabet = 1;
            }
        }
    }

    return $shortname;
}
372
373	1;
374

Note: See TracBrowser for help on using the repository browser.

Download in other formats: