Context Navigation

source: gsdl/trunk/perllib/mgppbuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago
changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc
Property svn:keywords set to `Author Date Id Revision`
File size: 11.4 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	use basebuildproc;
33	use cnseg;
34
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	BEGIN {
40	@mgppbuildproc::ISA = ('basebuildproc');
41	}
42
43	#this must be the same as in mgppbuilder
44	our %level_map = ('document'=>'Doc',
45	'section'=>'Sec',
46	'paragraph'=>'Para');
47
48	sub new {
49	my $class = shift @_;
50	my $self = new basebuildproc (@_);
51
52	# use a different index specification to the default
53	$self->{'index'} = "text";
54
55	$self->{'dontindex'} = {};
56	$self->{'indexfieldmap'} = {};
57	$self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
58	$self->{'strip_html'}=1;
59
60	return bless $self, $class;
61	}
62
63
64	sub set_indexfieldmap {
65	my $self = shift (@_);
66	my ($indexmap) = @_;
67
68	$self->{'indexfieldmap'} = $indexmap;
69	}
70
71	sub get_indexfieldmap {
72	my $self = shift (@_);
73
74	return $self->{'indexfieldmap'};
75	}
76
77	sub set_levels {
78	my $self = shift (@_);
79	my ($levels) = @_;
80
81	$self->{'levels'} = $levels;
82	}
83
84	sub set_strip_html {
85	my $self = shift (@_);
86	my ($strip) = @_;
87	$self->{'strip_html'}=$strip;
88	}
89
90	#sub find_paragraphs {
91	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
92	#}
93
94	sub remove_gtlt {
95	my $self =shift(@_);
96	my ($text, $para) = @_;
97	$text =~s/[<>]//g;
98	return "$para$text$para";
99	}
100
101	sub process_tags {
102	my $self = shift(@_);
103	my ($text, $para) = @_;
104	if ($text =~ /^p\b/i) {
105	return $para;
106	}
107	return "";
108	}
109
110	sub preprocess_text {
111	my $self = shift (@_);
112	my ($text, $strip_html, $para) = @_;
113	# at this stage, we do not do paragraph tags unless have strip_html -
114	# it will result in a huge mess of non-xml
115	return unless $strip_html;
116
117	my $new_text = $text;
118
119	# if we have <pre> tags, we can have < > inside them, need to delete
120	# the <> before stripping tags
121	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
122
123	if ($para eq "") {
124	# just remove all tags
125	$new_text =~ s/<[^>]*>//gs;
126	} else {
127	# strip all tags except <p> tags which get turned into $para
128	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
129
130	}
131	return $new_text;
132	}
133	#this function strips the html tags from the doc if ($strip_html) and
134	# if ($para) replaces <p> with <Paragraph> tags.
135	# if both are false, the original text is returned
136	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
137	#these tags
138	sub preprocess_text_old_and_slow {
139	my $self = shift (@_);
140	my ($text, $strip_html, $para) = @_;
141	my ($outtext) = "";
142	if ($strip_html) {
143	while ($text =~ /<([^>]*)>/ && $text ne "") {
144
145	my $tag = $1;
146	$outtext .= $`." "; #add everything before the matched tag
147	$text = $'; #'everything after the matched tag
148	if ($para && $tag =~ /^\s*p\s/i) {
149	$outtext .= $para;
150	}
151	elsif ($tag =~ /^pre$/) { # a pre tag
152	$text =~ /<\/pre>/; # find the closing pre tag
153	my $tmp_text = $`; #everything before the closing pre tag
154	$text = $'; #'everything after the </pre>
155	$tmp_text =~ s/[<>]//g; # remove all < and >
156	$outtext.= $tmp_text . " ";
157	}
158	}
159
160	$outtext .= $text; # add any remaining text
161	return $outtext;
162	} #if strip_html
163
164	#if ($para) {
165	#$text =~ s/(<p\b)/$para$1/gi;
166	#return $text;
167	# }
168	return $text;
169	}
170
171	sub text {
172	my $self = shift (@_);
173	my ($doc_obj) = @_;
174	my $handle = $self->{'output_handle'};
175	my $outhandle = $self->{'outhandle'};
176
177	# only output this document if it is one to be indexed
178	return if ($doc_obj->get_doc_type() ne "indexed_doc");
179
180	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
181
182	# this is another document
183	$self->{'num_docs'} += 1;
184
185	# get the parameters for the output
186	# split on : just in case there is subcoll and lang stuff
187	my ($fields) = split (/:/, $self->{'index'});
188
189	# we always do text and index on Doc and Sec levels
190	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
191	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
192	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
193	my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
194
195	my ($paratag) = "";
196
197	# paragraph tags will only be used for indexing (can't retrieve
198	# paragraphs), and can ony be used if we are stripping HTML tags
199	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
200	if ($self->{'strip_html'}) {
201	$paratag = "<". $level_map{'paragraph'} . ">";
202	} else {
203	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
204	}
205	}
206
207	my $doc_section = 0; # just for this document
208
209	my $text = $documenttag;
210
211	# get the text for this document
212	my $section = $doc_obj->get_top_section();
213
214	while (defined $section) {
215	# update a few statistics
216	$doc_section++;
217	$self->{'num_sections'} += 1;
218	$text .= "$sectiontag";
219
220	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
221	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
222	# we are not actually indexing anything for this document,
223	# but we want to keep the section numbers the same, so we just
224	# output section tags for each section (which is done above)
225	$text .= "$sectionendtag";
226	$section = $doc_obj->get_next_section($section);
227	next;
228	}
229
230	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
231	foreach my $field (split (/;/, $fields)) {
232	# only deal with this field if it doesn't start with top or
233	# this is the first section
234	my $real_field = $field;
235	next if (($real_field =~ s/^top//) && ($doc_section != 1));
236
237	my $new_text = "";
238
239	# we get allfields by default - do nothing
240	if ($real_field eq "allfields") {
241
242	}
243
244	# metadata - output all metadata we know about except gsdl stuff
245	elsif ($real_field eq "metadata") {
246	my $shortname = "";
247	my $metadata = $doc_obj->get_all_metadata ($section);
248	foreach my $pair (@$metadata) {
249	my ($mfield, $mvalue) = (@$pair);
250	# check fields here, maybe others dont want - change to use dontindex!!
251	if ($mfield ne "Identifier"
252	&& $mfield !~ /^gsdl/
253	&& $mfield ne "classifytype"
254	&& $mfield ne "assocfilepath"
255	&& defined $mvalue && $mvalue ne "") {
256
257	if (defined $self->{'indexfieldmap'}->{$mfield}) {
258	$shortname = $self->{'indexfieldmap'}->{$mfield};
259	}
260	else {
261	$shortname = $self->create_shortname($mfield);
262	$self->{'indexfieldmap'}->{$mfield} = $shortname;
263	$self->{'indexfieldmap'}->{$shortname} = 1;
264	}
265	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
266	if (!defined $self->{'indexfields'}->{$mfield}) {
267	$self->{'indexfields'}->{$mfield} = 1;
268	}
269	}
270	}
271	}
272	else {
273	#individual metadata and or text specified - could be
274	# a comma separated list
275	my $shortname="";
276	if (defined $self->{'indexfieldmap'}->{$real_field}) {
277	$shortname = $self->{'indexfieldmap'}->{$real_field};
278	}
279	else {
280	$shortname = $self->create_shortname($real_field);
281	$self->{'indexfieldmap'}->{$real_field} = $shortname;
282	$self->{'indexfieldmap'}->{$shortname} = 1;
283	}
284	my @metadata_list = ();
285	foreach my $submeta (split /,/, $real_field) {
286	if ($submeta eq "text") {
287	my $section_text = $doc_obj->get_text($section);
288	if ($self->{'indexing_text'}) {
289	# tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
290	$new_text .= "$paratag<$shortname>\n";
291	if ($paratag ne "") {
292	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
293	}
294	else {
295	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
296	}
297	$new_text .= "$section_text</$shortname>\n";
298	}
299	else {
300	# leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
301	$new_text .= $section_text;
302	}
303	}
304	else {
305	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
306	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
307	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
308	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
309	}
310	}
311	push (@metadata_list, @section_metadata);
312	}
313	}
314	foreach my $item (@metadata_list) {
315	$new_text .= "$paratag<$shortname>$item</$shortname>\n";
316	}
317	}
318
319	# filter the text
320	$new_text = $self->filter_text ($field, $new_text);
321
322	$self->{'num_processed_bytes'} += length ($new_text);
323	$text .= "$new_text";
324	} # foreach field
325
326	$text .= "$sectionendtag";
327	$section = $doc_obj->get_next_section($section);
328	} # while defined section
329	print $handle "$text\n$documentendtag";
330
331	}
332
333	#chooses the first two letters or digits for the shortname
334	#now ignores non-letdig characters
335	sub create_shortname {
336	my $self = shift(@_);
337
338	my ($realname) = @_;
339	#take the first two chars
340	my $shortname;
341	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
342	$shortname = "$1$2";
343	} else {
344	# there aren't two letdig's in the field - try arbitrary combinations
345	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
346	$shortname = "AB";
347	}
348	$shortname =~ tr/a-z/A-Z/;
349
350	#if already used, take the first and third letdigs and so on
351	my $count = 1;
352	while (defined $self->{'indexfieldmap'}->{$shortname}) {
353	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
354	$shortname = "$1$3";
355	$count++;
356	$shortname =~ tr/a-z/A-Z/;
357
358	}
359	else {
360	#remove up to and incl the first letdig
361	$realname =~ s/^[^\w]*\w//;
362	$count = 0;
363	}
364	}
365
366	return $shortname;
367	}
368
369	1;
370

Note: See TracBrowser for help on using the repository browser.

Download in other formats: