Context Navigation

source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 21562

Last change on this file since 21562 was 20419, checked in by kjdon, 15 years ago
strip off ex. before retrieving metadata for indexing. ex. now valid in collect.cfg
Property svn:keywords set to `Author Date Id Revision`
File size: 14.1 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	use basebuildproc;
33	use cnseg;
34
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	BEGIN {
40	@mgppbuildproc::ISA = ('basebuildproc');
41	}
42
43	#this must be the same as in mgppbuilder
44	our %level_map = ('document'=>'Doc',
45	'section'=>'Sec',
46	'paragraph'=>'Para');
47
48	# change this so a user can add their own ones in via a file or cfg
49	#add AND, OR, NOT NEAR to this list - these cannot be used as field names
50	#also add the level names (Doc, Sec, Para)
51	our %static_indexfield_map = ('Title'=>'TI',
52	'TI'=>1,
53	'Subject'=>'SU',
54	'SU'=>1,
55	'Creator'=>'CR',
56	'CR'=>1,
57	'Organization'=>'ORG',
58	'ORG'=>1,
59	'Source'=>'SO',
60	'SO'=>1,
61	'Howto'=>'HT',
62	'HT'=>1,
63	'ItemTitle'=>'IT',
64	'IT'=>1,
65	'ProgNumber'=>'PN',
66	'PN'=>1,
67	'People'=>'PE',
68	'PE'=>1,
69	'Coverage'=>'CO',
70	'CO'=>1,
71	'allfields'=>'ZZ',
72	'ZZ'=>1,
73	'text'=>'TX',
74	'TX'=>1,
75	'AND'=>1,
76	'OR'=>1,
77	'NOT'=>1,
78	'NEAR'=>1,
79	'Doc'=>1,
80	'Sec'=>1,
81	'Para'=>1);
82
83
84	sub new {
85	my $class = shift @_;
86	my $self = new basebuildproc (@_);
87
88	# use a different index specification to the default
89	$self->{'index'} = "text";
90
91	$self->{'dontindex'} = {};
92	$self->{'indexfieldmap'} = {};
93	$self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
94	$self->{'strip_html'}=1;
95
96	return bless $self, $class;
97	}
98
99
100	#sub set_indexfieldmap {
101	# my $self = shift (@_);
102	# my ($indexmap) = @_;
103
104	# $self->{'default_index_field_mapping'} = $indexmap;
105	#$self->{'indexfieldmap'} = $indexmap;
106	#}
107
108	sub get_indexfieldmap {
109	my $self = shift (@_);
110
111	return $self->{'indexfieldmap'};
112	}
113
114	sub set_levels {
115	my $self = shift (@_);
116	my ($levels) = @_;
117
118	$self->{'levels'} = $levels;
119	}
120
121	sub set_strip_html {
122	my $self = shift (@_);
123	my ($strip) = @_;
124	$self->{'strip_html'}=$strip;
125	}
126
127	#sub find_paragraphs {
128	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
129	#}
130
131	sub remove_gtlt {
132	my $self =shift(@_);
133	my ($text, $para) = @_;
134	$text =~s/[<>]//g;
135	return "$para$text$para";
136	}
137
138	sub process_tags {
139	my $self = shift(@_);
140	my ($text, $para) = @_;
141	if ($text =~ /^p\b/i) {
142	return $para;
143	}
144	return "";
145	}
146
147	sub preprocess_text {
148	my $self = shift (@_);
149	my ($text, $strip_html, $para) = @_;
150	# at this stage, we do not do paragraph tags unless have strip_html -
151	# it will result in a huge mess of non-xml
152	return unless $strip_html;
153
154	my $new_text = $text;
155
156	# if we have <pre> tags, we can have < > inside them, need to delete
157	# the <> before stripping tags
158	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
159
160	if ($para eq "") {
161	# just remove all tags
162	$new_text =~ s/<[^>]*>//gs;
163	} else {
164	# strip all tags except <p> tags which get turned into $para
165	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
166
167	}
168	return $new_text;
169	}
170	#this function strips the html tags from the doc if ($strip_html) and
171	# if ($para) replaces <p> with <Paragraph> tags.
172	# if both are false, the original text is returned
173	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
174	#these tags
175	sub preprocess_text_old_and_slow {
176	my $self = shift (@_);
177	my ($text, $strip_html, $para) = @_;
178	my ($outtext) = "";
179	if ($strip_html) {
180	while ($text =~ /<([^>]*)>/ && $text ne "") {
181
182	my $tag = $1;
183	$outtext .= $`." "; #add everything before the matched tag
184	$text = $'; #'everything after the matched tag
185	if ($para && $tag =~ /^\s*p\s/i) {
186	$outtext .= $para;
187	}
188	elsif ($tag =~ /^pre$/) { # a pre tag
189	$text =~ /<\/pre>/; # find the closing pre tag
190	my $tmp_text = $`; #everything before the closing pre tag
191	$text = $'; #'everything after the </pre>
192	$tmp_text =~ s/[<>]//g; # remove all < and >
193	$outtext.= $tmp_text . " ";
194	}
195	}
196
197	$outtext .= $text; # add any remaining text
198	return $outtext;
199	} #if strip_html
200
201	#if ($para) {
202	#$text =~ s/(<p\b)/$para$1/gi;
203	#return $text;
204	# }
205	return $text;
206	}
207
208	sub text {
209	my $self = shift (@_);
210	my ($doc_obj) = @_;
211	my $handle = $self->{'output_handle'};
212	my $outhandle = $self->{'outhandle'};
213
214	# only output this document if it is one to be indexed
215	return if ($doc_obj->get_doc_type() ne "indexed_doc");
216
217	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
218
219	# this is another document
220	$self->{'num_docs'} += 1;
221
222	# get the parameters for the output
223	# split on : just in case there is subcoll and lang stuff
224	my ($fields) = split (/:/, $self->{'index'});
225
226	# we always do text and index on Doc and Sec levels
227	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
228	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
229	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
230	my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
231
232	my ($paratag) = "";
233
234	# paragraph tags will only be used for indexing (can't retrieve
235	# paragraphs), and can ony be used if we are stripping HTML tags
236	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
237	if ($self->{'strip_html'}) {
238	$paratag = "<". $level_map{'paragraph'} . ">";
239	} else {
240	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
241	}
242	}
243
244	my $doc_section = 0; # just for this document
245
246	my $text = $documenttag;
247
248	# get the text for this document
249	my $section = $doc_obj->get_top_section();
250
251	while (defined $section) {
252	# update a few statistics
253	$doc_section++;
254	$self->{'num_sections'} += 1;
255	$text .= "$sectiontag";
256
257	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
258	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
259	# we are not actually indexing anything for this document,
260	# but we want to keep the section numbers the same, so we just
261	# output section tags for each section (which is done above)
262	$text .= "$sectionendtag";
263	$section = $doc_obj->get_next_section($section);
264	next;
265	}
266
267	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
268
269	# has the user added a 'metadata' index?
270	my $all_metadata_specified = 0;
271	# which fields have already been indexed? (same as fields, but in a map)
272	my $specified_fields = {};
273	foreach my $field (split (/;/, $fields)) {
274	# only deal with this field if it doesn't start with top or
275	# this is the first section
276	my $real_field = $field;
277	next if (($real_field =~ s/^top//) && ($doc_section != 1));
278
279	my $new_text = "";
280
281	# we get allfields by default - do nothing except add into the map
282	if ($real_field eq "allfields") {
283	$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
284	$self->{'indexfieldmap'}->{"ZZ"} = 1;
285	}
286
287	# metadata - output all metadata we know about except gsdl stuff
288	# each metadata is in a separate index field
289	if ($real_field eq "metadata") {
290	# we will process this later, so we are not reindexing metadata already indexed
291	$all_metadata_specified = 1;
292	}
293
294	else {
295
296	#individual metadata and or text specified - could be
297	# a comma separated list
298	$specified_fields->{$real_field} = 1;
299	my $shortname="";
300	my $new_field = 0; # have we found a new field name?
301
302	if (defined $self->{'indexfieldmap'}->{$real_field}) {
303	$shortname = $self->{'indexfieldmap'}->{$real_field};
304	}
305	else {
306	$shortname = $self->create_shortname($real_field);
307	$new_field = 1; # we want to record this shortname, but only if we have actually found some metadata values
308	}
309	my @metadata_list = (); # put any meta values in here
310	my $section_text = ""; # put any text in here
311	foreach my $submeta (split /,/, $real_field) {
312	if ($submeta eq "text") {
313	# no point in indexing text more than once
314	if ($section_text eq "") {
315	$section_text = $doc_obj->get_text($section);
316	if ($self->{'indexing_text'}) {
317	if ($paratag ne "") {
318	# we fiddle around with splitting text into paragraphs
319	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
320	}
321	else {
322	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
323	}
324	}
325	}
326	}
327	else {
328	$submeta =~ s/^ex\.//; #strip off ex.
329	# its a metadata element
330	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
331	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
332	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
333	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
334	}
335	}
336	push (@metadata_list, @section_metadata);
337	}
338	} # for each field in index
339
340
341	# now we add the text and/or the metadata into new_text
342	if ($section_text ne "" \|\| scalar(@metadata_list)) {
343	if ($self->{'indexing_text'}) {
344	# only add tags in if indexing
345	$new_text .= "$paratag<$shortname>";
346	}
347	if ($section_text ne "") {
348	$new_text .= "$section_text ";
349	if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
350	$new_text .= "</$shortname>$paratag<$shortname>";
351	}
352	}
353	foreach my $item (@metadata_list) {
354	$new_text .= "$item ";
355	}
356	if ($self->{'indexing_text'}) {
357	# only add tags in if indexing
358	$new_text .= "</$shortname>";
359	}
360	if ($self->{'indexing_text'} && $new_field) {
361	# we need to add to the list in indexfields
362
363	$self->{'indexfieldmap'}->{$real_field} = $shortname;
364	$self->{'indexfieldmap'}->{$shortname} = 1;
365	}
366	}
367	}
368
369	# filter the text
370	$new_text = $self->filter_text ($field, $new_text);
371
372	$self->{'num_processed_bytes'} += length ($new_text);
373	$text .= "$new_text";
374	} # foreach field
375
376	if ($all_metadata_specified) {
377	my $new_text = "";
378	my $shortname = "";
379	my $metadata = $doc_obj->get_all_metadata ($section);
380	foreach my $pair (@$metadata) {
381	my ($mfield, $mvalue) = (@$pair);
382	# no value
383	next unless defined $mvalue && $mvalue ne "";
384	# we have already indexed this
385	next if defined ($specified_fields->{$mfield});
386	# check fields here, maybe others dont want - change to use dontindex!!
387	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
388	next if ($mfield =~ /^gsdl/);
389
390
391	if (defined $self->{'indexfieldmap'}->{$mfield}) {
392	$shortname = $self->{'indexfieldmap'}->{$mfield};
393	}
394	else {
395	$shortname = $self->create_shortname($mfield);
396	$self->{'indexfieldmap'}->{$mfield} = $shortname;
397	$self->{'indexfieldmap'}->{$shortname} = 1;
398	}
399	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
400	if (!defined $self->{'indexfields'}->{$mfield}) {
401	$self->{'indexfields'}->{$mfield} = 1;
402	}
403
404	}
405	# filter the text
406	$new_text = $self->filter_text ("metadata", $new_text);
407
408	$self->{'num_processed_bytes'} += length ($new_text);
409	$text .= "$new_text";
410
411
412	}
413
414	$text .= "$sectionendtag";
415	$section = $doc_obj->get_next_section($section);
416	} # while defined section
417	print $handle "$text\n$documentendtag";
418	#print STDERR "*********\n$text\n*************\n";
419
420	}
421
422	#chooses the first two letters or digits for the shortname
423	#now ignores non-letdig characters
424	sub create_shortname {
425	my $self = shift(@_);
426
427	my ($realname) = @_;
428	# try our predefined static mapping
429	if (defined $static_indexfield_map{$realname}) {
430	return $static_indexfield_map{$realname};
431	}
432	#try the first two chars
433	my $shortname;
434	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
435	$shortname = "$1$2";
436	} else {
437	# there aren't two letdig's in the field - try arbitrary combinations
438	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
439	$shortname = "AB";
440	}
441	$shortname =~ tr/a-z/A-Z/;
442
443	#if already used, take the first and third letdigs and so on
444	my $count = 1;
445	while (defined $self->{'indexfieldmap'}->{$shortname} \|\| defined $static_indexfield_map{$shortname}) {
446	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
447	$shortname = "$1$3";
448	$count++;
449	$shortname =~ tr/a-z/A-Z/;
450
451	}
452	else {
453	#remove up to and incl the first letdig
454	$realname =~ s/^[^\w]*\w//;
455	$count = 0;
456	}
457	}
458
459	return $shortname;
460	}
461
462	1;
463

Note: See TracBrowser for help on using the repository browser.

Download in other formats: