Context Navigation

source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 31861

Last change on this file since 31861 was 27328, checked in by kjdon, 11 years ago
changed the way we store the list of fields that has been indexed, and the mapping between index and shortname. They are separated now, to avoid calculating a shortname for a field each time a new document is indexed - previously if there was no value, then the shortname was not remembered as it wasn't indexed, so each new document saw the shortname being calculated again. remove namespaces from meta fields before calculating shortnames, to make them more sensible. eg dc.Title->TI instead of DC.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.3 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	use basebuildproc;
33	use cnseg;
34
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	BEGIN {
40	@mgppbuildproc::ISA = ('basebuildproc');
41	}
42
43	#this must be the same as in mgppbuilder
44	our %level_map = ('document'=>'Doc',
45	'section'=>'Sec',
46	'paragraph'=>'Para');
47
48	# change this so a user can add their own ones in via a file or cfg
49	#add AND, OR, NOT NEAR to this list - these cannot be used as field names
50	#also add the level names (Doc, Sec, Para)
51	our %static_indexfield_map = ('Title'=>'TI',
52	'TI'=>1,
53	'Subject'=>'SU',
54	'SU'=>1,
55	'Creator'=>'CR',
56	'CR'=>1,
57	'Organization'=>'ORG',
58	'ORG'=>1,
59	'Source'=>'SO',
60	'SO'=>1,
61	'Howto'=>'HT',
62	'HT'=>1,
63	'ItemTitle'=>'IT',
64	'IT'=>1,
65	'ProgNumber'=>'PN',
66	'PN'=>1,
67	'People'=>'PE',
68	'PE'=>1,
69	'Coverage'=>'CO',
70	'CO'=>1,
71	'allfields'=>'ZZ',
72	'ZZ'=>1,
73	'text'=>'TX',
74	'TX'=>1,
75	'AND'=>1,
76	'OR'=>1,
77	'NOT'=>1,
78	'NEAR'=>1,
79	'Doc'=>1,
80	'Sec'=>1,
81	'Para'=>1);
82
83
84	sub new {
85	my $class = shift @_;
86	my $self = new basebuildproc (@_);
87
88	# use a different index specification to the default
89	$self->{'index'} = "text";
90
91	$self->{'dontindex'} = {};
92	$self->{'allindexfields'} = {}; # list of all actually indexed fields
93	$self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified.
94	$self->{'fieldnamemap'} = {'allfields'=>'ZZ',
95	'ZZ'=>1,
96	'text'=>'TX',
97	'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not.
98	$self->{'strip_html'}=1;
99
100	return bless $self, $class;
101	}
102
103	sub set_levels {
104	my $self = shift (@_);
105	my ($levels) = @_;
106
107	$self->{'levels'} = $levels;
108	}
109
110	sub set_strip_html {
111	my $self = shift (@_);
112	my ($strip) = @_;
113	$self->{'strip_html'}=$strip;
114	}
115
116	#sub find_paragraphs {
117	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
118	#}
119
120	sub remove_gtlt {
121	my $self =shift(@_);
122	my ($text, $para) = @_;
123	$text =~s/[<>]//g;
124	return "$para$text$para";
125	}
126
127	sub process_tags {
128	my $self = shift(@_);
129	my ($text, $para) = @_;
130	if ($text =~ /^p\b/i) {
131	return $para;
132	}
133	return "";
134	}
135
136	sub preprocess_text {
137	my $self = shift (@_);
138	my ($text, $strip_html, $para) = @_;
139	# at this stage, we do not do paragraph tags unless have strip_html -
140	# it will result in a huge mess of non-xml
141	return unless $strip_html;
142
143	my $new_text = $text;
144
145	# if we have <pre> tags, we can have < > inside them, need to delete
146	# the <> before stripping tags
147	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
148
149	if ($para eq "") {
150	# just remove all tags
151	$new_text =~ s/<[^>]*>//gs;
152	} else {
153	# strip all tags except <p> tags which get turned into $para
154	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
155
156	}
157	return $new_text;
158	}
159	#this function strips the html tags from the doc if ($strip_html) and
160	# if ($para) replaces <p> with <Paragraph> tags.
161	# if both are false, the original text is returned
162	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
163	#these tags
164	sub preprocess_text_old_and_slow {
165	my $self = shift (@_);
166	my ($text, $strip_html, $para) = @_;
167	my ($outtext) = "";
168	if ($strip_html) {
169	while ($text =~ /<([^>]*)>/ && $text ne "") {
170
171	my $tag = $1;
172	$outtext .= $`." "; #add everything before the matched tag
173	$text = $'; #'everything after the matched tag
174	if ($para && $tag =~ /^\s*p\s/i) {
175	$outtext .= $para;
176	}
177	elsif ($tag =~ /^pre$/) { # a pre tag
178	$text =~ /<\/pre>/; # find the closing pre tag
179	my $tmp_text = $`; #everything before the closing pre tag
180	$text = $'; #'everything after the </pre>
181	$tmp_text =~ s/[<>]//g; # remove all < and >
182	$outtext.= $tmp_text . " ";
183	}
184	}
185
186	$outtext .= $text; # add any remaining text
187	return $outtext;
188	} #if strip_html
189
190	#if ($para) {
191	#$text =~ s/(<p\b)/$para$1/gi;
192	#return $text;
193	# }
194	return $text;
195	}
196
197	sub text {
198	my $self = shift (@_);
199	my ($doc_obj) = @_;
200	my $handle = $self->{'output_handle'};
201	my $outhandle = $self->{'outhandle'};
202
203	# only output this document if it is one to be indexed
204	return if ($doc_obj->get_doc_type() ne "indexed_doc");
205
206	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
207
208	# this is another document
209	$self->{'num_docs'} += 1;
210
211	# get the parameters for the output
212	# split on : just in case there is subcoll and lang stuff
213	my ($fields) = split (/:/, $self->{'index'});
214
215	# we always do text and index on Doc and Sec levels
216	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
217	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
218	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
219	my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
220
221	my ($paratag) = "";
222
223	# paragraph tags will only be used for indexing (can't retrieve
224	# paragraphs), and can ony be used if we are stripping HTML tags
225	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
226	if ($self->{'strip_html'}) {
227	$paratag = "<". $level_map{'paragraph'} . ">";
228	} else {
229	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
230	}
231	}
232
233	my $doc_section = 0; # just for this document
234
235	my $text = $documenttag;
236
237	# get the text for this document
238	my $section = $doc_obj->get_top_section();
239
240	while (defined $section) {
241	# update a few statistics
242	$doc_section++;
243	$self->{'num_sections'} += 1;
244	$text .= "$sectiontag";
245
246	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
247	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
248	# we are not actually indexing anything for this document,
249	# but we want to keep the section numbers the same, so we just
250	# output section tags for each section (which is done above)
251	$text .= "$sectionendtag";
252	$section = $doc_obj->get_next_section($section);
253	next;
254	}
255
256	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
257
258	# has the user added a 'metadata' index?
259	my $all_metadata_specified = 0;
260	# which fields have already been indexed? (same as fields, but in a map)
261	my $specified_fields = {};
262	foreach my $field (split (/;/, $fields)) {
263	# only deal with this field if it doesn't start with top or
264	# this is the first section
265	my $real_field = $field;
266	next if (($real_field =~ s/^top//) && ($doc_section != 1));
267
268	my $new_text = "";
269
270	# we get allfields by default
271	next if ($real_field eq "allfields");
272
273	# metadata - output all metadata we know about except gsdl stuff
274	# each metadata is in a separate index field
275	if ($real_field eq "metadata") {
276	# we will process this later, so we are not reindexing metadata already indexed
277	$all_metadata_specified = 1;
278	next;
279	}
280
281	#individual metadata and or text specified - could be
282	# a comma separated list
283	$specified_fields->{$real_field} = 1;
284	my $shortname="";
285
286	if (defined $self->{'fieldnamemap'}->{$real_field}) {
287	$shortname = $self->{'fieldnamemap'}->{$real_field};
288	} else {
289	$shortname = $self->create_shortname($real_field);
290	$self->{'fieldnamemap'}->{$real_field} = $shortname;
291	$self->{'fieldnamemap'}->{$shortname} = 1;
292	}
293
294	my @metadata_list = (); # put any meta values in here
295	my $section_text = ""; # put any text in here
296	foreach my $submeta (split /,/, $real_field) {
297	if ($submeta eq "text") {
298	# no point in indexing text more than once
299	if ($section_text eq "") {
300	$section_text = $doc_obj->get_text($section);
301	if ($self->{'indexing_text'}) {
302	if ($paratag ne "") {
303	# we fiddle around with splitting text into paragraphs
304	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
305	}
306	else {
307	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
308	}
309	}
310	}
311	}
312	else {
313	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
314	# its a metadata element
315	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
316	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
317	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
318	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
319	}
320	}
321	push (@metadata_list, @section_metadata);
322	}
323	} # for each field in index
324
325
326	# now we add the text and/or the metadata into new_text
327	if ($section_text ne "" \|\| scalar(@metadata_list)) {
328	if ($self->{'indexing_text'}) {
329	# only add tags in if indexing
330	$new_text .= "$paratag<$shortname>";
331	}
332	if ($section_text ne "") {
333	$new_text .= "$section_text ";
334	if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
335	$new_text .= "</$shortname>$paratag<$shortname>";
336	}
337	}
338	foreach my $item (@metadata_list) {
339	$new_text .= "$item ";
340	}
341	if ($self->{'indexing_text'}) {
342	# only add tags in if indexing
343	$new_text .= "</$shortname>";
344	$self->{'allindexfields'}->{$real_field} = 1;
345	}
346	}
347
348	# filter the text
349	$new_text = $self->filter_text ($field, $new_text);
350
351	$self->{'num_processed_bytes'} += length ($new_text);
352	$text .= "$new_text";
353	} # foreach field
354
355	if ($all_metadata_specified) {
356	my $new_text = "";
357	my $shortname = "";
358	my $metadata = $doc_obj->get_all_metadata ($section);
359	foreach my $pair (@$metadata) {
360	my ($mfield, $mvalue) = (@$pair);
361	# no value
362	next unless defined $mvalue && $mvalue ne "";
363	# we have already indexed this
364	next if defined ($specified_fields->{$mfield});
365	# check fields here, maybe others dont want - change to use dontindex!!
366	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
367	next if ($mfield =~ /^gsdl/);
368
369	if (defined $self->{'fieldnamemap'}->{$mfield}) {
370	$shortname = $self->{'fieldnamemap'}->{$mfield};
371	} else {
372	$shortname = $self->create_shortname($mfield);
373	$self->{'fieldnamemap'}->{$mfield} = $shortname;
374	$self->{'fieldnamemap'}->{$shortname} = 1;
375	}
376	$self->{'allindexfields'}->{$mfield} = 1;
377	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
378	if (!defined $self->{'extraindexfields'}->{$mfield}) {
379	$self->{'extraindexfields'}->{$mfield} = 1;
380	}
381
382	}
383	# filter the text
384	$new_text = $self->filter_text ("metadata", $new_text);
385
386	$self->{'num_processed_bytes'} += length ($new_text);
387	$text .= "$new_text";
388
389
390	}
391
392	$text .= "$sectionendtag";
393	$section = $doc_obj->get_next_section($section);
394	} # while defined section
395	print $handle "$text\n$documentendtag";
396	#print STDERR "*********\n$text\n*************\n";
397
398	}
399
400	#chooses the first two letters or digits for the shortname
401	#now ignores non-letdig characters
402	sub create_shortname {
403	my $self = shift(@_);
404
405	my ($realname) = @_;
406	my @realnamelist = split(",", $realname);
407	map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces
408	my ($singlename) = $realnamelist[0];
409
410	# try our predefined static mapping
411	my $name;
412	if (defined ($name = $static_indexfield_map{$singlename})) {
413	if (! defined $self->{'fieldnamemap'}->{$name}) {
414	# has this shortname already been used??
415	return $static_indexfield_map{$singlename};
416	}
417	}
418	# we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters.
419	$realname = join ("", @realnamelist);
420	#try the first two chars
421	my $shortname;
422	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
423	$shortname = "$1$2";
424	} else {
425	# there aren't two letdig's in the field - try arbitrary combinations
426	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
427	$shortname = "AB";
428	}
429	$shortname =~ tr/a-z/A-Z/;
430
431	#if already used, take the first and third letdigs and so on
432	my $count = 1;
433	while (defined $self->{'fieldnamemap'}->{$shortname} \|\| defined $static_indexfield_map{$shortname}) {
434	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
435	$shortname = "$1$3";
436	$count++;
437	$shortname =~ tr/a-z/A-Z/;
438
439	}
440	else {
441	#remove up to and incl the first letdig
442	$realname =~ s/^[^\w]*\w//;
443	$count = 0;
444	}
445	}
446
447	return $shortname;
448	}
449
450	1;
451

Note: See TracBrowser for help on using the repository browser.

Download in other formats: