Context Navigation

source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 33235

Last change on this file since 33235 was 33144, checked in by wy59, 5 years ago
Related to commit 33128 where we claimed we had reserved CD, CS index short names for Coordinate and CoordShort in Greenstone perl code. We hadn't yet because we couldn't see Latitude/LatShort and Longitude/LngShort reserved yet and weren't sure if we needed to add it in. With Dr Bainbridge's confirmation that we were on the right track, we've now reserved all of these index short names (CD,CS,LT,LO,LA,LN).
Property svn:keywords set to `Author Date Id Revision`
File size: 14.5 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	use basebuildproc;
33	use cnseg;
34
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	BEGIN {
40	@mgppbuildproc::ISA = ('basebuildproc');
41	}
42
43	#this must be the same as in mgppbuilder
44	our %level_map = ('document'=>'Doc',
45	'section'=>'Sec',
46	'paragraph'=>'Para');
47
48	# change this so a user can add their own ones in via a file or cfg
49	#add AND, OR, NOT NEAR to this list - these cannot be used as field names
50	#also add the level names (Doc, Sec, Para)
51	our %static_indexfield_map = ('Title'=>'TI',
52	'TI'=>1,
53	'Subject'=>'SU',
54	'SU'=>1,
55	'Creator'=>'CR',
56	'CR'=>1,
57	'Organization'=>'ORG',
58	'ORG'=>1,
59	'Source'=>'SO',
60	'SO'=>1,
61	'Howto'=>'HT',
62	'HT'=>1,
63	'ItemTitle'=>'IT',
64	'IT'=>1,
65	'ProgNumber'=>'PN',
66	'PN'=>1,
67	'People'=>'PE',
68	'PE'=>1,
69	'Coverage'=>'CO',
70	'CO'=>1,
71	'allfields'=>'ZZ',
72	'ZZ'=>1,
73	'text'=>'TX',
74	'TX'=>1,
75	'Coordinate'=>'CD',
76	'CD'=>1,
77	'CoordShort'=>'CS',
78	'CS'=>1,
79	'Latitude'=>'LT',
80	'LT'=>1,
81	'Longitude'=>'LO',
82	'LO'=>1,
83	'LatShort'=>'LA',
84	'LA'=>1,
85	'LngShort'=>'LN',
86	'LN'=>1,
87	'AND'=>1,
88	'OR'=>1,
89	'NOT'=>1,
90	'NEAR'=>1,
91	'Doc'=>1,
92	'Sec'=>1,
93	'Para'=>1);
94
95
96	sub new {
97	my $class = shift @_;
98	my $self = new basebuildproc (@_);
99
100	# use a different index specification to the default
101	$self->{'index'} = "text";
102
103	$self->{'dontindex'} = {};
104	$self->{'allindexfields'} = {}; # list of all actually indexed fields
105	$self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified.
106	$self->{'fieldnamemap'} = {'allfields'=>'ZZ',
107	'ZZ'=>1,
108	'text'=>'TX',
109	'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not.
110	$self->{'strip_html'}=1;
111
112	return bless $self, $class;
113	}
114
115	sub set_levels {
116	my $self = shift (@_);
117	my ($levels) = @_;
118
119	$self->{'levels'} = $levels;
120	}
121
122	sub set_strip_html {
123	my $self = shift (@_);
124	my ($strip) = @_;
125	$self->{'strip_html'}=$strip;
126	}
127
128	#sub find_paragraphs {
129	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
130	#}
131
132	sub remove_gtlt {
133	my $self =shift(@_);
134	my ($text, $para) = @_;
135	$text =~s/[<>]//g;
136	return "$para$text$para";
137	}
138
139	sub process_tags {
140	my $self = shift(@_);
141	my ($text, $para) = @_;
142	if ($text =~ /^p\b/i) {
143	return $para;
144	}
145	return "";
146	}
147
148	sub preprocess_text {
149	my $self = shift (@_);
150	my ($text, $strip_html, $para) = @_;
151	# at this stage, we do not do paragraph tags unless have strip_html -
152	# it will result in a huge mess of non-xml
153	return unless $strip_html;
154
155	my $new_text = $text;
156
157	# if we have <pre> tags, we can have < > inside them, need to delete
158	# the <> before stripping tags
159	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
160
161	if ($para eq "") {
162	# just remove all tags
163	$new_text =~ s/<[^>]*>//gs;
164	} else {
165	# strip all tags except <p> tags which get turned into $para
166	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
167
168	}
169	return $new_text;
170	}
171	#this function strips the html tags from the doc if ($strip_html) and
172	# if ($para) replaces <p> with <Paragraph> tags.
173	# if both are false, the original text is returned
174	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
175	#these tags
176	sub preprocess_text_old_and_slow {
177	my $self = shift (@_);
178	my ($text, $strip_html, $para) = @_;
179	my ($outtext) = "";
180	if ($strip_html) {
181	while ($text =~ /<([^>]*)>/ && $text ne "") {
182
183	my $tag = $1;
184	$outtext .= $`." "; #add everything before the matched tag
185	$text = $'; #'everything after the matched tag
186	if ($para && $tag =~ /^\s*p\s/i) {
187	$outtext .= $para;
188	}
189	elsif ($tag =~ /^pre$/) { # a pre tag
190	$text =~ /<\/pre>/; # find the closing pre tag
191	my $tmp_text = $`; #everything before the closing pre tag
192	$text = $'; #'everything after the </pre>
193	$tmp_text =~ s/[<>]//g; # remove all < and >
194	$outtext.= $tmp_text . " ";
195	}
196	}
197
198	$outtext .= $text; # add any remaining text
199	return $outtext;
200	} #if strip_html
201
202	#if ($para) {
203	#$text =~ s/(<p\b)/$para$1/gi;
204	#return $text;
205	# }
206	return $text;
207	}
208
209	sub text {
210	my $self = shift (@_);
211	my ($doc_obj) = @_;
212	my $handle = $self->{'output_handle'};
213	my $outhandle = $self->{'outhandle'};
214
215	# only output this document if it is one to be indexed
216	return if ($doc_obj->get_doc_type() ne "indexed_doc");
217
218	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
219
220	# this is another document
221	$self->{'num_docs'} += 1;
222
223	# get the parameters for the output
224	# split on : just in case there is subcoll and lang stuff
225	my ($fields) = split (/:/, $self->{'index'});
226
227	# we always do text and index on Doc and Sec levels
228	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
229	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
230	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
231	my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
232
233	my ($paratag) = "";
234
235	# paragraph tags will only be used for indexing (can't retrieve
236	# paragraphs), and can ony be used if we are stripping HTML tags
237	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
238	if ($self->{'strip_html'}) {
239	$paratag = "<". $level_map{'paragraph'} . ">";
240	} else {
241	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
242	}
243	}
244
245	my $doc_section = 0; # just for this document
246
247	my $text = $documenttag;
248
249	# get the text for this document
250	my $section = $doc_obj->get_top_section();
251
252	while (defined $section) {
253	# update a few statistics
254	$doc_section++;
255	$self->{'num_sections'} += 1;
256	$text .= "$sectiontag";
257
258	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
259	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
260	# we are not actually indexing anything for this document,
261	# but we want to keep the section numbers the same, so we just
262	# output section tags for each section (which is done above)
263	$text .= "$sectionendtag";
264	$section = $doc_obj->get_next_section($section);
265	next;
266	}
267
268	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
269
270	# has the user added a 'metadata' index?
271	my $all_metadata_specified = 0;
272	# which fields have already been indexed? (same as fields, but in a map)
273	my $specified_fields = {};
274	foreach my $field (split (/;/, $fields)) {
275	# only deal with this field if it doesn't start with top or
276	# this is the first section
277	my $real_field = $field;
278	next if (($real_field =~ s/^top//) && ($doc_section != 1));
279
280	my $new_text = "";
281
282	# we get allfields by default
283	next if ($real_field eq "allfields");
284
285	# metadata - output all metadata we know about except gsdl stuff
286	# each metadata is in a separate index field
287	if ($real_field eq "metadata") {
288	# we will process this later, so we are not reindexing metadata already indexed
289	$all_metadata_specified = 1;
290	next;
291	}
292
293	#individual metadata and or text specified - could be
294	# a comma separated list
295	$specified_fields->{$real_field} = 1;
296	my $shortname="";
297
298	if (defined $self->{'fieldnamemap'}->{$real_field}) {
299	$shortname = $self->{'fieldnamemap'}->{$real_field};
300	} else {
301	$shortname = $self->create_shortname($real_field);
302	$self->{'fieldnamemap'}->{$real_field} = $shortname;
303	$self->{'fieldnamemap'}->{$shortname} = 1;
304	}
305
306	my @metadata_list = (); # put any meta values in here
307	my $section_text = ""; # put any text in here
308	foreach my $submeta (split /,/, $real_field) {
309	if ($submeta eq "text") {
310	# no point in indexing text more than once
311	if ($section_text eq "") {
312	$section_text = $doc_obj->get_text($section);
313	if ($self->{'indexing_text'}) {
314	if ($paratag ne "") {
315	# we fiddle around with splitting text into paragraphs
316	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
317	}
318	else {
319	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
320	}
321	}
322	}
323	}
324	else {
325	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
326	# its a metadata element
327	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
328	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
329	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
330	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
331	}
332	}
333	push (@metadata_list, @section_metadata);
334	}
335	} # for each field in index
336
337
338	# now we add the text and/or the metadata into new_text
339	if ($section_text ne "" \|\| scalar(@metadata_list)) {
340	if ($self->{'indexing_text'}) {
341	# only add tags in if indexing
342	$new_text .= "$paratag<$shortname>";
343	}
344	if ($section_text ne "") {
345	$new_text .= "$section_text ";
346	if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
347	$new_text .= "</$shortname>$paratag<$shortname>";
348	}
349	}
350	foreach my $item (@metadata_list) {
351	$new_text .= "$item ";
352	}
353	if ($self->{'indexing_text'}) {
354	# only add tags in if indexing
355	$new_text .= "</$shortname>";
356	$self->{'allindexfields'}->{$real_field} = 1;
357	}
358	}
359
360	# filter the text
361	$new_text = $self->filter_text ($field, $new_text);
362
363	$self->{'num_processed_bytes'} += length ($new_text);
364	$text .= "$new_text";
365	} # foreach field
366
367	if ($all_metadata_specified) {
368	my $new_text = "";
369	my $shortname = "";
370	my $metadata = $doc_obj->get_all_metadata ($section);
371	foreach my $pair (@$metadata) {
372	my ($mfield, $mvalue) = (@$pair);
373	# no value
374	next unless defined $mvalue && $mvalue ne "";
375	# we have already indexed this
376	next if defined ($specified_fields->{$mfield});
377	# check fields here, maybe others dont want - change to use dontindex!!
378	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
379	next if ($mfield =~ /^gsdl/);
380
381	if (defined $self->{'fieldnamemap'}->{$mfield}) {
382	$shortname = $self->{'fieldnamemap'}->{$mfield};
383	} else {
384	$shortname = $self->create_shortname($mfield);
385	$self->{'fieldnamemap'}->{$mfield} = $shortname;
386	$self->{'fieldnamemap'}->{$shortname} = 1;
387	}
388	$self->{'allindexfields'}->{$mfield} = 1;
389	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
390	if (!defined $self->{'extraindexfields'}->{$mfield}) {
391	$self->{'extraindexfields'}->{$mfield} = 1;
392	}
393
394	}
395	# filter the text
396	$new_text = $self->filter_text ("metadata", $new_text);
397
398	$self->{'num_processed_bytes'} += length ($new_text);
399	$text .= "$new_text";
400
401
402	}
403
404	$text .= "$sectionendtag";
405	$section = $doc_obj->get_next_section($section);
406	} # while defined section
407	print $handle "$text\n$documentendtag";
408	#print STDERR "*********\n$text\n*************\n";
409
410	}
411
412	#chooses the first two letters or digits for the shortname
413	#now ignores non-letdig characters
414	sub create_shortname {
415	my $self = shift(@_);
416
417	my ($realname) = @_;
418	my @realnamelist = split(",", $realname);
419	map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces
420	my ($singlename) = $realnamelist[0];
421
422	# try our predefined static mapping
423	my $name;
424	if (defined ($name = $static_indexfield_map{$singlename})) {
425	if (! defined $self->{'fieldnamemap'}->{$name}) {
426	# has this shortname already been used??
427	return $static_indexfield_map{$singlename};
428	}
429	}
430	# we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters.
431	$realname = join ("", @realnamelist);
432	#try the first two chars
433	my $shortname;
434	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
435	$shortname = "$1$2";
436	} else {
437	# there aren't two letdig's in the field - try arbitrary combinations
438	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
439	$shortname = "AB";
440	}
441	$shortname =~ tr/a-z/A-Z/;
442
443	#if already used, take the first and third letdigs and so on
444	my $count = 1;
445	while (defined $self->{'fieldnamemap'}->{$shortname} \|\| defined $static_indexfield_map{$shortname}) {
446	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
447	$shortname = "$1$3";
448	$count++;
449	$shortname =~ tr/a-z/A-Z/;
450
451	}
452	else {
453	#remove up to and incl the first letdig
454	$realname =~ s/^[^\w]*\w//;
455	$count = 0;
456	}
457	}
458
459	return $shortname;
460	}
461
462	1;
463

Note: See TracBrowser for help on using the repository browser.

Download in other formats: