Context Navigation

source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 33302

Last change on this file since 33302 was 33302, checked in by ak19, 5 years ago
1. Adding GPSMapOverlayLabel extracted from GPS.mapOverlay meta to text indexes for searching, as with Coordinate and CoordShort. 2. Added a shortname for this index, ML for MapLabel. 3. On testing the indexing of the GPSMapOverlayLabel text, the old problem of increasingly duplicated Coordinate/CoordShort and now also GPSMapOverlayLabel meta in the infodb reappeared. Dr Bainbridge explained why this was (documented as comments in this commit) and fixed the problem by not processing GPS.mapOverlay meta into Coordinate and Label meta during the infodb pass (and dummy pass, so specifically non-text passes) of buildcol. A natural consequence is that to check whether Coord and Label meta have been indexed, one can no longer check the index/text/col.jdb but needs to use Luke (if a lucene collection) to check contents of index/sidx and index/didx. 4. An important change needed for the bugfix in 3 is reordering call to &classify::reconstruct_doc_objs_metadata() in basebuilder.pm to take place AFTER build_proc->set_mode(infodb) has taken place. 5. Changed cross-files global variables declared in doc.pm from our to my variables and tested this works.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.5 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	use basebuildproc;
33	use cnseg;
34
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Set up inheritance at compile time: this package derives from basebuildproc.
BEGIN {
    @mgppbuildproc::ISA = qw(basebuildproc);
}
42
# Maps each indexing level name to the tag name used for it.
# This must be the same as in mgppbuilder.
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
);
47
# Predefined mapping from well-known index field names to their short names.
# Each short name is also entered as a key (with value 1) so that it counts
# as "already taken", as are the words that can never be field names:
# the query operators AND, OR, NOT, NEAR and the level names Doc, Sec, Para.
#
# change this so a user can add their own ones in via a file or cfg
our %static_indexfield_map = (
    # full name            => short name, short name => reserved
    'Title'              => 'TI',  'TI'  => 1,
    'Subject'            => 'SU',  'SU'  => 1,
    'Creator'            => 'CR',  'CR'  => 1,
    'Organization'       => 'ORG', 'ORG' => 1,
    'Source'             => 'SO',  'SO'  => 1,
    'Howto'              => 'HT',  'HT'  => 1,
    'ItemTitle'          => 'IT',  'IT'  => 1,
    'ProgNumber'         => 'PN',  'PN'  => 1,
    'People'             => 'PE',  'PE'  => 1,
    'Coverage'           => 'CO',  'CO'  => 1,
    'allfields'          => 'ZZ',  'ZZ'  => 1,
    'text'               => 'TX',  'TX'  => 1,
    'GPSMapOverlayLabel' => 'ML',  'ML'  => 1,
    'Coordinate'         => 'CD',  'CD'  => 1,
    'CoordShort'         => 'CS',  'CS'  => 1,
    'Latitude'           => 'LT',  'LT'  => 1,
    'Longitude'          => 'LO',  'LO'  => 1,
    'LatShort'           => 'LA',  'LA'  => 1,
    'LngShort'           => 'LN',  'LN'  => 1,

    # query operators - these cannot be used as field names
    'AND'  => 1,
    'OR'   => 1,
    'NOT'  => 1,
    'NEAR' => 1,

    # level names
    'Doc'  => 1,
    'Sec'  => 1,
    'Para' => 1,
);
96
97
# Constructor.
#
# Builds the object with basebuildproc's constructor (all arguments are
# passed through unchanged), then sets the fields specific to this build
# processor and blesses the result into $class.
#
# Returns: the new object.
sub new {
    my $class = shift @_;

    # Explicit class-method call. The indirect-object form
    # ("new basebuildproc (...)") is ambiguous to the parser, especially
    # inside a package that defines its own new().
    my $self = basebuildproc->new(@_);

    # use a different index specification to the default
    $self->{'index'} = "text";

    $self->{'dontindex'}        = {};
    $self->{'allindexfields'}   = {};    # list of all actually indexed fields
    $self->{'extraindexfields'} = {};    # indexed fields not specified in original index list - ie if 'metadata' was specified.

    # mapping between index full names and short names. Once we have decided
    # on a mapping it goes in here, whether we have indexed something or not.
    $self->{'fieldnamemap'} = {
        'allfields' => 'ZZ',
        'ZZ'        => 1,
        'text'      => 'TX',
        'TX'        => 1,
    };
    $self->{'strip_html'} = 1;

    return bless $self, $class;
}
116
# Stores the given levels value on the object under 'levels'.
# (text() looks up $self->{'levels'}->{'paragraph'}, so a hash ref is expected.)
sub set_levels {
    my ($self, $levels) = @_;

    return $self->{'levels'} = $levels;
}
123
# Stores the given flag on the object under 'strip_html'.
sub set_strip_html {
    my ($self, $strip) = @_;

    return $self->{'strip_html'} = $strip;
}
129
130	#sub find_paragraphs {
131	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
132	#}
133
# Deletes every '<' and '>' character from $text and returns the result
# with $para placed both before and after it.
sub remove_gtlt {
    my ($self, $text, $para) = @_;

    (my $cleaned = $text) =~ tr/<>//d;
    return $para . $cleaned . $para;
}
140
# Decides what replaces a tag. $text is the tag's contents (the part between
# '<' and '>'): a tag starting with the word "p" (any case) becomes $para,
# every other tag becomes the empty string.
sub process_tags {
    my ($self, $text, $para) = @_;

    return ($text =~ /^p\b/i) ? $para : "";
}
149
# Prepares section text for output.
#
# Parameters:
#   $text       - the text to process
#   $strip_html - if false, $text is returned unchanged
#   $para       - string that replaces each <p> tag; "" means "just remove
#                 all tags"
#
# Returns: the processed text.
#
# When $strip_html is true:
#   * the contents of each <pre>...</pre> pair have their '<' and '>'
#     removed (via remove_gtlt) before the tags are stripped;
#   * all tags are removed, except that with a non-empty $para the <p> tags
#     are turned into $para (via process_tags).
sub preprocess_text {
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml.
    # Hand the text back untouched rather than returning nothing: callers
    # assign the result, so a bare return would discard the text.
    return $text unless $strip_html;

    $para = "" unless defined $para;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>//gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    return $new_text;
}
# Older, slower variant of preprocess_text.
#
# Parameters:
#   $text       - the text to process
#   $strip_html - if false, the original text is returned
#   $para       - if true, this string is emitted for <p ...> tags
#
# When $strip_html is true every tag is replaced by a single space.
# Assumes that <pre> and </pre> have no spaces, and removes all < and >
# found between them.
#
# NOTE(review): the paragraph test requires whitespace after the "p", so a
# bare <p> tag is not converted to $para here - confirm whether that is
# intended.
sub preprocess_text_old_and_slow {
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    return $text unless $strip_html;

    my $outtext = "";

    # Peel off one tag per iteration: text before the tag, the tag's
    # contents, and everything after it. Explicit captures are used instead
    # of $` and $', which keep their values from the last *successful*
    # match and are therefore unsafe after a match that may fail.
    while ($text =~ /^(.*?)<([^>]*)>(.*)\z/s) {
        my ($before, $tag, $after) = ($1, $2, $3);

        $outtext .= $before . " ";    # add everything before the matched tag
        $text = $after;               # carry on with everything after it

        if ($para && $tag =~ /^\s*p\s/i) {
            $outtext .= $para;
        }
        elsif ($tag =~ /^pre$/) {     # a pre tag
            # Only consume the <pre> body when the closing tag really
            # exists; an unclosed <pre> is treated like any other tag.
            if ($text =~ /^(.*?)<\/pre>(.*)\z/s) {
                my ($pre_body, $rest) = ($1, $2);
                $pre_body =~ s/[<>]//g;    # remove all < and >
                $outtext .= $pre_body . " ";
                $text = $rest;             # everything after the </pre>
            }
        }
    }

    $outtext .= $text;    # add any remaining text
    return $outtext;
}
210
# Writes the indexable text of one document to $self->{'output_handle'}.
#
# Parameter:
#   $doc_obj - the document object to process
#
# Nothing is written unless the document's type is "indexed_doc". Otherwise
# the output is one document-level element (tag names come from %level_map)
# containing one section-level element per section of the document. Inside
# each section, every field named in $self->{'index'} is emitted; when
# $self->{'indexing_text'} is set the field content is wrapped in a tag
# named after the field's short name (looked up in, or added to,
# $self->{'fieldnamemap'}).
#
# Index specification, as parsed below: the part of $self->{'index'} before
# the first ':' is a ';'-separated list of fields. A field may be
#   * "allfields"   - skipped here;
#   * "metadata"    - all remaining metadata of the section, each value in
#                     its own field, handled after the named fields;
#   * a ','-separated list of metadata names and/or "text";
#   * any of the above prefixed with "top" - only used for the first section.
#
# Counters updated on $self: num_docs, num_sections, num_bytes,
# num_processed_bytes. Fields seen are recorded in allindexfields and
# extraindexfields.
#
# is_subcollection_doc() and filter_text() are not defined in this file.
211	sub text {
212	my $self = shift (@_);
213	my ($doc_obj) = @_;
214	my $handle = $self->{'output_handle'};
215	my $outhandle = $self->{'outhandle'};
216
217	# only output this document if it is one to be indexed
218	return if ($doc_obj->get_doc_type() ne "indexed_doc");
219
220	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
221
222	# this is another document
223	$self->{'num_docs'} += 1;
224
225	# get the parameters for the output
226	# split on : just in case there is subcoll and lang stuff
227	my ($fields) = split (/:/, $self->{'index'});
228
229	# we always do text and index on Doc and Sec levels
230	my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
231	my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
232	my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
233	my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
234
235	my ($paratag) = "";
236
237	# paragraph tags will only be used for indexing (can't retrieve
238	# paragraphs), and can only be used if we are stripping HTML tags
239	if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
240	if ($self->{'strip_html'}) {
241	$paratag = "<". $level_map{'paragraph'} . ">";
242	} else {
243	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
244	}
245	}
246
247	my $doc_section = 0; # just for this document
248
249	my $text = $documenttag;
250
251	# get the text for this document
252	my $section = $doc_obj->get_top_section();
253
254	while (defined $section) {
255	# update a few statistics
256	$doc_section++;
257	$self->{'num_sections'} += 1;
258	$text .= "$sectiontag";
259
# a section without gsdldoctype metadata is treated as "indexed_section"
260	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
261	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
262	# we are not actually indexing anything for this document,
263	# but we want to keep the section numbers the same, so we just
264	# output section tags for each section (which is done above)
265	$text .= "$sectionendtag";
266	$section = $doc_obj->get_next_section($section);
267	next;
268	}
269
270	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
271
272	# has the user added a 'metadata' index?
273	my $all_metadata_specified = 0;
274	# which fields have already been indexed? (same as fields, but in a map)
275	my $specified_fields = {};
276	foreach my $field (split (/;/, $fields)) {
277	# only deal with this field if it doesn't start with top or
278	# this is the first section
279	my $real_field = $field;
280	next if (($real_field =~ s/^top//) && ($doc_section != 1));
281
282	my $new_text = "";
283
284	# we get allfields by default
285	next if ($real_field eq "allfields");
286
287	# metadata - output all metadata we know about except gsdl stuff
288	# each metadata is in a separate index field
289	if ($real_field eq "metadata") {
290	# we will process this later, so we are not reindexing metadata already indexed
291	$all_metadata_specified = 1;
292	next;
293	}
294
295	#individual metadata and or text specified - could be
296	# a comma separated list
297	$specified_fields->{$real_field} = 1;
298	my $shortname="";
299
# reuse the short name already chosen for this field, or create and
# remember a new one (the short name itself is also marked as taken)
300	if (defined $self->{'fieldnamemap'}->{$real_field}) {
301	$shortname = $self->{'fieldnamemap'}->{$real_field};
302	} else {
303	$shortname = $self->create_shortname($real_field);
304	$self->{'fieldnamemap'}->{$real_field} = $shortname;
305	$self->{'fieldnamemap'}->{$shortname} = 1;
306	}
307
308	my @metadata_list = (); # put any meta values in here
309	my $section_text = ""; # put any text in here
310	foreach my $submeta (split /,/, $real_field) {
311	if ($submeta eq "text") {
312	# no point in indexing text more than once
313	if ($section_text eq "") {
314	$section_text = $doc_obj->get_text($section);
315	if ($self->{'indexing_text'}) {
316	if ($paratag ne "") {
317	# we fiddle around with splitting text into paragraphs
# each <p> closes the current field element, opens a new paragraph
# and reopens the field element
318	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
319	}
320	else {
321	$section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
322	}
323	}
324	}
325	}
326	else {
327	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
328	# its a metadata element
329	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
# for sections below the top one, optionally add the top section's values
# too: always, or only when the section has no values of its own,
# depending on the sections_index_document_metadata setting
330	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
331	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
332	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
333	}
334	}
335	push (@metadata_list, @section_metadata);
336	}
337	} # for each field in index
338
339
340	# now we add the text and/or the metadata into new_text
341	if ($section_text ne "" \|\| scalar(@metadata_list)) {
342	if ($self->{'indexing_text'}) {
343	# only add tags in if indexing
344	$new_text .= "$paratag<$shortname>";
345	}
346	if ($section_text ne "") {
347	$new_text .= "$section_text ";
# when using paragraphs, metadata that follows the text goes into a
# paragraph of its own
348	if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
349	$new_text .= "</$shortname>$paratag<$shortname>";
350	}
351	}
352	foreach my $item (@metadata_list) {
353	$new_text .= "$item ";
354	}
355	if ($self->{'indexing_text'}) {
356	# only add tags in if indexing
357	$new_text .= "</$shortname>";
358	$self->{'allindexfields'}->{$real_field} = 1;
359	}
360	}
361
362	# filter the text
363	$new_text = $self->filter_text ($field, $new_text);
364
365	$self->{'num_processed_bytes'} += length ($new_text);
366	$text .= "$new_text";
367	} # foreach field
368
369	if ($all_metadata_specified) {
370	my $new_text = "";
371	my $shortname = "";
372	my $metadata = $doc_obj->get_all_metadata ($section);
# each entry is used as a (field name, value) pair
373	foreach my $pair (@$metadata) {
374	my ($mfield, $mvalue) = (@$pair);
375	# no value
376	next unless defined $mvalue && $mvalue ne "";
377	# we have already indexed this
378	next if defined ($specified_fields->{$mfield});
379	# check fields here, maybe others dont want - change to use dontindex!!
380	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
381	next if ($mfield =~ /^gsdl/);
382
383	if (defined $self->{'fieldnamemap'}->{$mfield}) {
384	$shortname = $self->{'fieldnamemap'}->{$mfield};
385	} else {
386	$shortname = $self->create_shortname($mfield);
387	$self->{'fieldnamemap'}->{$mfield} = $shortname;
388	$self->{'fieldnamemap'}->{$shortname} = 1;
389	}
390	$self->{'allindexfields'}->{$mfield} = 1;
391	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
# remember fields that were indexed only because 'metadata' was specified
392	if (!defined $self->{'extraindexfields'}->{$mfield}) {
393	$self->{'extraindexfields'}->{$mfield} = 1;
394	}
395
396	}
397	# filter the text
398	$new_text = $self->filter_text ("metadata", $new_text);
399
400	$self->{'num_processed_bytes'} += length ($new_text);
401	$text .= "$new_text";
402
403
404	}
405
406	$text .= "$sectionendtag";
407	$section = $doc_obj->get_next_section($section);
408	} # while defined section
409	print $handle "$text\n$documentendtag";
410	#print STDERR "*********\n$text\n*************\n";
411
412	}
413
# Chooses a short name (normally two upper-cased characters) for an index
# field.
#
# Parameter:
#   $realname - the field's full name; may be a ','-separated list of
#               metadata names, each optionally prefixed by a namespace
#               ("dc.Title"). Namespaces are ignored.
#
# Returns: a short name that is neither a key of $self->{'fieldnamemap'}
# nor a key of %mgppbuildproc::static_indexfield_map. The caller is
# responsible for recording the returned name in fieldnamemap.
#
# Dies if every candidate short name is already in use.
#
# Strategy:
#   1. use the predefined short name of the first listed field, if it has
#      one and that short name is still free;
#   2. otherwise take the first two letters/digits of the joined names,
#      ignoring any other characters;
#   3. if that is taken, try the first and third, first and fourth, ...,
#      then drop the first letter/digit and start over;
#   4. once the name is used up, go through the same procedure with the
#      alphabet, and finally through every two-character combination of
#      A-Z and 0-9.
sub create_shortname {
    my $self = shift(@_);
    my ($realname) = @_;

    $realname = "" unless defined $realname;
    my $original_name = $realname;    # only used in the error message

    my @realnamelist = split(",", $realname);
    s/^[a-zA-Z]+\.// foreach @realnamelist;    # remove namespaces
    my $singlename = $realnamelist[0];

    # The package variable is referenced by its full name so the lookup
    # does not depend on which package this code is compiled in.
    my $static_map = \%mgppbuildproc::static_indexfield_map;

    # a candidate is unusable if it is already mapped or reserved
    my $is_taken = sub {
        my ($candidate) = @_;
        return defined $self->{'fieldnamemap'}->{$candidate}
            || defined $static_map->{$candidate};
    };

    # try our predefined static mapping
    if (defined $singlename) {
        my $name = $static_map->{$singlename};
        # A value of 1 only marks the key as reserved (a short name, a
        # query operator or a level name); it is not itself a short name.
        if (defined $name && $name ne '1'
            && !defined $self->{'fieldnamemap'}->{$name}) {
            return $name;
        }
    }

    # we can't use the quick map, so join all fields back together (without
    # namespaces), and try sets of two characters.
    my $alphabet       = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    my $tried_alphabet = 0;
    $realname = join ("", @realnamelist);

    # try the first two letdigs, skipping anything that is not a letdig
    my $shortname;
    if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
        $shortname = "$1$2";
    }
    else {
        # there aren't two letdig's in the field - try arbitrary combinations
        $realname       = $alphabet;
        $tried_alphabet = 1;
        $shortname      = "AB";
    }
    $shortname =~ tr/a-z/A-Z/;

    # if already used, take the first and third letdigs and so on
    my $count = 1;
    while ($is_taken->($shortname)) {
        if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
            $shortname = "$1$3";
            $count++;
            $shortname =~ tr/a-z/A-Z/;
            next;
        }

        # no letdig that far along: remove up to and incl the first letdig
        # and pair up what is left
        $realname =~ s/^[^\w]*\w//;
        $count = 0;

        # Fewer than two letdigs left means this name cannot yield any more
        # candidates; without a fallback the loop would never end.
        next if $realname =~ /\w[^\w]*\w/;

        if (!$tried_alphabet) {
            $realname       = $alphabet;
            $tried_alphabet = 1;
            next;
        }

        # last resort: any free two-character combination of A-Z and 0-9
        my @symbols = ('A' .. 'Z', '0' .. '9');
        foreach my $first (@symbols) {
            foreach my $second (@symbols) {
                my $candidate = $first . $second;
                return $candidate unless $is_taken->($candidate);
            }
        }
        die "mgppbuildproc::create_shortname: no unused short name left for field '$original_name'\n";
    }

    return $shortname;
}
463
464	1;
465

Note: See TracBrowser for help on using the repository browser.

Download in other formats: