source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 33302

Last change on this file since 33302 was 33302, checked in by ak19, 5 years ago
  1. Adding GPSMapOverlayLabel extracted from GPS.mapOverlay meta to text indexes for searching, as with Coordinate and CoordShort. 2. Added a shortname for this index, ML for MapLabel. 3. On testing the indexing of the GPSMapOverlayLabel text, the old problem of increasingly duplicated Coordinate/CoordShort and now also GPSMapOverlayLabel meta in the infodb reappeared. Dr Bainbridge explained why this was (documented as comments in this commit) and fixed the problem by not processing GPS.mapOverlay meta into Coordinate and Label meta during the infodb pass (and dummy pass, so specifically non-text passes) of buildcol. A natural consequence is that to check whether Coord and Label meta have been indexed, one can no longer check the index/text/col.jdb but needs to use Luke (if a lucene collection) to check contents of index/sidx and index/didx. 4. An important change needed for the bugfix in 3 is reordering the call to &classify::reconstruct_doc_objs_metadata() in basebuilder.pm to take place AFTER build_proc->set_mode(infodb) has taken place. 5. Changed cross-files global variables declared in doc.pm from our to my variables and tested this works.
  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33use cnseg;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Make this class a subclass of basebuildproc.  The assignment lives in a
# BEGIN block so that it runs while the file is being compiled.
BEGIN {
    @mgppbuildproc::ISA = qw(basebuildproc);
}
42
# Maps each index level name to the tag name used for that level in the
# text handed to mgpp.
# This must be the same as in mgppbuilder.
our %level_map = (
    'document'  => 'Doc',
    'section'   => 'Sec',
    'paragraph' => 'Para',
);
47
# Predefined short names for commonly used index fields.
#
# Every full field name maps to its short name, and every short name maps
# to 1, which marks it as taken.  The query operators (AND, OR, NOT, NEAR)
# and the level names (Doc, Sec, Para) are marked as taken in the same way,
# because these cannot be used as field names.
#
# TODO: change this so a user can add their own ones in via a file or cfg
our %static_indexfield_map = do {
    my %map = (
        'Title'              => 'TI',
        'Subject'            => 'SU',
        'Creator'            => 'CR',
        'Organization'       => 'ORG',
        'Source'             => 'SO',
        'Howto'              => 'HT',
        'ItemTitle'          => 'IT',
        'ProgNumber'         => 'PN',
        'People'             => 'PE',
        'Coverage'           => 'CO',
        'allfields'          => 'ZZ',
        'text'               => 'TX',
        'GPSMapOverlayLabel' => 'ML',
        'Coordinate'         => 'CD',
        'CoordShort'         => 'CS',
        'Latitude'           => 'LT',
        'Longitude'          => 'LO',
        'LatShort'           => 'LA',
        'LngShort'           => 'LN',
    );

    # Snapshot the names to reserve before adding any keys to the map.
    my @taken = (values(%map), qw(AND OR NOT NEAR Doc Sec Para));
    foreach my $reserved (@taken) {
        $map{$reserved} = 1;
    }

    %map;
};
96
97
# Constructor.
# All arguments are passed through unchanged to the basebuildproc
# constructor; the resulting object is then given the mgpp specific
# defaults below and blessed into $class.
sub new {
    my $class = shift @_;

    # Call the parent constructor with the arrow form.  The indirect object
    # form ("new basebuildproc (...)") is parsed differently depending on
    # which subs are visible at compile time, so it is avoided here.
    my $self = basebuildproc->new(@_);

    # use a different index specification to the default
    $self->{'index'} = "text";

    $self->{'dontindex'} = {};

    # list of all actually indexed fields
    $self->{'allindexfields'} = {};

    # indexed fields not specified in the original index list - ie if
    # 'metadata' was specified.
    $self->{'extraindexfields'} = {};

    # mapping between index full names and short names. Once we have
    # decided on a mapping it goes in here, whether we have indexed
    # something or not.  Short names map to 1 to mark them as taken.
    $self->{'fieldnamemap'} = {
        'allfields' => 'ZZ',
        'ZZ'        => 1,
        'text'      => 'TX',
        'TX'        => 1,
    };

    $self->{'strip_html'} = 1;

    return bless $self, $class;
}
116
# Stores the levels setting on the object.
# text() looks up the 'paragraph' entry of this value to decide whether
# paragraph tags are wanted.
sub set_levels {
    my ($self, $levels) = @_;

    $self->{'levels'} = $levels;
}
123
# Stores the strip_html flag on the object.  text() passes this flag to
# preprocess_text() when it prepares section text for indexing.
sub set_strip_html {
    my ($self, $strip) = @_;

    $self->{'strip_html'} = $strip;
}
129
130#sub find_paragraphs {
131# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
132#}
133
# Deletes every '<' and '>' character from $text and returns the result
# with $para placed both before and after it.
sub remove_gtlt {
    my ($self, $text, $para) = @_;

    (my $stripped = $text) =~ tr/<>//d;

    return $para . $stripped . $para;
}
140
# Chooses the replacement for one tag.  $text is the tag content without
# its angle brackets.  A tag that starts with the word "p" (in either
# case) is replaced by $para; any other tag is replaced by the empty
# string.
sub process_tags {
    my ($self, $text, $para) = @_;

    return ($text =~ /^p\b/i) ? $para : "";
}
149
# Prepares section text for indexing.
#
#   $text       - the text to process
#   $strip_html - if false, $text is returned unchanged
#   $para       - the string that replaces each <p> tag; if it is empty,
#                 all tags are simply removed
#
# Returns the processed text.
sub preprocess_text {
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml.
    # Hand the text back untouched in that case.  A bare return here would
    # give the caller undef, and the section text would be lost.
    return $text unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>//gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    return $new_text;
}
# Older, slower implementation of preprocess_text().
#
# If $strip_html is true, every tag is removed from $text and replaced by
# a single space.  If $para is also true, a <p> tag additionally gets
# $para put in its place.  If $strip_html is false, the original text is
# returned unchanged.
#
# Assumes that <pre> and </pre> have no spaces, and removes all < and >
# inside these tags.  A <pre> without a closing </pre> is treated as
# running to the end of the text.
sub preprocess_text_old_and_slow {
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    return $text unless $strip_html;

    my $outtext = "";
    while ($text ne "" && $text =~ /<([^>]*)>/) {
        my $tag = $1;

        # Remember where the tag sits before another match overwrites the
        # offsets.  Offsets are used instead of $` and $'.
        my ($tag_start, $tag_end) = ($-[0], $+[0]);

        $outtext .= substr($text, 0, $tag_start) . " "; # add everything before the matched tag
        $text = substr($text, $tag_end);                # everything after the matched tag

        # \b (not \s) so that a bare <p> counts as a paragraph tag too
        if ($para && $tag =~ /^\s*p\b/i) {
            $outtext .= $para;
        }
        elsif ($tag =~ /^pre$/) { # a pre tag
            my $pre_text;
            if ($text =~ /<\/pre>/) { # find the closing pre tag
                $pre_text = substr($text, 0, $-[0]); # everything before the closing pre tag
                $text = substr($text, $+[0]);        # everything after the </pre>
            }
            else {
                # no closing tag: the rest of the text is pre content
                $pre_text = $text;
                $text = "";
            }
            $pre_text =~ s/[<>]//g; # remove all < and >
            $outtext .= $pre_text . " ";
        }
    }

    $outtext .= $text; # add any remaining text
    return $outtext;
}
210
# Builds the mgpp input text for one document and prints it to
# $self->{'output_handle'}.
#
# Documents whose doc type is not "indexed_doc" are skipped entirely.
# Otherwise the output is a document tag wrapping one section tag pair per
# section.  Within each indexed section, every field named in
# $self->{'index'} (a ';' separated list, taken from before the first ':')
# contributes its text and/or metadata, wrapped in the field's short name
# tag when $self->{'indexing_text'} is set.  A field called "metadata"
# adds every remaining metadata element as a field of its own.
sub text {
    my ($self, $doc_obj) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # we always do text and index on Doc and Sec levels
    my $doc_open  = "\n<"  . $level_map{'document'} . ">\n";
    my $doc_close = "\n</" . $level_map{'document'} . ">\n";
    my $sec_open  = "\n<"  . $level_map{'section'} . ">\n";
    my $sec_close = "\n</" . $level_map{'section'} . ">\n";

    # paragraph tags will only be used for indexing (can't retrieve
    # paragraphs), and can only be used if we are stripping HTML tags
    my $paratag = "";
    my $want_paragraphs = $self->{'indexing_text'} && $self->{'levels'}->{'paragraph'};
    if ($want_paragraphs && $self->{'strip_html'}) {
        $paratag = "<" . $level_map{'paragraph'} . ">";
    }
    elsif ($want_paragraphs) {
        print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
    }

    # Looks up the short name for a field, creating and recording a new
    # one (plus its "taken" marker) the first time the field is seen.
    my $shortname_for = sub {
        my ($fieldname) = @_;

        my $known = $self->{'fieldnamemap'}->{$fieldname};
        return $known if defined $known;

        my $fresh = $self->create_shortname($fieldname);
        $self->{'fieldnamemap'}->{$fieldname} = $fresh;
        $self->{'fieldnamemap'}->{$fresh} = 1;
        return $fresh;
    };

    my $doc_section = 0; # just for this document
    my $text = $doc_open;

    # walk the sections of this document in order
  SECTION:
    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section)) {

        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;
        $text .= $sec_open;

        my $section_type = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        my $section_is_indexed = ($section_type eq "indexed_section" || $section_type eq "indexed_doc");
        if (($indexed_doc == 0) || !$section_is_indexed) {
            # we are not actually indexing anything for this document,
            # but we want to keep the section numbers the same, so we just
            # output section tags for each section (the open tag is above)
            $text .= $sec_close;
            next SECTION;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

        # has the user added a 'metadata' index?
        my $all_metadata_specified = 0;
        # which fields have already been indexed? (same as fields, but in a map)
        my %specified_fields = ();

        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            # we get allfields by default
            next if ($real_field eq "allfields");

            # metadata - output all metadata we know about except gsdl stuff
            # each metadata is in a separate index field.
            # we will process this later, so we are not reindexing metadata
            # already indexed
            if ($real_field eq "metadata") {
                $all_metadata_specified = 1;
                next;
            }

            # individual metadata and or text specified - could be
            # a comma separated list
            $specified_fields{$real_field} = 1;
            my $shortname = $shortname_for->($real_field);

            my @metadata_list = (); # put any meta values in here
            my $section_text = "";  # put any text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    next unless ($section_text eq "");

                    $section_text = $doc_obj->get_text($section);
                    if ($self->{'indexing_text'}) {
                        # with paragraph tags we fiddle around with
                        # splitting text into paragraphs
                        my $para_break = ($paratag ne "")
                            ? "</$shortname>$paratag<$shortname>"
                            : "";
                        $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, $para_break);
                    }
                    next;
                }

                # its a metadata element
                # strip off ex. iff it's the only metadata set prefix
                # (will leave ex.dc.* intact)
                $submeta =~ s/^ex\.([^.]+)$/$1/;
                my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};

                if ($section ne $doc_obj->get_top_section()
                    && $self->{'indexing_text'}
                    && defined ($self->{'sections_index_document_metadata'})) {
                    my $inherit = $self->{'sections_index_document_metadata'};
                    if ($inherit eq "always"
                        || (scalar(@section_metadata) == 0 && $inherit eq "unless_section_metadata_exists")) {
                        push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                    }
                }
                push (@metadata_list, @section_metadata);
            } # for each field in index

            # now we add the text and/or the metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                # only add tags in if indexing
                if ($self->{'indexing_text'}) {
                    $new_text .= "$paratag<$shortname>";
                }
                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                    if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
                        $new_text .= "</$shortname>$paratag<$shortname>";
                    }
                }
                $new_text .= join ("", map { "$_ " } @metadata_list);
                # only add tags in if indexing
                if ($self->{'indexing_text'}) {
                    $new_text .= "</$shortname>";
                    $self->{'allindexfields'}->{$real_field} = 1;
                }
            }

            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= $new_text;
        } # foreach field

        if ($all_metadata_specified) {
            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($specified_fields{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                my $shortname = $shortname_for->($mfield);
                $self->{'allindexfields'}->{$mfield} = 1;
                $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
                unless (defined $self->{'extraindexfields'}->{$mfield}) {
                    $self->{'extraindexfields'}->{$mfield} = 1;
                }
            }

            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= $new_text;
        }

        $text .= $sec_close;
    } # for each section

    print $handle "$text\n$doc_close";
}
413
# Chooses a short name for the index field $realname.
#
# $realname may be a comma separated list of field names; a leading
# "namespace." prefix is removed from each one.  The predefined short name
# of the first field is used if there is one and it is still free.
# Otherwise the first two letters or digits of the joined names are tried
# (non-letdig characters are ignored), then the first and third, and so
# on.  If the name runs out of characters, pairs of letters from the
# alphabet are tried instead.
#
# A candidate counts as taken if it is a key of $self->{'fieldnamemap'} or
# of %static_indexfield_map.  The chosen name is returned but not recorded;
# that is left to the caller.  Dies if no free name can be found.
sub create_shortname {
    my $self = shift(@_);

    my ($realname) = @_;
    my $requested_name = $realname; # kept for the error message
    my @realnamelist = split(",", $realname);
    s/^[a-zA-Z]+\.// foreach @realnamelist; # remove namespaces
    my ($singlename) = $realnamelist[0];

    # try our predefined static mapping.
    # A value of 1 is only the "this short name is taken" marker stored
    # against reserved names (TI, AND, Doc, ...); it is not a short name
    # and must never be returned as one.
    my $name = defined $singlename ? $static_indexfield_map{$singlename} : undef;
    if (defined $name && $name ne '1') {
        # has this shortname already been used??
        if (! defined $self->{'fieldnamemap'}->{$name}) {
            return $name;
        }
    }

    # we can't use the quick map, so join all fields back together
    # (without namespaces), and try sets of two characters.
    my $alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    my $using_alphabet = 0;
    $realname = join ("", @realnamelist);

    # try the first two chars
    my $shortname;
    if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
        $shortname = "$1$2";
    } else {
        # there aren't two letdig's in the field - try arbitrary combinations
        $realname = $alphabet;
        $shortname = "AB";
        $using_alphabet = 1;
    }
    $shortname =~ tr/a-z/A-Z/;

    # if already used, take the first and third letdigs and so on
    my $count = 1;
    while (defined $self->{'fieldnamemap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
        if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
            $shortname = "$1$3";
            $count++;
            $shortname =~ tr/a-z/A-Z/;
        }
        else {
            # remove up to and incl the first letdig
            $realname =~ s/^[^\w]*\w//;
            $count = 0;

            # With fewer than two letdigs left no further candidate can be
            # formed, and carrying on would loop forever.  Switch to the
            # alphabet, or give up if that has been used up as well.
            if ($realname !~ /\w[^\w]*\w/) {
                if ($using_alphabet) {
                    die "mgppbuildproc::create_shortname: no unused short name available for '$requested_name'\n";
                }
                $realname = $alphabet;
                $using_alphabet = 1;
            }
        }
    }

    return $shortname;
}
463
4641;
465
Note: See TracBrowser for help on using the repository browser.