source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 33235

Last change on this file since 33235 was 33144, checked in by wy59, 5 years ago

Related to commit 33128 where we claimed we had reserved CD, CS index short names for Coordinate and CoordShort in Greenstone perl code. We hadn't yet because we couldn't see Latitude/LatShort and Longitude/LngShort reserved yet and weren't sure if we needed to add it in. With Dr Bainbridge's confirmation that we were on the right track, we've now reserved all of these index short names (CD,CS,LT,LO,LA,LN).

  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33use cnseg;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39BEGIN {
40 @mgppbuildproc::ISA = ('basebuildproc');
41}
42
43#this must be the same as in mgppbuilder
44our %level_map = ('document'=>'Doc',
45 'section'=>'Sec',
46 'paragraph'=>'Para');
47
48# change this so a user can add their own ones in via a file or cfg
49#add AND, OR, NOT NEAR to this list - these cannot be used as field names
50#also add the level names (Doc, Sec, Para)
51our %static_indexfield_map = ('Title'=>'TI',
52 'TI'=>1,
53 'Subject'=>'SU',
54 'SU'=>1,
55 'Creator'=>'CR',
56 'CR'=>1,
57 'Organization'=>'ORG',
58 'ORG'=>1,
59 'Source'=>'SO',
60 'SO'=>1,
61 'Howto'=>'HT',
62 'HT'=>1,
63 'ItemTitle'=>'IT',
64 'IT'=>1,
65 'ProgNumber'=>'PN',
66 'PN'=>1,
67 'People'=>'PE',
68 'PE'=>1,
69 'Coverage'=>'CO',
70 'CO'=>1,
71 'allfields'=>'ZZ',
72 'ZZ'=>1,
73 'text'=>'TX',
74 'TX'=>1,
75 'Coordinate'=>'CD',
76 'CD'=>1,
77 'CoordShort'=>'CS',
78 'CS'=>1,
79 'Latitude'=>'LT',
80 'LT'=>1,
81 'Longitude'=>'LO',
82 'LO'=>1,
83 'LatShort'=>'LA',
84 'LA'=>1,
85 'LngShort'=>'LN',
86 'LN'=>1,
87 'AND'=>1,
88 'OR'=>1,
89 'NOT'=>1,
90 'NEAR'=>1,
91 'Doc'=>1,
92 'Sec'=>1,
93 'Para'=>1);
94
95
96sub new {
97 my $class = shift @_;
98 my $self = new basebuildproc (@_);
99
100 # use a different index specification to the default
101 $self->{'index'} = "text";
102
103 $self->{'dontindex'} = {};
104 $self->{'allindexfields'} = {}; # list of all actually indexed fields
105 $self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified.
106 $self->{'fieldnamemap'} = {'allfields'=>'ZZ',
107 'ZZ'=>1,
108 'text'=>'TX',
109 'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not.
110 $self->{'strip_html'}=1;
111
112 return bless $self, $class;
113}
114
115sub set_levels {
116 my $self = shift (@_);
117 my ($levels) = @_;
118
119 $self->{'levels'} = $levels;
120}
121
122sub set_strip_html {
123 my $self = shift (@_);
124 my ($strip) = @_;
125 $self->{'strip_html'}=$strip;
126}
127
128#sub find_paragraphs {
129# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
130#}
131
132sub remove_gtlt {
133 my $self =shift(@_);
134 my ($text, $para) = @_;
135 $text =~s/[<>]//g;
136 return "$para$text$para";
137}
138
139sub process_tags {
140 my $self = shift(@_);
141 my ($text, $para) = @_;
142 if ($text =~ /^p\b/i) {
143 return $para;
144 }
145 return "";
146}
147
148sub preprocess_text {
149 my $self = shift (@_);
150 my ($text, $strip_html, $para) = @_;
151 # at this stage, we do not do paragraph tags unless have strip_html -
152 # it will result in a huge mess of non-xml
153 return unless $strip_html;
154
155 my $new_text = $text;
156
157 # if we have <pre> tags, we can have < > inside them, need to delete
158 # the <> before stripping tags
159 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
160
161 if ($para eq "") {
162 # just remove all tags
163 $new_text =~ s/<[^>]*>//gs;
164 } else {
165 # strip all tags except <p> tags which get turned into $para
166 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
167
168 }
169 return $new_text;
170}
171#this function strips the html tags from the doc if ($strip_html) and
172# if ($para) replaces <p> with <Paragraph> tags.
173# if both are false, the original text is returned
174#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
175#these tags
176sub preprocess_text_old_and_slow {
177 my $self = shift (@_);
178 my ($text, $strip_html, $para) = @_;
179 my ($outtext) = "";
180 if ($strip_html) {
181 while ($text =~ /<([^>]*)>/ && $text ne "") {
182
183 my $tag = $1;
184 $outtext .= $`." "; #add everything before the matched tag
185 $text = $'; #'everything after the matched tag
186 if ($para && $tag =~ /^\s*p\s/i) {
187 $outtext .= $para;
188 }
189 elsif ($tag =~ /^pre$/) { # a pre tag
190 $text =~ /<\/pre>/; # find the closing pre tag
191 my $tmp_text = $`; #everything before the closing pre tag
192 $text = $'; #'everything after the </pre>
193 $tmp_text =~ s/[<>]//g; # remove all < and >
194 $outtext.= $tmp_text . " ";
195 }
196 }
197
198 $outtext .= $text; # add any remaining text
199 return $outtext;
200 } #if strip_html
201
202 #if ($para) {
203 #$text =~ s/(<p\b)/$para$1/gi;
204 #return $text;
205 # }
206 return $text;
207}
208
209sub text {
210 my $self = shift (@_);
211 my ($doc_obj) = @_;
212 my $handle = $self->{'output_handle'};
213 my $outhandle = $self->{'outhandle'};
214
215 # only output this document if it is one to be indexed
216 return if ($doc_obj->get_doc_type() ne "indexed_doc");
217
218 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
219
220 # this is another document
221 $self->{'num_docs'} += 1;
222
223 # get the parameters for the output
224 # split on : just in case there is subcoll and lang stuff
225 my ($fields) = split (/:/, $self->{'index'});
226
227 # we always do text and index on Doc and Sec levels
228 my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
229 my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
230 my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
231 my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
232
233 my ($paratag) = "";
234
235 # paragraph tags will only be used for indexing (can't retrieve
236 # paragraphs), and can ony be used if we are stripping HTML tags
237 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
238 if ($self->{'strip_html'}) {
239 $paratag = "<". $level_map{'paragraph'} . ">";
240 } else {
241 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
242 }
243 }
244
245 my $doc_section = 0; # just for this document
246
247 my $text = $documenttag;
248
249 # get the text for this document
250 my $section = $doc_obj->get_top_section();
251
252 while (defined $section) {
253 # update a few statistics
254 $doc_section++;
255 $self->{'num_sections'} += 1;
256 $text .= "$sectiontag";
257
258 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
259 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
260 # we are not actually indexing anything for this document,
261 # but we want to keep the section numbers the same, so we just
262 # output section tags for each section (which is done above)
263 $text .= "$sectionendtag";
264 $section = $doc_obj->get_next_section($section);
265 next;
266 }
267
268 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
269
270 # has the user added a 'metadata' index?
271 my $all_metadata_specified = 0;
272 # which fields have already been indexed? (same as fields, but in a map)
273 my $specified_fields = {};
274 foreach my $field (split (/;/, $fields)) {
275 # only deal with this field if it doesn't start with top or
276 # this is the first section
277 my $real_field = $field;
278 next if (($real_field =~ s/^top//) && ($doc_section != 1));
279
280 my $new_text = "";
281
282 # we get allfields by default
283 next if ($real_field eq "allfields");
284
285 # metadata - output all metadata we know about except gsdl stuff
286 # each metadata is in a separate index field
287 if ($real_field eq "metadata") {
288 # we will process this later, so we are not reindexing metadata already indexed
289 $all_metadata_specified = 1;
290 next;
291 }
292
293 #individual metadata and or text specified - could be
294 # a comma separated list
295 $specified_fields->{$real_field} = 1;
296 my $shortname="";
297
298 if (defined $self->{'fieldnamemap'}->{$real_field}) {
299 $shortname = $self->{'fieldnamemap'}->{$real_field};
300 } else {
301 $shortname = $self->create_shortname($real_field);
302 $self->{'fieldnamemap'}->{$real_field} = $shortname;
303 $self->{'fieldnamemap'}->{$shortname} = 1;
304 }
305
306 my @metadata_list = (); # put any meta values in here
307 my $section_text = ""; # put any text in here
308 foreach my $submeta (split /,/, $real_field) {
309 if ($submeta eq "text") {
310 # no point in indexing text more than once
311 if ($section_text eq "") {
312 $section_text = $doc_obj->get_text($section);
313 if ($self->{'indexing_text'}) {
314 if ($paratag ne "") {
315 # we fiddle around with splitting text into paragraphs
316 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
317 }
318 else {
319 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
320 }
321 }
322 }
323 }
324 else {
325 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
326 # its a metadata element
327 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
328 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
329 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
330 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
331 }
332 }
333 push (@metadata_list, @section_metadata);
334 }
335 } # for each field in index
336
337
338 # now we add the text and/or the metadata into new_text
339 if ($section_text ne "" || scalar(@metadata_list)) {
340 if ($self->{'indexing_text'}) {
341 # only add tags in if indexing
342 $new_text .= "$paratag<$shortname>";
343 }
344 if ($section_text ne "") {
345 $new_text .= "$section_text ";
346 if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
347 $new_text .= "</$shortname>$paratag<$shortname>";
348 }
349 }
350 foreach my $item (@metadata_list) {
351 $new_text .= "$item ";
352 }
353 if ($self->{'indexing_text'}) {
354 # only add tags in if indexing
355 $new_text .= "</$shortname>";
356 $self->{'allindexfields'}->{$real_field} = 1;
357 }
358 }
359
360 # filter the text
361 $new_text = $self->filter_text ($field, $new_text);
362
363 $self->{'num_processed_bytes'} += length ($new_text);
364 $text .= "$new_text";
365 } # foreach field
366
367 if ($all_metadata_specified) {
368 my $new_text = "";
369 my $shortname = "";
370 my $metadata = $doc_obj->get_all_metadata ($section);
371 foreach my $pair (@$metadata) {
372 my ($mfield, $mvalue) = (@$pair);
373 # no value
374 next unless defined $mvalue && $mvalue ne "";
375 # we have already indexed this
376 next if defined ($specified_fields->{$mfield});
377 # check fields here, maybe others dont want - change to use dontindex!!
378 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
379 next if ($mfield =~ /^gsdl/);
380
381 if (defined $self->{'fieldnamemap'}->{$mfield}) {
382 $shortname = $self->{'fieldnamemap'}->{$mfield};
383 } else {
384 $shortname = $self->create_shortname($mfield);
385 $self->{'fieldnamemap'}->{$mfield} = $shortname;
386 $self->{'fieldnamemap'}->{$shortname} = 1;
387 }
388 $self->{'allindexfields'}->{$mfield} = 1;
389 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
390 if (!defined $self->{'extraindexfields'}->{$mfield}) {
391 $self->{'extraindexfields'}->{$mfield} = 1;
392 }
393
394 }
395 # filter the text
396 $new_text = $self->filter_text ("metadata", $new_text);
397
398 $self->{'num_processed_bytes'} += length ($new_text);
399 $text .= "$new_text";
400
401
402 }
403
404 $text .= "$sectionendtag";
405 $section = $doc_obj->get_next_section($section);
406 } # while defined section
407 print $handle "$text\n$documentendtag";
408 #print STDERR "***********\n$text\n***************\n";
409
410}
411
412#chooses the first two letters or digits for the shortname
413#now ignores non-letdig characters
414sub create_shortname {
415 my $self = shift(@_);
416
417 my ($realname) = @_;
418 my @realnamelist = split(",", $realname);
419 map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces
420 my ($singlename) = $realnamelist[0];
421
422 # try our predefined static mapping
423 my $name;
424 if (defined ($name = $static_indexfield_map{$singlename})) {
425 if (! defined $self->{'fieldnamemap'}->{$name}) {
426 # has this shortname already been used??
427 return $static_indexfield_map{$singlename};
428 }
429 }
430 # we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters.
431 $realname = join ("", @realnamelist);
432 #try the first two chars
433 my $shortname;
434 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
435 $shortname = "$1$2";
436 } else {
437 # there aren't two letdig's in the field - try arbitrary combinations
438 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
439 $shortname = "AB";
440 }
441 $shortname =~ tr/a-z/A-Z/;
442
443 #if already used, take the first and third letdigs and so on
444 my $count = 1;
445 while (defined $self->{'fieldnamemap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
446 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
447 $shortname = "$1$3";
448 $count++;
449 $shortname =~ tr/a-z/A-Z/;
450
451 }
452 else {
453 #remove up to and incl the first letdig
454 $realname =~ s/^[^\w]*\w//;
455 $count = 0;
456 }
457 }
458
459 return $shortname;
460}
461
4621;
463
Note: See TracBrowser for help on using the repository browser.