source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 31861

Last change on this file since 31861 was 27328, checked in by kjdon, 11 years ago

changed the way we store the list of fields that has been indexed, and the mapping between index and shortname. They are separated now, to avoid calculating a shortname for a field each time a new document is indexed - previously if there was no value, then the shortname was not remembered as it wasn't indexed, so each new document saw the shortname being calculated again. remove namespaces from meta fields before calculating shortnames, to make them more sensible. eg dc.Title->TI instead of DC.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33use cnseg;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39BEGIN {
40 @mgppbuildproc::ISA = ('basebuildproc');
41}
42
43#this must be the same as in mgppbuilder
44our %level_map = ('document'=>'Doc',
45 'section'=>'Sec',
46 'paragraph'=>'Para');
47
48# change this so a user can add their own ones in via a file or cfg
49#add AND, OR, NOT NEAR to this list - these cannot be used as field names
50#also add the level names (Doc, Sec, Para)
51our %static_indexfield_map = ('Title'=>'TI',
52 'TI'=>1,
53 'Subject'=>'SU',
54 'SU'=>1,
55 'Creator'=>'CR',
56 'CR'=>1,
57 'Organization'=>'ORG',
58 'ORG'=>1,
59 'Source'=>'SO',
60 'SO'=>1,
61 'Howto'=>'HT',
62 'HT'=>1,
63 'ItemTitle'=>'IT',
64 'IT'=>1,
65 'ProgNumber'=>'PN',
66 'PN'=>1,
67 'People'=>'PE',
68 'PE'=>1,
69 'Coverage'=>'CO',
70 'CO'=>1,
71 'allfields'=>'ZZ',
72 'ZZ'=>1,
73 'text'=>'TX',
74 'TX'=>1,
75 'AND'=>1,
76 'OR'=>1,
77 'NOT'=>1,
78 'NEAR'=>1,
79 'Doc'=>1,
80 'Sec'=>1,
81 'Para'=>1);
82
83
84sub new {
85 my $class = shift @_;
86 my $self = new basebuildproc (@_);
87
88 # use a different index specification to the default
89 $self->{'index'} = "text";
90
91 $self->{'dontindex'} = {};
92 $self->{'allindexfields'} = {}; # list of all actually indexed fields
93 $self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified.
94 $self->{'fieldnamemap'} = {'allfields'=>'ZZ',
95 'ZZ'=>1,
96 'text'=>'TX',
97 'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not.
98 $self->{'strip_html'}=1;
99
100 return bless $self, $class;
101}
102
103sub set_levels {
104 my $self = shift (@_);
105 my ($levels) = @_;
106
107 $self->{'levels'} = $levels;
108}
109
110sub set_strip_html {
111 my $self = shift (@_);
112 my ($strip) = @_;
113 $self->{'strip_html'}=$strip;
114}
115
116#sub find_paragraphs {
117# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
118#}
119
120sub remove_gtlt {
121 my $self =shift(@_);
122 my ($text, $para) = @_;
123 $text =~s/[<>]//g;
124 return "$para$text$para";
125}
126
127sub process_tags {
128 my $self = shift(@_);
129 my ($text, $para) = @_;
130 if ($text =~ /^p\b/i) {
131 return $para;
132 }
133 return "";
134}
135
136sub preprocess_text {
137 my $self = shift (@_);
138 my ($text, $strip_html, $para) = @_;
139 # at this stage, we do not do paragraph tags unless have strip_html -
140 # it will result in a huge mess of non-xml
141 return unless $strip_html;
142
143 my $new_text = $text;
144
145 # if we have <pre> tags, we can have < > inside them, need to delete
146 # the <> before stripping tags
147 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
148
149 if ($para eq "") {
150 # just remove all tags
151 $new_text =~ s/<[^>]*>//gs;
152 } else {
153 # strip all tags except <p> tags which get turned into $para
154 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
155
156 }
157 return $new_text;
158}
159#this function strips the html tags from the doc if ($strip_html) and
160# if ($para) replaces <p> with <Paragraph> tags.
161# if both are false, the original text is returned
162#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
163#these tags
164sub preprocess_text_old_and_slow {
165 my $self = shift (@_);
166 my ($text, $strip_html, $para) = @_;
167 my ($outtext) = "";
168 if ($strip_html) {
169 while ($text =~ /<([^>]*)>/ && $text ne "") {
170
171 my $tag = $1;
172 $outtext .= $`." "; #add everything before the matched tag
173 $text = $'; #'everything after the matched tag
174 if ($para && $tag =~ /^\s*p\s/i) {
175 $outtext .= $para;
176 }
177 elsif ($tag =~ /^pre$/) { # a pre tag
178 $text =~ /<\/pre>/; # find the closing pre tag
179 my $tmp_text = $`; #everything before the closing pre tag
180 $text = $'; #'everything after the </pre>
181 $tmp_text =~ s/[<>]//g; # remove all < and >
182 $outtext.= $tmp_text . " ";
183 }
184 }
185
186 $outtext .= $text; # add any remaining text
187 return $outtext;
188 } #if strip_html
189
190 #if ($para) {
191 #$text =~ s/(<p\b)/$para$1/gi;
192 #return $text;
193 # }
194 return $text;
195}
196
197sub text {
198 my $self = shift (@_);
199 my ($doc_obj) = @_;
200 my $handle = $self->{'output_handle'};
201 my $outhandle = $self->{'outhandle'};
202
203 # only output this document if it is one to be indexed
204 return if ($doc_obj->get_doc_type() ne "indexed_doc");
205
206 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
207
208 # this is another document
209 $self->{'num_docs'} += 1;
210
211 # get the parameters for the output
212 # split on : just in case there is subcoll and lang stuff
213 my ($fields) = split (/:/, $self->{'index'});
214
215 # we always do text and index on Doc and Sec levels
216 my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
217 my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
218 my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
219 my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
220
221 my ($paratag) = "";
222
223 # paragraph tags will only be used for indexing (can't retrieve
224 # paragraphs), and can ony be used if we are stripping HTML tags
225 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
226 if ($self->{'strip_html'}) {
227 $paratag = "<". $level_map{'paragraph'} . ">";
228 } else {
229 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
230 }
231 }
232
233 my $doc_section = 0; # just for this document
234
235 my $text = $documenttag;
236
237 # get the text for this document
238 my $section = $doc_obj->get_top_section();
239
240 while (defined $section) {
241 # update a few statistics
242 $doc_section++;
243 $self->{'num_sections'} += 1;
244 $text .= "$sectiontag";
245
246 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
247 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
248 # we are not actually indexing anything for this document,
249 # but we want to keep the section numbers the same, so we just
250 # output section tags for each section (which is done above)
251 $text .= "$sectionendtag";
252 $section = $doc_obj->get_next_section($section);
253 next;
254 }
255
256 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
257
258 # has the user added a 'metadata' index?
259 my $all_metadata_specified = 0;
260 # which fields have already been indexed? (same as fields, but in a map)
261 my $specified_fields = {};
262 foreach my $field (split (/;/, $fields)) {
263 # only deal with this field if it doesn't start with top or
264 # this is the first section
265 my $real_field = $field;
266 next if (($real_field =~ s/^top//) && ($doc_section != 1));
267
268 my $new_text = "";
269
270 # we get allfields by default
271 next if ($real_field eq "allfields");
272
273 # metadata - output all metadata we know about except gsdl stuff
274 # each metadata is in a separate index field
275 if ($real_field eq "metadata") {
276 # we will process this later, so we are not reindexing metadata already indexed
277 $all_metadata_specified = 1;
278 next;
279 }
280
281 #individual metadata and or text specified - could be
282 # a comma separated list
283 $specified_fields->{$real_field} = 1;
284 my $shortname="";
285
286 if (defined $self->{'fieldnamemap'}->{$real_field}) {
287 $shortname = $self->{'fieldnamemap'}->{$real_field};
288 } else {
289 $shortname = $self->create_shortname($real_field);
290 $self->{'fieldnamemap'}->{$real_field} = $shortname;
291 $self->{'fieldnamemap'}->{$shortname} = 1;
292 }
293
294 my @metadata_list = (); # put any meta values in here
295 my $section_text = ""; # put any text in here
296 foreach my $submeta (split /,/, $real_field) {
297 if ($submeta eq "text") {
298 # no point in indexing text more than once
299 if ($section_text eq "") {
300 $section_text = $doc_obj->get_text($section);
301 if ($self->{'indexing_text'}) {
302 if ($paratag ne "") {
303 # we fiddle around with splitting text into paragraphs
304 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
305 }
306 else {
307 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
308 }
309 }
310 }
311 }
312 else {
313 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
314 # its a metadata element
315 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
316 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
317 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
318 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
319 }
320 }
321 push (@metadata_list, @section_metadata);
322 }
323 } # for each field in index
324
325
326 # now we add the text and/or the metadata into new_text
327 if ($section_text ne "" || scalar(@metadata_list)) {
328 if ($self->{'indexing_text'}) {
329 # only add tags in if indexing
330 $new_text .= "$paratag<$shortname>";
331 }
332 if ($section_text ne "") {
333 $new_text .= "$section_text ";
334 if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
335 $new_text .= "</$shortname>$paratag<$shortname>";
336 }
337 }
338 foreach my $item (@metadata_list) {
339 $new_text .= "$item ";
340 }
341 if ($self->{'indexing_text'}) {
342 # only add tags in if indexing
343 $new_text .= "</$shortname>";
344 $self->{'allindexfields'}->{$real_field} = 1;
345 }
346 }
347
348 # filter the text
349 $new_text = $self->filter_text ($field, $new_text);
350
351 $self->{'num_processed_bytes'} += length ($new_text);
352 $text .= "$new_text";
353 } # foreach field
354
355 if ($all_metadata_specified) {
356 my $new_text = "";
357 my $shortname = "";
358 my $metadata = $doc_obj->get_all_metadata ($section);
359 foreach my $pair (@$metadata) {
360 my ($mfield, $mvalue) = (@$pair);
361 # no value
362 next unless defined $mvalue && $mvalue ne "";
363 # we have already indexed this
364 next if defined ($specified_fields->{$mfield});
365 # check fields here, maybe others dont want - change to use dontindex!!
366 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
367 next if ($mfield =~ /^gsdl/);
368
369 if (defined $self->{'fieldnamemap'}->{$mfield}) {
370 $shortname = $self->{'fieldnamemap'}->{$mfield};
371 } else {
372 $shortname = $self->create_shortname($mfield);
373 $self->{'fieldnamemap'}->{$mfield} = $shortname;
374 $self->{'fieldnamemap'}->{$shortname} = 1;
375 }
376 $self->{'allindexfields'}->{$mfield} = 1;
377 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
378 if (!defined $self->{'extraindexfields'}->{$mfield}) {
379 $self->{'extraindexfields'}->{$mfield} = 1;
380 }
381
382 }
383 # filter the text
384 $new_text = $self->filter_text ("metadata", $new_text);
385
386 $self->{'num_processed_bytes'} += length ($new_text);
387 $text .= "$new_text";
388
389
390 }
391
392 $text .= "$sectionendtag";
393 $section = $doc_obj->get_next_section($section);
394 } # while defined section
395 print $handle "$text\n$documentendtag";
396 #print STDERR "***********\n$text\n***************\n";
397
398}
399
400#chooses the first two letters or digits for the shortname
401#now ignores non-letdig characters
402sub create_shortname {
403 my $self = shift(@_);
404
405 my ($realname) = @_;
406 my @realnamelist = split(",", $realname);
407 map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces
408 my ($singlename) = $realnamelist[0];
409
410 # try our predefined static mapping
411 my $name;
412 if (defined ($name = $static_indexfield_map{$singlename})) {
413 if (! defined $self->{'fieldnamemap'}->{$name}) {
414 # has this shortname already been used??
415 return $static_indexfield_map{$singlename};
416 }
417 }
418 # we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters.
419 $realname = join ("", @realnamelist);
420 #try the first two chars
421 my $shortname;
422 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
423 $shortname = "$1$2";
424 } else {
425 # there aren't two letdig's in the field - try arbitrary combinations
426 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
427 $shortname = "AB";
428 }
429 $shortname =~ tr/a-z/A-Z/;
430
431 #if already used, take the first and third letdigs and so on
432 my $count = 1;
433 while (defined $self->{'fieldnamemap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
434 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
435 $shortname = "$1$3";
436 $count++;
437 $shortname =~ tr/a-z/A-Z/;
438
439 }
440 else {
441 #remove up to and incl the first letdig
442 $realname =~ s/^[^\w]*\w//;
443 $count = 0;
444 }
445 }
446
447 return $shortname;
448}
449
4501;
451
Note: See TracBrowser for help on using the repository browser.