source: main/trunk/greenstone2/perllib/mgppbuildproc.pm@ 33369

Last change on this file since 33369 was 33369, checked in by kjdon, 5 years ago

instead of create_shortname, now have get_or_create_shortname. this does the working of looking in fieldnamemap to see if a shortname has already been defined, and saving the new shortnames into the map. get_or_create_sortfield_shortname does the same thing with sortfieldnamemap

  • Property svn:keywords set to Author Date Id Revision
File size: 14.4 KB
RevLine 
[14912]1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
[17110]33use cnseg;
34
[14912]35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39BEGIN {
40 @mgppbuildproc::ISA = ('basebuildproc');
41}
42
43#this must be the same as in mgppbuilder
44our %level_map = ('document'=>'Doc',
45 'section'=>'Sec',
46 'paragraph'=>'Para');
47
[17564]48# change this so a user can add their own ones in via a file or cfg
49#add AND, OR, NOT NEAR to this list - these cannot be used as field names
50#also add the level names (Doc, Sec, Para)
51our %static_indexfield_map = ('Title'=>'TI',
52 'TI'=>1,
53 'Subject'=>'SU',
54 'SU'=>1,
55 'Creator'=>'CR',
56 'CR'=>1,
57 'Organization'=>'ORG',
58 'ORG'=>1,
59 'Source'=>'SO',
60 'SO'=>1,
61 'Howto'=>'HT',
62 'HT'=>1,
63 'ItemTitle'=>'IT',
64 'IT'=>1,
65 'ProgNumber'=>'PN',
66 'PN'=>1,
67 'People'=>'PE',
68 'PE'=>1,
69 'Coverage'=>'CO',
70 'CO'=>1,
71 'allfields'=>'ZZ',
72 'ZZ'=>1,
73 'text'=>'TX',
74 'TX'=>1,
[33302]75 'GPSMapOverlayLabel'=> 'ML',
76 'ML'=>1,
[33144]77 'Coordinate'=>'CD',
78 'CD'=>1,
79 'CoordShort'=>'CS',
80 'CS'=>1,
81 'Latitude'=>'LT',
82 'LT'=>1,
83 'Longitude'=>'LO',
84 'LO'=>1,
85 'LatShort'=>'LA',
86 'LA'=>1,
87 'LngShort'=>'LN',
88 'LN'=>1,
[17564]89 'AND'=>1,
90 'OR'=>1,
91 'NOT'=>1,
92 'NEAR'=>1,
93 'Doc'=>1,
94 'Sec'=>1,
95 'Para'=>1);
96
97
[14912]98sub new {
99 my $class = shift @_;
100 my $self = new basebuildproc (@_);
101
102 # use a different index specification to the default
103 $self->{'index'} = "text";
104
105 $self->{'dontindex'} = {};
[27328]106 $self->{'allindexfields'} = {}; # list of all actually indexed fields
107 $self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified.
108 $self->{'fieldnamemap'} = {'allfields'=>'ZZ',
109 'ZZ'=>1,
110 'text'=>'TX',
111 'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not.
[14912]112 $self->{'strip_html'}=1;
113
114 return bless $self, $class;
115}
116
117sub set_levels {
118 my $self = shift (@_);
119 my ($levels) = @_;
120
121 $self->{'levels'} = $levels;
122}
123
124sub set_strip_html {
125 my $self = shift (@_);
126 my ($strip) = @_;
127 $self->{'strip_html'}=$strip;
128}
129
130#sub find_paragraphs {
131# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
132#}
133
134sub remove_gtlt {
135 my $self =shift(@_);
136 my ($text, $para) = @_;
137 $text =~s/[<>]//g;
138 return "$para$text$para";
139}
140
141sub process_tags {
142 my $self = shift(@_);
143 my ($text, $para) = @_;
144 if ($text =~ /^p\b/i) {
145 return $para;
146 }
147 return "";
148}
149
150sub preprocess_text {
151 my $self = shift (@_);
152 my ($text, $strip_html, $para) = @_;
153 # at this stage, we do not do paragraph tags unless have strip_html -
154 # it will result in a huge mess of non-xml
155 return unless $strip_html;
156
157 my $new_text = $text;
158
159 # if we have <pre> tags, we can have < > inside them, need to delete
160 # the <> before stripping tags
161 $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
162
163 if ($para eq "") {
164 # just remove all tags
165 $new_text =~ s/<[^>]*>//gs;
166 } else {
167 # strip all tags except <p> tags which get turned into $para
168 $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
169
170 }
171 return $new_text;
172}
173#this function strips the html tags from the doc if ($strip_html) and
174# if ($para) replaces <p> with <Paragraph> tags.
175# if both are false, the original text is returned
176#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
177#these tags
178sub preprocess_text_old_and_slow {
179 my $self = shift (@_);
180 my ($text, $strip_html, $para) = @_;
181 my ($outtext) = "";
182 if ($strip_html) {
183 while ($text =~ /<([^>]*)>/ && $text ne "") {
184
185 my $tag = $1;
186 $outtext .= $`." "; #add everything before the matched tag
187 $text = $'; #'everything after the matched tag
188 if ($para && $tag =~ /^\s*p\s/i) {
189 $outtext .= $para;
190 }
191 elsif ($tag =~ /^pre$/) { # a pre tag
192 $text =~ /<\/pre>/; # find the closing pre tag
193 my $tmp_text = $`; #everything before the closing pre tag
194 $text = $'; #'everything after the </pre>
195 $tmp_text =~ s/[<>]//g; # remove all < and >
196 $outtext.= $tmp_text . " ";
197 }
198 }
199
200 $outtext .= $text; # add any remaining text
201 return $outtext;
202 } #if strip_html
203
204 #if ($para) {
205 #$text =~ s/(<p\b)/$para$1/gi;
206 #return $text;
207 # }
208 return $text;
209}
210
211sub text {
212 my $self = shift (@_);
213 my ($doc_obj) = @_;
214 my $handle = $self->{'output_handle'};
215 my $outhandle = $self->{'outhandle'};
216
217 # only output this document if it is one to be indexed
218 return if ($doc_obj->get_doc_type() ne "indexed_doc");
219
220 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
221
222 # this is another document
223 $self->{'num_docs'} += 1;
224
225 # get the parameters for the output
226 # split on : just in case there is subcoll and lang stuff
227 my ($fields) = split (/:/, $self->{'index'});
228
229 # we always do text and index on Doc and Sec levels
230 my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
231 my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
232 my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
233 my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
234
235 my ($paratag) = "";
236
237 # paragraph tags will only be used for indexing (can't retrieve
238 # paragraphs), and can ony be used if we are stripping HTML tags
239 if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
240 if ($self->{'strip_html'}) {
241 $paratag = "<". $level_map{'paragraph'} . ">";
242 } else {
243 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
244 }
245 }
246
247 my $doc_section = 0; # just for this document
248
249 my $text = $documenttag;
250
251 # get the text for this document
252 my $section = $doc_obj->get_top_section();
253
254 while (defined $section) {
255 # update a few statistics
256 $doc_section++;
257 $self->{'num_sections'} += 1;
258 $text .= "$sectiontag";
259
260 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
261 if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
262 # we are not actually indexing anything for this document,
263 # but we want to keep the section numbers the same, so we just
264 # output section tags for each section (which is done above)
265 $text .= "$sectionendtag";
266 $section = $doc_obj->get_next_section($section);
267 next;
268 }
269
270 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
[17567]271
272 # has the user added a 'metadata' index?
273 my $all_metadata_specified = 0;
274 # which fields have already been indexed? (same as fields, but in a map)
275 my $specified_fields = {};
[14912]276 foreach my $field (split (/;/, $fields)) {
277 # only deal with this field if it doesn't start with top or
278 # this is the first section
279 my $real_field = $field;
280 next if (($real_field =~ s/^top//) && ($doc_section != 1));
281
282 my $new_text = "";
283
[27328]284 # we get allfields by default
285 next if ($real_field eq "allfields");
[14912]286
287 # metadata - output all metadata we know about except gsdl stuff
[17564]288 # each metadata is in a separate index field
[17567]289 if ($real_field eq "metadata") {
290 # we will process this later, so we are not reindexing metadata already indexed
291 $all_metadata_specified = 1;
[27328]292 next;
[14912]293 }
[17567]294
[14912]295 #individual metadata and or text specified - could be
296 # a comma separated list
[17567]297 $specified_fields->{$real_field} = 1;
[14912]298 my $shortname="";
[17564]299
[33369]300 $shortname = $self->get_or_create_shortname($real_field);
301
302 my @metadata_list = (); # put any meta values in here
[17564]303 my $section_text = ""; # put any text in here
[14912]304 foreach my $submeta (split /,/, $real_field) {
[17564]305 if ($submeta eq "text") {
306 # no point in indexing text more than once
307 if ($section_text eq "") {
308 $section_text = $doc_obj->get_text($section);
309 if ($self->{'indexing_text'}) {
310 if ($paratag ne "") {
311 # we fiddle around with splitting text into paragraphs
312 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
313 }
314 else {
315 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
316 }
[14912]317 }
318 }
319 }
320 else {
[24404]321 $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
[17564]322 # its a metadata element
[14912]323 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
324 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
325 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
326 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
327 }
328 }
329 push (@metadata_list, @section_metadata);
330 }
[17564]331 } # for each field in index
332
333
334 # now we add the text and/or the metadata into new_text
335 if ($section_text ne "" || scalar(@metadata_list)) {
[17797]336 if ($self->{'indexing_text'}) {
337 # only add tags in if indexing
338 $new_text .= "$paratag<$shortname>";
339 }
[17564]340 if ($section_text ne "") {
341 $new_text .= "$section_text ";
[17797]342 if ($self->{'indexing_text'} && $paratag ne "" && scalar(@metadata_list)) {
[17564]343 $new_text .= "</$shortname>$paratag<$shortname>";
344 }
345 }
346 foreach my $item (@metadata_list) {
347 $new_text .= "$item ";
348 }
[17797]349 if ($self->{'indexing_text'}) {
350 # only add tags in if indexing
351 $new_text .= "</$shortname>";
[27328]352 $self->{'allindexfields'}->{$real_field} = 1;
[17797]353 }
[14912]354 }
[17564]355
[14912]356 # filter the text
[17110]357 $new_text = $self->filter_text ($field, $new_text);
[14912]358
359 $self->{'num_processed_bytes'} += length ($new_text);
360 $text .= "$new_text";
361 } # foreach field
[17567]362
363 if ($all_metadata_specified) {
364 my $new_text = "";
365 my $shortname = "";
366 my $metadata = $doc_obj->get_all_metadata ($section);
367 foreach my $pair (@$metadata) {
368 my ($mfield, $mvalue) = (@$pair);
369 # no value
370 next unless defined $mvalue && $mvalue ne "";
371 # we have already indexed this
372 next if defined ($specified_fields->{$mfield});
373 # check fields here, maybe others dont want - change to use dontindex!!
374 next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
375 next if ($mfield =~ /^gsdl/);
376
[33369]377 $shortname = $self->get_or_create_shortname($mfield);
[27328]378 $self->{'allindexfields'}->{$mfield} = 1;
[17567]379 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
[27328]380 if (!defined $self->{'extraindexfields'}->{$mfield}) {
381 $self->{'extraindexfields'}->{$mfield} = 1;
[17567]382 }
383
384 }
385 # filter the text
386 $new_text = $self->filter_text ("metadata", $new_text);
387
388 $self->{'num_processed_bytes'} += length ($new_text);
389 $text .= "$new_text";
390
391
392 }
393
[14912]394 $text .= "$sectionendtag";
395 $section = $doc_obj->get_next_section($section);
396 } # while defined section
397 print $handle "$text\n$documentendtag";
[17567]398 #print STDERR "***********\n$text\n***************\n";
[14912]399
400}
401
402#chooses the first two letters or digits for the shortname
403#now ignores non-letdig characters
[33369]404sub get_or_create_shortname {
[14912]405 my $self = shift(@_);
406
407 my ($realname) = @_;
[33369]408 if (defined $self->{'fieldnamemap'}->{$realname}) {
409 return $self->{'fieldnamemap'}->{$realname};
410 }
411
[27328]412 my @realnamelist = split(",", $realname);
413 map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces
414 my ($singlename) = $realnamelist[0];
415
[17564]416 # try our predefined static mapping
[14912]417 my $shortname;
[33369]418 if (defined ($shortname = $static_indexfield_map{$singlename}) && (! defined $self->{'fieldnamemap'}->{$shortname})) {
419 # we have a shortname from the static list, and it hasn't already been used: we can use the shortname
[14912]420 } else {
[33369]421 # we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters.
422 $realname = join ("", @realnamelist);
423 #try the first two chars
424 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
425 $shortname = "$1$2";
426 } else {
427 # there aren't two letdig's in the field - try arbitrary combinations
428 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
429 $shortname = "AB";
430 }
431 $shortname =~ tr/a-z/A-Z/;
[14912]432
[33369]433 #if already used, take the first and third letdigs and so on
434 my $count = 1;
435 while (defined $self->{'fieldnamemap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
436 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
437 $shortname = "$1$3";
438 $count++;
439 $shortname =~ tr/a-z/A-Z/;
440
441 }
442 else {
443 #remove up to and incl the first letdig
444 $realname =~ s/^[^\w]*\w//;
445 $count = 0;
446 }
[14912]447 }
448 }
[33369]449 # we now have a new shortname.
450 # store it in fieldnamemap
451 $self->{'fieldnamemap'}->{$realname} = $shortname;
452 $self->{'fieldnamemap'}->{$shortname} = 1;
[14912]453
454 return $shortname;
455}
456
4571;
458
Note: See TracBrowser for help on using the repository browser.