source: gsdl/tags/gsdl-2_62-distribution/gsdl/perllib/mgppbuildproc.pm@ 14162

Last change on this file since 14162 was 10474, checked in by kjdon, 19 years ago

implemented sections_index_document_metadata

  • Property svn:keywords set to Author Date Id Revision
File size: 10.1 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33
34BEGIN {
35 @mgppbuildproc::ISA = ('basebuildproc');
36}
37
38#this must be the same as in mgppbuilder
39our %level_map = ('document'=>'Doc',
40 'section'=>'Sec',
41 'paragraph'=>'Para');
42
43sub new {
44 my $class = shift @_;
45 my $self = new basebuildproc (@_);
46
47 # use a different index specification to the default
48 $self->{'index'} = "text";
49
50 $self->{'dontindex'} = {};
51 $self->{'indexfieldmap'} = {};
52 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
53 $self->{'strip_html'}=1;
54
55 return bless $self, $class;
56}
57
58
59sub set_indexfieldmap {
60 my $self = shift (@_);
61 my ($indexmap) = @_;
62
63 $self->{'indexfieldmap'} = $indexmap;
64}
65
66sub get_indexfieldmap {
67 my $self = shift (@_);
68
69 return $self->{'indexfieldmap'};
70}
71
72sub set_levels {
73 my $self = shift (@_);
74 my ($levels) = @_;
75
76 $self->{'levels'} = $levels;
77}
78
79sub set_strip_html {
80 my $self = shift (@_);
81 my ($strip) = @_;
82 $self->{'strip_html'}=$strip;
83}
84
85
86sub get_gdbm_level {
87 my $self = shift (@_);
88
89 #if a Section level index is not built, the gdbm file should be at doc
90 #level not Section
91 if ($self->{'levels'}->{'section'}) {
92 return "section";
93 }
94 return "document";
95}
96
97
98#sub find_paragraphs {
99# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
100#}
101
102#this function strips the html tags from the doc if ($strip_html) and
103# if ($para) replaces <p> with <Paragraph> tags.
104# if both are false, the original text is returned
105#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
106#these tags
107sub preprocess_text {
108 my $self = shift (@_);
109 my ($text, $strip_html, $para) = @_;
110 my ($outtext) = "";
111 if ($strip_html) {
112 while ($text =~ /<([^>]*)>/ && $text ne "") {
113
114 my $tag = $1;
115 $outtext .= $`." "; #add everything before the matched tag
116 $text = $'; #'everything after the matched tag
117 if ($para && $tag =~ /^\s*p\s/i) {
118 $outtext .= $para;
119 }
120 elsif ($tag =~ /^pre$/) { # a pre tag
121 $text =~ /<\/pre>/; # find the closing pre tag
122 my $tmp_text = $`; #everything before the closing pre tag
123 $text = $'; #'everything after the </pre>
124 $tmp_text =~ s/[<>]//g; # remove all < and >
125 $outtext.= $tmp_text . " ";
126 }
127 }
128
129 $outtext .= $text; # add any remaining text
130 return $outtext;
131 } #if strip_html
132
133 #if ($para) {
134 #$text =~ s/(<p\b)/$para$1/gi;
135 #return $text;
136 # }
137 return $text;
138}
139
140
141
142sub filter_text {
143 # $self->filter_text ($field, $new_text);
144 # don't want to do anything for this version, however,
145 # in a particular collection you might want to override
146 # this method to post-process certain fields depending on
147 # the field, or whether we are outputting it for indexing
148}
149
150sub text {
151 my $self = shift (@_);
152 my ($doc_obj) = @_;
153 my $handle = $self->{'output_handle'};
154 my $outhandle = $self->{'outhandle'};
155
156 # only output this document if it is one to be indexed
157 return if ($doc_obj->get_doc_type() ne "indexed_doc");
158
159 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
160
161 # this is another document
162 $self->{'num_docs'} += 1;
163
164 # get the parameters for the output
165 # split on : just in case there is subcoll and lang stuff
166 my ($fields) = split (/:/, $self->{'index'});
167
168 my ($documenttag) = "";
169 my($documentendtag) = "";
170 if ($self->{'levels'}->{'document'}) {
171 $documenttag = "\n<". $level_map{'document'} . ">\n";
172 $documentendtag = "\n</". $level_map{'document'} . ">\n";
173 }
174 my ($sectiontag) = "";
175 if ($self->{'levels'}->{'section'}) {
176 $sectiontag = "\n<". $level_map{'section'} . ">\n";
177 }
178 my ($paratag) = "";
179
180 if ($self->{'levels'}->{'paragraph'}) {
181 if ($self->{'strip_html'}) {
182 $paratag = "<". $level_map{'paragraph'} . ">";
183 } else {
184 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
185 }
186 }
187
188 my $doc_section = 0; # just for this document
189
190 my $text = $documenttag;
191
192 # get the text for this document
193 my $section = $doc_obj->get_top_section();
194 while (defined $section) {
195 # update a few statistics
196 $doc_section++;
197 $self->{'num_sections'} += 1;
198 $text .= "$sectiontag";
199
200 if ($indexed_doc) {
201 if ($self->{'indexing_text'}) {
202 $text .= "$paratag"; # only add para tags for indexing
203 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
204 }
205 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
206 foreach my $field (split (/,/, $fields)) {
207 # only deal with this field if it doesn't start with top or
208 # this is the first section
209 my $real_field = $field;
210 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
211 my $new_text = "";
212 my $tmp_text = "";
213 if ($real_field eq "text") {
214 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
215 $new_text .= "$paratag<TX>\n";
216 $tmp_text .= $doc_obj->get_text ($section);
217 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
218
219 $new_text .= "$tmp_text</TX>\n";
220 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
221 #$self->{'indexfields'}->{'TextOnly'} = 1;
222 #}
223 }
224 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
225 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
226 }
227 } else { # metadata field
228 if ($real_field eq "allfields") { #ignore
229 }
230 elsif ($real_field eq "metadata") { # insert all metadata
231 #except gsdl stuff
232 my $shortname = "";
233 my $metadata = $doc_obj->get_all_metadata ($section);
234 foreach my $pair (@$metadata) {
235 my ($mfield, $mvalue) = (@$pair);
236 # check fields here, maybe others dont want - change to use dontindex!!
237 if ($mfield ne "Identifier"
238 && $mfield !~ /^gsdl/
239 && $mfield ne "classifytype"
240 && $mfield ne "assocfilepath"
241 && defined $mvalue && $mvalue ne "") {
242
243 if (defined $self->{'indexfieldmap'}->{$mfield}) {
244 $shortname = $self->{'indexfieldmap'}->{$mfield};
245 }
246 else {
247 $shortname = $self->create_shortname($mfield);
248 $self->{'indexfieldmap'}->{$mfield} = $shortname;
249 $self->{'indexfieldmap'}->{$shortname} = 1;
250 }
251 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
252 if (!defined $self->{'indexfields'}->{$mfield}) {
253 $self->{'indexfields'}->{$mfield} = 1;
254 }
255 }
256 }
257
258 }
259 else { #individual metadata specified
260 my $shortname="";
261 #if (!defined $self->{'indexfields'}->{$real_field}) {
262 #$self->{'indexfields'}->{$real_field} = 1;
263 #}
264 if (defined $self->{'indexfieldmap'}->{$real_field}) {
265 $shortname = $self->{'indexfieldmap'}->{$real_field};
266 }
267 else {
268 $shortname = $self->create_shortname($real_field);
269 $self->{'indexfieldmap'}->{$real_field} = $shortname;
270 $self->{'indexfieldmap'}->{$shortname} = 1;
271 }
272 my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
273 if ($self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
274 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
275 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
276 }
277 }
278 foreach my $item (@section_metadata) {
279 #foreach my $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
280 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
281 }
282 }
283
284 }
285
286 # filter the text
287 $self->filter_text ($field, $new_text);
288
289 $self->{'num_processed_bytes'} += length ($new_text);
290 $text .= "$new_text";
291 }
292 }
293 } # if (indexed_doc)
294
295 $section = $doc_obj->get_next_section($section);
296 } #while defined section
297 print $handle "$text\n$documentendtag";
298
299}
300
301#chooses the first two letters or digits for the shortname
302#now ignores non-letdig characters
303sub create_shortname {
304 my $self = shift(@_);
305
306 my ($realname) = @_;
307 #take the first two chars
308 my $shortname;
309 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
310 $shortname = "$1$2";
311 } else {
312 # there aren't two letdig's in the field - try arbitrary combinations
313 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
314 $shortname = "AB";
315 }
316 $shortname =~ tr/a-z/A-Z/;
317
318 #if already used, take the first and third letdigs and so on
319 my $count = 1;
320 while (defined $self->{'indexfieldmap'}->{$shortname}) {
321 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
322 $shortname = "$1$3";
323 $count++;
324 $shortname =~ tr/a-z/A-Z/;
325
326 }
327 else {
328 #remove up to and incl the first letdig
329 $realname =~ s/^[^\w]*\w//;
330 $count = 0;
331 }
332 }
333
334 return $shortname;
335}
336
3371;
338
Note: See TracBrowser for help on using the repository browser.