source: trunk/gsdl/perllib/mgppbuildproc.pm@ 10113

Last change on this file since 10113 was 9919, checked in by kjdon, 19 years ago

made a base buildproc class, and shifted most of the buildproc code into it. mainly the subclasses just need to implement the text method

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33
34
35BEGIN {
36 @mgppbuildproc::ISA = ('basebuildproc');
37}
38
39#this must be the same as in mgppbuilder
40our %level_map = ('document'=>'Doc',
41 'section'=>'Sec',
42 'paragraph'=>'Para');
43
44sub new {
45 my $class = shift @_;
46 my $self = new basebuildproc (@_);
47
48 # use a different index specification to the default
49 $self->{'index'} = "text";
50
51 $self->{'dontindex'} = {};
52 $self->{'indexfieldmap'} = {};
53 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
54 $self->{'strip_html'}=1;
55
56 return bless $self, $class;
57}
58
59
60sub set_indexfieldmap {
61 my $self = shift (@_);
62 my ($indexmap) = @_;
63
64 $self->{'indexfieldmap'} = $indexmap;
65}
66
67sub get_indexfieldmap {
68 my $self = shift (@_);
69
70 return $self->{'indexfieldmap'};
71}
72
73sub set_levels {
74 my $self = shift (@_);
75 my ($levels) = @_;
76
77 $self->{'levels'} = $levels;
78}
79
80sub set_strip_html {
81 my $self = shift (@_);
82 my ($strip) = @_;
83 $self->{'strip_html'}=$strip;
84}
85
86
87sub get_gdbm_level {
88 my $self = shift (@_);
89
90 #if a Section level index is not built, the gdbm file should be at doc
91 #level not Section
92 if ($self->{'levels'}->{'section'}) {
93 return "section";
94 }
95 return "document";
96}
97
98
99#sub find_paragraphs {
100# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
101#}
102
103#this function strips the html tags from the doc if ($strip_html) and
104# if ($para) replaces <p> with <Paragraph> tags.
105# if both are false, the original text is returned
106#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
107#these tags
108sub preprocess_text {
109 my $self = shift (@_);
110 my ($text, $strip_html, $para) = @_;
111 my ($outtext) = "";
112 if ($strip_html) {
113 while ($text =~ /<([^>]*)>/ && $text ne "") {
114
115 my $tag = $1;
116 $outtext .= $`." "; #add everything before the matched tag
117 $text = $'; #'everything after the matched tag
118 if ($para && $tag =~ /^\s*p\s/i) {
119 $outtext .= $para;
120 }
121 elsif ($tag =~ /^pre$/) { # a pre tag
122 $text =~ /<\/pre>/; # find the closing pre tag
123 my $tmp_text = $`; #everything before the closing pre tag
124 $text = $'; #'everything after the </pre>
125 $tmp_text =~ s/[<>]//g; # remove all < and >
126 $outtext.= $tmp_text . " ";
127 }
128 }
129
130 $outtext .= $text; # add any remaining text
131 return $outtext;
132 } #if strip_html
133
134 #if ($para) {
135 #$text =~ s/(<p\b)/$para$1/gi;
136 #return $text;
137 # }
138 return $text;
139}
140
141
142
143sub filter_text {
144 # $self->filter_text ($field, $new_text);
145 # don't want to do anything for this version, however,
146 # in a particular collection you might want to override
147 # this method to post-process certain fields depending on
148 # the field, or whether we are outputting it for indexing
149}
150
151sub text {
152 my $self = shift (@_);
153 my ($doc_obj) = @_;
154 my $handle = $self->{'output_handle'};
155 my $outhandle = $self->{'outhandle'};
156
157 # only output this document if it is one to be indexed
158 return if ($doc_obj->get_doc_type() ne "indexed_doc");
159
160 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
161
162 # this is another document
163 $self->{'num_docs'} += 1;
164
165 # get the parameters for the output
166 # split on : just in case there is subcoll and lang stuff
167 my ($fields) = split (/:/, $self->{'index'});
168
169 my ($documenttag) = "";
170 my($documentendtag) = "";
171 if ($self->{'levels'}->{'document'}) {
172 $documenttag = "\n<". $level_map{'document'} . ">\n";
173 $documentendtag = "\n</". $level_map{'document'} . ">\n";
174 }
175 my ($sectiontag) = "";
176 if ($self->{'levels'}->{'section'}) {
177 $sectiontag = "\n<". $level_map{'section'} . ">\n";
178 }
179 my ($paratag) = "";
180
181 if ($self->{'levels'}->{'paragraph'}) {
182 if ($self->{'strip_html'}) {
183 $paratag = "<". $level_map{'paragraph'} . ">";
184 } else {
185 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
186 }
187 }
188
189 my $doc_section = 0; # just for this document
190
191 my $text = $documenttag;
192
193 # get the text for this document
194 my $section = $doc_obj->get_top_section();
195 while (defined $section) {
196 # update a few statistics
197 $doc_section++;
198 $self->{'num_sections'} += 1;
199 $text .= "$sectiontag";
200
201 if ($indexed_doc) {
202 if ($self->{'indexing_text'}) {
203 $text .= "$paratag"; # only add para tags for indexing
204 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
205 }
206 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
207 foreach my $field (split (/,/, $fields)) {
208 # only deal with this field if it doesn't start with top or
209 # this is the first section
210 my $real_field = $field;
211 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
212 my $new_text = "";
213 my $tmp_text = "";
214 if ($real_field eq "text") {
215 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
216 $new_text .= "$paratag<TX>\n";
217 $tmp_text .= $doc_obj->get_text ($section);
218 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
219
220 $new_text .= "$tmp_text</TX>\n";
221 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
222 #$self->{'indexfields'}->{'TextOnly'} = 1;
223 #}
224 }
225 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
226 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
227 }
228 } else { # metadata field
229 if ($real_field eq "allfields") { #ignore
230 }
231 elsif ($real_field eq "metadata") { # insert all metadata
232 #except gsdl stuff
233 my $shortname = "";
234 my $metadata = $doc_obj->get_all_metadata ($section);
235 foreach my $pair (@$metadata) {
236 my ($mfield, $mvalue) = (@$pair);
237 # check fields here, maybe others dont want - change to use dontindex!!
238 if ($mfield ne "Identifier"
239 && $mfield !~ /^gsdl/
240 && $mfield ne "classifytype"
241 && $mfield ne "assocfilepath"
242 && defined $mvalue && $mvalue ne "") {
243
244 if (defined $self->{'indexfieldmap'}->{$mfield}) {
245 $shortname = $self->{'indexfieldmap'}->{$mfield};
246 }
247 else {
248 $shortname = $self->create_shortname($mfield);
249 $self->{'indexfieldmap'}->{$mfield} = $shortname;
250 $self->{'indexfieldmap'}->{$shortname} = 1;
251 }
252 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
253 if (!defined $self->{'indexfields'}->{$mfield}) {
254 $self->{'indexfields'}->{$mfield} = 1;
255 }
256 }
257 }
258
259 }
260 else { #individual metadata specified
261 my $shortname="";
262 #if (!defined $self->{'indexfields'}->{$real_field}) {
263 #$self->{'indexfields'}->{$real_field} = 1;
264 #}
265 if (defined $self->{'indexfieldmap'}->{$real_field}) {
266 $shortname = $self->{'indexfieldmap'}->{$real_field};
267 }
268 else {
269 $shortname = $self->create_shortname($real_field);
270 $self->{'indexfieldmap'}->{$real_field} = $shortname;
271 $self->{'indexfieldmap'}->{$shortname} = 1;
272 }
273 foreach my $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
274 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
275 }
276 }
277
278 }
279
280 # filter the text
281 $self->filter_text ($field, $new_text);
282
283 $self->{'num_processed_bytes'} += length ($new_text);
284 $text .= "$new_text";
285 }
286 }
287 } # if (indexed_doc)
288
289 $section = $doc_obj->get_next_section($section);
290 } #while defined section
291 print $handle "$text\n$documentendtag";
292
293}
294
295#chooses the first two letters or digits for the shortname
296#now ignores non-letdig characters
297sub create_shortname {
298 my $self = shift(@_);
299
300 my ($realname) = @_;
301 #take the first two chars
302 my $shortname;
303 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
304 $shortname = "$1$2";
305 } else {
306 # there aren't two letdig's in the field - try arbitrary combinations
307 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
308 $shortname = "AB";
309 }
310 $shortname =~ tr/a-z/A-Z/;
311
312 #if already used, take the first and third letdigs and so on
313 my $count = 1;
314 while (defined $self->{'indexfieldmap'}->{$shortname}) {
315 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
316 $shortname = "$1$3";
317 $count++;
318 $shortname =~ tr/a-z/A-Z/;
319
320 }
321 else {
322 #remove up to and incl the first letdig
323 $realname =~ s/^[^\w]*\w//;
324 $count = 0;
325 }
326 }
327
328 return $shortname;
329}
330
3311;
332
Note: See TracBrowser for help on using the repository browser.