source: trunk/gsdl/perllib/mgppbuildproc.pm@ 10113

Last change on this file since 10113 was 9919, checked in by kjdon, 19 years ago

made a base buildproc class, and shifted most of the buildproc code into it. mainly the subclasses just need to implement the text method

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
RevLine 
[932]1###########################################################################
2#
[1852]3# mgppbuildproc.pm --
[932]4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
[1852]27# for mgpp to process
[932]28
29
30package mgppbuildproc;
31
[9919]32use basebuildproc;
[3767]33
[932]34
35BEGIN {
[9919]36 @mgppbuildproc::ISA = ('basebuildproc');
[932]37}
38
[4811]39#this must be the same as in mgppbuilder
[9157]40our %level_map = ('document'=>'Doc',
[9919]41 'section'=>'Sec',
42 'paragraph'=>'Para');
[932]43
44sub new {
[9919]45 my $class = shift @_;
46 my $self = new basebuildproc (@_);
[932]47
[9919]48 # use a different index specification to the default
[932]49 $self->{'index'} = "text";
[2771]50
[1852]51 $self->{'dontindex'} = {};
52 $self->{'indexfieldmap'} = {};
[4769]53 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
[1852]54 $self->{'strip_html'}=1;
[9919]55
[932]56 return bless $self, $class;
57}
58
59
[1852]60sub set_indexfieldmap {
61 my $self = shift (@_);
62 my ($indexmap) = @_;
63
64 $self->{'indexfieldmap'} = $indexmap;
65}
66
67sub get_indexfieldmap {
68 my $self = shift (@_);
69
70 return $self->{'indexfieldmap'};
71}
72
73sub set_levels {
74 my $self = shift (@_);
75 my ($levels) = @_;
76
77 $self->{'levels'} = $levels;
78}
79
80sub set_strip_html {
81 my $self = shift (@_);
82 my ($strip) = @_;
83 $self->{'strip_html'}=$strip;
84}
85
[932]86
[9919]87sub get_gdbm_level {
[932]88 my $self = shift (@_);
89
[1852]90 #if a Section level index is not built, the gdbm file should be at doc
91 #level not Section
[4811]92 if ($self->{'levels'}->{'section'}) {
[9919]93 return "section";
[1852]94 }
[9919]95 return "document";
96}
[1852]97
[932]98
[4769]99#sub find_paragraphs {
100# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
101#}
[932]102
[1852]103#this function strips the html tags from the doc if ($strip_html) and
104# if ($para) replaces <p> with <Paragraph> tags.
105# if both are false, the original text is returned
106#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
107#these tags
108sub preprocess_text {
109 my $self = shift (@_);
110 my ($text, $strip_html, $para) = @_;
111 my ($outtext) = "";
112 if ($strip_html) {
113 while ($text =~ /<([^>]*)>/ && $text ne "") {
114
[8716]115 my $tag = $1;
[1852]116 $outtext .= $`." "; #add everything before the matched tag
[9669]117 $text = $'; #'everything after the matched tag
[4769]118 if ($para && $tag =~ /^\s*p\s/i) {
119 $outtext .= $para;
[1852]120 }
121 elsif ($tag =~ /^pre$/) { # a pre tag
122 $text =~ /<\/pre>/; # find the closing pre tag
123 my $tmp_text = $`; #everything before the closing pre tag
[9669]124 $text = $'; #'everything after the </pre>
[1852]125 $tmp_text =~ s/[<>]//g; # remove all < and >
126 $outtext.= $tmp_text . " ";
127 }
128 }
129
130 $outtext .= $text; # add any remaining text
131 return $outtext;
132 } #if strip_html
133
[4769]134 #if ($para) {
135 #$text =~ s/(<p\b)/$para$1/gi;
136 #return $text;
137 # }
[1852]138 return $text;
139}
140
141
142
[932]143sub filter_text {
144 # $self->filter_text ($field, $new_text);
145 # don't want to do anything for this version, however,
146 # in a particular collection you might want to override
147 # this method to post-process certain fields depending on
148 # the field, or whether we are outputting it for indexing
149}
150
151sub text {
152 my $self = shift (@_);
153 my ($doc_obj) = @_;
154 my $handle = $self->{'output_handle'};
[4769]155 my $outhandle = $self->{'outhandle'};
[932]156
157 # only output this document if it is one to be indexed
158 return if ($doc_obj->get_doc_type() ne "indexed_doc");
159
[9919]160 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
161
[932]162 # this is another document
163 $self->{'num_docs'} += 1;
164
165 # get the parameters for the output
[5642]166 # split on : just in case there is subcoll and lang stuff
167 my ($fields) = split (/:/, $self->{'index'});
[932]168
[4769]169 my ($documenttag) = "";
170 my($documentendtag) = "";
[4811]171 if ($self->{'levels'}->{'document'}) {
[7090]172 $documenttag = "\n<". $level_map{'document'} . ">\n";
173 $documentendtag = "\n</". $level_map{'document'} . ">\n";
[4811]174 }
[1852]175 my ($sectiontag) = "";
[4811]176 if ($self->{'levels'}->{'section'}) {
[7090]177 $sectiontag = "\n<". $level_map{'section'} . ">\n";
[1852]178 }
179 my ($paratag) = "";
[9669]180
[4811]181 if ($self->{'levels'}->{'paragraph'}) {
[4769]182 if ($self->{'strip_html'}) {
[7090]183 $paratag = "<". $level_map{'paragraph'} . ">";
[4769]184 } else {
185 print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
186 }
[1852]187 }
[4811]188
[932]189 my $doc_section = 0; # just for this document
[4769]190
191 my $text = $documenttag;
[1917]192
[932]193 # get the text for this document
194 my $section = $doc_obj->get_top_section();
195 while (defined $section) {
196 # update a few statistics
197 $doc_section++;
198 $self->{'num_sections'} += 1;
[4769]199 $text .= "$sectiontag";
[1852]200
[932]201 if ($indexed_doc) {
[4769]202 if ($self->{'indexing_text'}) {
203 $text .= "$paratag"; # only add para tags for indexing
204 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
205 }
[932]206 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
[1917]207 foreach my $field (split (/,/, $fields)) {
[932]208 # only deal with this field if it doesn't start with top or
209 # this is the first section
210 my $real_field = $field;
211 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
212 my $new_text = "";
[1852]213 my $tmp_text = "";
[932]214 if ($real_field eq "text") {
[1852]215 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
[4769]216 $new_text .= "$paratag<TX>\n";
217 $tmp_text .= $doc_obj->get_text ($section);
218 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
219
[1852]220 $new_text .= "$tmp_text</TX>\n";
[4769]221 #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
222 #$self->{'indexfields'}->{'TextOnly'} = 1;
223 #}
[1852]224 }
225 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
[2480]226 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
[1852]227 }
[932]228 } else { # metadata field
[4769]229 if ($real_field eq "allfields") { #ignore
230 }
231 elsif ($real_field eq "metadata") { # insert all metadata
[1852]232 #except gsdl stuff
233 my $shortname = "";
[932]234 my $metadata = $doc_obj->get_all_metadata ($section);
[8716]235 foreach my $pair (@$metadata) {
[932]236 my ($mfield, $mvalue) = (@$pair);
[1852]237 # check fields here, maybe others dont want - change to use dontindex!!
[6546]238 if ($mfield ne "Identifier"
239 && $mfield !~ /^gsdl/
[3226]240 && $mfield ne "classifytype"
241 && $mfield ne "assocfilepath"
242 && defined $mvalue && $mvalue ne "") {
[1852]243
244 if (defined $self->{'indexfieldmap'}->{$mfield}) {
245 $shortname = $self->{'indexfieldmap'}->{$mfield};
246 }
247 else {
248 $shortname = $self->create_shortname($mfield);
249 $self->{'indexfieldmap'}->{$mfield} = $shortname;
250 $self->{'indexfieldmap'}->{$shortname} = 1;
251 }
252 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
253 if (!defined $self->{'indexfields'}->{$mfield}) {
254 $self->{'indexfields'}->{$mfield} = 1;
255 }
[932]256 }
257 }
[1852]258
[932]259 }
260 else { #individual metadata specified
[1852]261 my $shortname="";
[4769]262 #if (!defined $self->{'indexfields'}->{$real_field}) {
263 #$self->{'indexfields'}->{$real_field} = 1;
264 #}
[1852]265 if (defined $self->{'indexfieldmap'}->{$real_field}) {
266 $shortname = $self->{'indexfieldmap'}->{$real_field};
267 }
268 else {
269 $shortname = $self->create_shortname($real_field);
270 $self->{'indexfieldmap'}->{$real_field} = $shortname;
271 $self->{'indexfieldmap'}->{$shortname} = 1;
272 }
[8716]273 foreach my $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
[1852]274 $new_text .= "$paratag<$shortname>$item</$shortname>\n";
[932]275 }
276 }
277
278 }
279
280 # filter the text
281 $self->filter_text ($field, $new_text);
282
[1852]283 $self->{'num_processed_bytes'} += length ($new_text);
[932]284 $text .= "$new_text";
285 }
286 }
287 } # if (indexed_doc)
288
289 $section = $doc_obj->get_next_section($section);
290 } #while defined section
[4769]291 print $handle "$text\n$documentendtag";
292
[932]293}
294
[3195]295#chooses the first two letters or digits for the shortname
296#now ignores non-letdig characters
[1852]297sub create_shortname {
[8716]298 my $self = shift(@_);
[1852]299
300 my ($realname) = @_;
301 #take the first two chars
[3195]302 my $shortname;
303 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
304 $shortname = "$1$2";
305 } else {
306 # there aren't two letdig's in the field - try arbitrary combinations
307 $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
308 $shortname = "AB";
309 }
[1852]310 $shortname =~ tr/a-z/A-Z/;
311
[3195]312 #if already used, take the first and third letdigs and so on
[8716]313 my $count = 1;
[1852]314 while (defined $self->{'indexfieldmap'}->{$shortname}) {
[3195]315 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
316 $shortname = "$1$3";
317 $count++;
318 $shortname =~ tr/a-z/A-Z/;
[1852]319
320 }
321 else {
[3195]322 #remove up to and incl the first letdig
323 $realname =~ s/^[^\w]*\w//;
[1852]324 $count = 0;
325 }
326 }
327
328 return $shortname;
329}
330
[932]3311;
332
Note: See TracBrowser for help on using the repository browser.