root/gsdl/trunk/perllib/mgppbuildproc.pm @ 17110

Revision 17110, 11.4 KB (checked in by kjdon, 11 years ago)

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# mgppbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mgpp to process
28
29
30package mgppbuildproc;
31
32use basebuildproc;
33use cnseg;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39BEGIN {
40    @mgppbuildproc::ISA = ('basebuildproc');
41}
42
43#this must be the same as in mgppbuilder
44our %level_map = ('document'=>'Doc',
45          'section'=>'Sec',
46          'paragraph'=>'Para');
47
48sub new {
49    my $class = shift @_;
50    my $self = new basebuildproc (@_);
51
52    # use a different index specification to the default
53    $self->{'index'} = "text";
54
55    $self->{'dontindex'} = {};
56    $self->{'indexfieldmap'} = {};
57    $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
58    $self->{'strip_html'}=1;
59   
60    return bless $self, $class;
61}
62
63
64sub set_indexfieldmap {
65    my $self = shift (@_);
66    my ($indexmap) = @_;
67
68    $self->{'indexfieldmap'} = $indexmap;
69}
70
71sub get_indexfieldmap {
72    my $self = shift (@_);
73
74    return $self->{'indexfieldmap'};
75}
76
77sub set_levels {
78    my $self = shift (@_);
79    my ($levels) = @_;
80
81    $self->{'levels'} = $levels;
82}
83
84sub set_strip_html {
85    my $self = shift (@_);
86    my ($strip) = @_;
87    $self->{'strip_html'}=$strip;
88}
89
90#sub find_paragraphs {
91#    $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
92#}
93
94sub remove_gtlt {
95    my $self =shift(@_);
96    my ($text, $para) = @_;
97    $text =~s/[<>]//g;
98    return "$para$text$para";
99}
100
101sub process_tags {
102    my $self = shift(@_);
103    my ($text, $para) = @_;
104    if ($text =~ /^p\b/i) {
105    return $para;
106    }
107    return "";
108}
109
110sub preprocess_text {
111    my $self = shift (@_);
112    my ($text, $strip_html, $para) = @_;
113    # at this stage, we do not do paragraph tags unless have strip_html -
114    # it will result in a huge mess of non-xml
115    return unless $strip_html;
116   
117    my $new_text = $text;
118   
119    # if we have <pre> tags, we can have < > inside them, need to delete
120    # the <> before stripping tags
121    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
122   
123    if ($para eq "") {
124    # just remove all tags
125    $new_text =~ s/<[^>]*>//gs;
126    } else {
127    # strip all tags except <p> tags which get turned into $para
128    $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
129   
130    }
131    return $new_text;
132}
133#this function strips the html tags from the doc if ($strip_html) and
134# if ($para) replaces <p> with <Paragraph> tags.
135# if both are false, the original text is returned
136#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
137#these tags
138sub preprocess_text_old_and_slow {
139    my $self = shift (@_);
140    my ($text, $strip_html, $para) = @_;
141    my ($outtext) = "";
142    if ($strip_html) {
143    while ($text =~ /<([^>]*)>/ && $text ne "") {
144       
145        my $tag = $1;
146        $outtext .= $`." "; #add everything before the matched tag
147        $text = $'; #'everything after the matched tag
148        if ($para && $tag =~ /^\s*p\s/i) {
149        $outtext .= $para;
150        }
151        elsif ($tag =~ /^pre$/) { # a pre tag
152        $text =~ /<\/pre>/; # find the closing pre tag
153        my $tmp_text = $`; #everything before the closing pre tag
154        $text = $'; #'everything after the </pre>
155        $tmp_text =~ s/[<>]//g; # remove all < and >
156        $outtext.= $tmp_text . " ";
157        }
158    }
159   
160    $outtext .= $text; # add any remaining text
161    return $outtext;
162    } #if strip_html
163
164    #if ($para) {
165    #$text =~ s/(<p\b)/$para$1/gi;
166    #return $text;
167   # }
168    return $text;
169}
170   
171sub text {
172    my $self = shift (@_);
173    my ($doc_obj) = @_;
174    my $handle = $self->{'output_handle'};
175    my $outhandle = $self->{'outhandle'};
176
177    # only output this document if it is one to be indexed
178    return if ($doc_obj->get_doc_type() ne "indexed_doc");
179   
180    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
181   
182    # this is another document
183    $self->{'num_docs'} += 1;
184
185    # get the parameters for the output
186    # split on : just in case there is subcoll and lang stuff
187    my ($fields) = split (/:/, $self->{'index'});
188
189    # we always do text and index on Doc and Sec levels
190    my ($documenttag) = "\n<". $level_map{'document'} . ">\n";
191    my ($documentendtag) = "\n</". $level_map{'document'} . ">\n";
192    my ($sectiontag) = "\n<". $level_map{'section'} . ">\n";
193    my ($sectionendtag) = "\n</". $level_map{'section'} . ">\n";
194
195    my ($paratag) = "";
196   
197    # paragraph tags will only be used for indexing (can't retrieve
198    # paragraphs), and can ony be used if we are stripping HTML tags
199    if ($self->{'indexing_text'} && $self->{'levels'}->{'paragraph'}) {
200    if ($self->{'strip_html'}) {
201        $paratag = "<". $level_map{'paragraph'} . ">";
202    } else {
203        print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
204    }
205    }
206
207    my $doc_section = 0; # just for this document
208   
209    my $text = $documenttag;
210   
211    # get the text for this document
212    my $section = $doc_obj->get_top_section();
213   
214    while (defined $section) {
215    # update a few statistics
216    $doc_section++;
217    $self->{'num_sections'} += 1;
218    $text .= "$sectiontag";
219   
220    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
221    if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
222        # we are not actually indexing anything for this document,
223        # but we want to keep the section numbers the same, so we just
224        # output section tags for each section (which is done above)
225        $text .= "$sectionendtag";
226        $section = $doc_obj->get_next_section($section);
227        next;
228    }
229   
230    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
231    foreach my $field (split (/;/, $fields)) {
232        # only deal with this field if it doesn't start with top or
233        # this is the first section
234        my $real_field = $field;
235        next if (($real_field =~ s/^top//) && ($doc_section != 1));
236       
237        my $new_text = "";
238
239        # we get allfields by default - do nothing
240        if ($real_field eq "allfields") {
241   
242        }
243       
244        # metadata - output all metadata we know about except gsdl stuff
245        elsif ($real_field eq "metadata") {
246        my $shortname = "";
247        my $metadata = $doc_obj->get_all_metadata ($section);
248        foreach my $pair (@$metadata) {
249            my ($mfield, $mvalue) = (@$pair);
250            # check fields here, maybe others dont want - change to use dontindex!!
251            if ($mfield ne "Identifier"
252            && $mfield !~ /^gsdl/
253            && $mfield ne "classifytype"
254            && $mfield ne "assocfilepath"
255            && defined $mvalue && $mvalue ne "") {
256           
257            if (defined $self->{'indexfieldmap'}->{$mfield}) {
258                $shortname = $self->{'indexfieldmap'}->{$mfield};
259            }
260            else {
261                $shortname = $self->create_shortname($mfield);
262                $self->{'indexfieldmap'}->{$mfield} = $shortname;
263                $self->{'indexfieldmap'}->{$shortname} = 1;
264            }     
265            $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
266            if (!defined $self->{'indexfields'}->{$mfield}) {
267                $self->{'indexfields'}->{$mfield} = 1;
268            }                   
269            }
270        }
271        }
272        else {
273        #individual metadata and or text specified - could be
274        # a comma separated list
275        my $shortname="";
276        if (defined $self->{'indexfieldmap'}->{$real_field}) {
277            $shortname = $self->{'indexfieldmap'}->{$real_field};
278        }
279        else {
280            $shortname = $self->create_shortname($real_field);
281            $self->{'indexfieldmap'}->{$real_field} = $shortname;
282            $self->{'indexfieldmap'}->{$shortname} = 1;
283        }
284        my @metadata_list = ();
285        foreach my $submeta (split /,/, $real_field) {
286            if ($submeta eq "text") {
287            my $section_text = $doc_obj->get_text($section);
288            if ($self->{'indexing_text'}) {
289                # tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
290                $new_text .= "$paratag<$shortname>\n";
291                if ($paratag ne "") {
292                $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
293                }
294                else {
295                $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
296                }
297                $new_text .= "$section_text</$shortname>\n";
298            }
299            else {
300                            # leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
301                $new_text .= $section_text;
302            }
303            }
304            else {
305            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
306            if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
307                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
308                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
309                }
310            }
311            push (@metadata_list, @section_metadata);
312            }
313        }
314        foreach my $item (@metadata_list) {
315            $new_text .= "$paratag<$shortname>$item</$shortname>\n";
316        }
317        }
318       
319        # filter the text
320        $new_text = $self->filter_text ($field, $new_text);
321       
322        $self->{'num_processed_bytes'} += length ($new_text);
323        $text .= "$new_text";
324    } # foreach field
325   
326    $text .= "$sectionendtag";
327    $section = $doc_obj->get_next_section($section);
328    } # while defined section
329    print $handle "$text\n$documentendtag";
330   
331}
332
333#chooses the first two letters or digits for the shortname
334#now ignores non-letdig characters
335sub create_shortname {
336    my $self = shift(@_);
337   
338    my ($realname) = @_;
339    #take the first two chars
340    my $shortname;
341    if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
342    $shortname = "$1$2";
343    } else {
344    # there aren't two letdig's in the field - try arbitrary combinations
345    $realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
346    $shortname = "AB";
347    }
348    $shortname =~ tr/a-z/A-Z/;
349
350    #if already used, take the first and third letdigs and so on
351    my $count = 1;
352    while (defined $self->{'indexfieldmap'}->{$shortname}) {
353    if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
354        $shortname = "$1$3";
355        $count++;
356        $shortname =~ tr/a-z/A-Z/;
357   
358    }
359    else {
360        #remove up to and incl the first letdig
361        $realname =~ s/^[^\w]*\w//;
362        $count = 0;
363    }
364    }
365
366    return $shortname;
367}
368
3691;
370
Note: See TracBrowser for help on using the browser.