root/gsdl/trunk/perllib/mgbuildproc.pm @ 17110

Revision 17110, 4.6 KB (checked in by kjdon, 11 years ago)

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29package mgbuildproc;
30
31
32use basebuildproc;
33use strict;
34
35
36BEGIN {
37    @mgbuildproc::ISA = ('basebuildproc');
38}
39
40sub new {
41    my $class = shift @_;
42    my $self = new basebuildproc (@_);
43    return bless $self, $class;
44}
45
46
47sub find_paragraphs {
48    $_[1] =~ s/(<p\b)/\cC$1/gi;
49}
50
51sub text {
52    my $self = shift (@_);
53    my ($doc_obj) = @_;
54    my $handle = $self->{'output_handle'};
55   
56    # only output this document if it is one to be indexed
57    return if ($doc_obj->get_doc_type() ne "indexed_doc");
58   
59    # see if this document belongs to this subcollection
60    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
61
62    # this is another document
63    $self->{'num_docs'} += 1;
64
65    # get the parameters for the output
66    my ($level, $fields) = split (/:/, $self->{'index'});
67    $fields =~ s/\ball\b/Title,Creator,text/;
68    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
69
70    my $doc_section = 0; # just for this document
71    my $text = "";
72    my $text_extra = "";
73
74    # get the text for this document
75    my $section = $doc_obj->get_top_section();
76    while (defined $section) {
77    # update a few statistics
78    $doc_section++;
79    $self->{'num_sections'} += 1;
80
81    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
82    if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
83        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
84        foreach my $field (split (/,/, $fields)) {
85        # only deal with this field if it doesn't start with top or
86        # this is the first section
87        my $real_field = $field;
88        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
89            my $new_text = "";
90            if ($level eq "dummy") {
91            # a dummy index is a special case used when no
92            # indexes are specified (since there must always be
93            # at least one index or we can't retrieve the
94            # compressed text) - we add a small amount of text
95            # to these dummy indexes which will never be seen
96            # but will overcome mg's problems with building
97            # empty indexes
98            $new_text = "this is dummy text to stop mg barfing";
99            $self->{'num_processed_bytes'} += length ($new_text);
100
101            } elsif ($real_field eq "text") {
102            $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
103            $self->{'num_processed_bytes'} += length ($new_text);
104            $new_text =~ s/[\cB\cC]//g;
105            $self->find_paragraphs($new_text);
106           
107            } else {
108            my $first = 1;
109            my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
110            if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
111                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
112                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
113                }
114            }
115            foreach my $meta (@section_metadata) {
116                $meta =~ s/[\cB\cC]//g;
117                $self->{'num_processed_bytes'} += length ($meta);
118                $new_text .= "\cC" unless $first;
119                $new_text .= $meta if $self->{'store_text'};
120                $first = 0;
121            }
122            }
123           
124            # filter the text
125            $new_text = $self->filter_text ($field, $new_text);
126
127            $text .= "$new_text\cC";
128        }
129        }
130    }
131   
132    if ($level eq "document") { $text_extra .= "\cB"; }
133    else { $text .= "\cB"; }
134   
135    $section = $doc_obj->get_next_section($section);
136    }
137
138    print $handle "$text$text_extra";
139}
140
1411;
142
Note: See TracBrowser for help on using the browser.