root/gsdl/trunk/perllib/mgbuildproc.pm @ 15738

Revision 15738, 4.9 KB (checked in by mdewsnip, 12 years ago)

Added "use strict".

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29package mgbuildproc;
30
31
32use basebuildproc;
33use strict;
34
35
36BEGIN {
37    @mgbuildproc::ISA = ('basebuildproc');
38}
39
40sub new {
41    my $class = shift @_;
42    my $self = new basebuildproc (@_);
43    return bless $self, $class;
44}
45
46
47sub find_paragraphs {
48    $_[1] =~ s/(<p\b)/\cC$1/gi;
49}
50
51sub filter_text {
52    # $self->filter_text ($field, $new_text);
53    # don't want to do anything for this version, however,
54    # in a particular collection you might want to override
55    # this method to post-process certain fields depending on
56    # the field, or whether we are outputting it for indexing
57}
58
59sub text {
60    my $self = shift (@_);
61    my ($doc_obj) = @_;
62    my $handle = $self->{'output_handle'};
63   
64    # only output this document if it is one to be indexed
65    return if ($doc_obj->get_doc_type() ne "indexed_doc");
66   
67    # see if this document belongs to this subcollection
68    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
69
70    # this is another document
71    $self->{'num_docs'} += 1;
72
73    # get the parameters for the output
74    my ($level, $fields) = split (/:/, $self->{'index'});
75    $fields =~ s/\ball\b/Title,Creator,text/;
76    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
77
78    my $doc_section = 0; # just for this document
79    my $text = "";
80    my $text_extra = "";
81
82    # get the text for this document
83    my $section = $doc_obj->get_top_section();
84    while (defined $section) {
85    # update a few statistics
86    $doc_section++;
87    $self->{'num_sections'} += 1;
88
89    my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
90    if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
91        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
92        foreach my $field (split (/,/, $fields)) {
93        # only deal with this field if it doesn't start with top or
94        # this is the first section
95        my $real_field = $field;
96        if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
97            my $new_text = "";
98            if ($level eq "dummy") {
99            # a dummy index is a special case used when no
100            # indexes are specified (since there must always be
101            # at least one index or we can't retrieve the
102            # compressed text) - we add a small amount of text
103            # to these dummy indexes which will never be seen
104            # but will overcome mg's problems with building
105            # empty indexes
106            $new_text = "this is dummy text to stop mg barfing";
107            $self->{'num_processed_bytes'} += length ($new_text);
108
109            } elsif ($real_field eq "text") {
110            $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
111            $self->{'num_processed_bytes'} += length ($new_text);
112            $new_text =~ s/[\cB\cC]//g;
113            $self->find_paragraphs($new_text);
114           
115            } else {
116            my $first = 1;
117            my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
118            if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
119                if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
120                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
121                }
122            }
123            foreach my $meta (@section_metadata) {
124                $meta =~ s/[\cB\cC]//g;
125                $self->{'num_processed_bytes'} += length ($meta);
126                $new_text .= "\cC" unless $first;
127                $new_text .= $meta if $self->{'store_text'};
128                $first = 0;
129            }
130            }
131           
132            # filter the text
133            $self->filter_text ($field, $new_text);
134
135            $text .= "$new_text\cC";
136        }
137        }
138    }
139   
140    if ($level eq "document") { $text_extra .= "\cB"; }
141    else { $text .= "\cB"; }
142   
143    $section = $doc_obj->get_next_section($section);
144    }
145
146    print $handle "$text$text_extra";
147}
148
1491;
150
Note: See TracBrowser for help on using the browser.