source: trunk/gsdl/perllib/mgbuildproc.pm@ 12371

Last change on this file since 12371 was 12371, checked in by mdewsnip, 18 years ago

If sections_index_document_metadata is on, top level sections no longer inherit metadata from themselves (!?).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.9 KB
RevLine 
[537]1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[17]26# This document processor outputs a document
27# for mg to process
[4]28
29
30package mgbuildproc;
31
[9919]32use basebuildproc;
[3767]33
# Set up inheritance at compile time: mgbuildproc derives from basebuildproc.
BEGIN {
    @mgbuildproc::ISA = qw(basebuildproc);
}
37
# Constructor.
#
#   my $proc = mgbuildproc->new(@args);
#
# All arguments after the class name are forwarded unchanged to the
# basebuildproc constructor; the object it returns is then re-blessed
# into $class so that the methods defined in this package are found first.
sub new {
    my $class = shift @_;

    # Use explicit method-call syntax. The indirect-object form
    # ("new basebuildproc (@_)") relies on parser heuristics and can be
    # misparsed as a plain function call when a sub called "new" is in scope.
    my $self = basebuildproc->new(@_);

    return bless $self, $class;
}
43
44
# Mark paragraph starts in a piece of text.
#
#   $self->find_paragraphs($text);
#
# Inserts a \cC character in front of every "<p" that is followed by a word
# boundary (matched case-insensitively). The text is modified IN PLACE
# through the @_ alias, so the caller must pass a variable, not a constant.
# Returns the result of the substitution (number of insertions, or false
# when nothing matched).
sub find_paragraphs {
    # $_[0] is the object, $_[1] is the caller's text variable (aliased).
    return $_[1] =~ s{(<p\b)}{\cC$1}gi;
}
48
# Post-processing hook for field text.
#
#   $self->filter_text($field, $new_text);
#
# This implementation intentionally does nothing and returns nothing.
# A particular collection may override the method to post-process
# certain fields, depending on the field name or on whether the text
# is being output for indexing.
sub filter_text {
    return;
}
56
# Output the text of one document for mg to process.
#
#   $self->text($doc_obj);
#
# Documents whose doc type is not "indexed_doc" are ignored completely.
# Otherwise every section of the document is visited and, for each field
# named in $self->{'index'} (format "level:field,field,..."), the field's
# text is appended to the output followed by \cC. Each section is closed
# with \cB; for "document" level indexes the \cB characters are collected
# separately and written after all of the document's text.
#
# Fields prefixed with "top" are only output for the first section.
# "all" and "topall" are shorthand for Title,Creator,text (and their
# "top" variants).
#
# Statistics updated on $self: num_docs, num_sections, num_bytes and
# num_processed_bytes. The result is printed to $self->{'output_handle'}.
sub text {
    my $self = shift (@_);
    my ($doc_obj) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    my ($level, $fields) = split (/:/, $self->{'index'});
    # An index specification without a ':' (or an empty one) leaves these
    # undefined; default them so the comparisons, substitutions and split
    # below never operate on undef.
    $level  = "" unless defined $level;
    $fields = "" unless defined $fields;
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;

    my $doc_section = 0; # just for this document
    my $text = "";
    my $text_extra = "";

    # The top section is the same for the whole document, so look it up once
    # instead of once per section and field.
    my $top_section = $doc_obj->get_top_section();

    # get the text for this document
    my $section = $top_section;
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        # a section without a gsdldoctype counts as an indexed section
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            foreach my $field (split (/,/, $fields)) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
                    my $new_text = "";
                    if ($level eq "dummy") {
                        # a dummy index is a special case used when no
                        # indexes are specified (since there must always be
                        # at least one index or we can't retrieve the
                        # compressed text) - we add a small amount of text
                        # to these dummy indexes which will never be seen
                        # but will overcome mg's problems with building
                        # empty indexes
                        $new_text = "this is dummy text to stop mg barfing";
                        $self->{'num_processed_bytes'} += length ($new_text);

                    } elsif ($real_field eq "text") {
                        $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
                        $self->{'num_processed_bytes'} += length ($new_text);
                        # strip the separator characters from the content
                        # before paragraph markers are inserted
                        $new_text =~ s/[\cB\cC]//g;
                        $self->find_paragraphs($new_text);

                    } else {
                        my $first = 1;
                        # copy the list so the clean-up below does not touch
                        # the array returned by the document object
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};

                        # Optionally let non-top sections of a section-level
                        # index also pick up the top section's metadata.
                        my $inherit = $self->{'sections_index_document_metadata'};
                        if ($level eq "section" && $section ne $top_section && $self->{'indexing_text'} && defined ($inherit)) {
                            if ($inherit eq "always" || (scalar(@section_metadata) == 0 && $inherit eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $real_field)});
                            }
                        }

                        # multiple values of one field are separated by \cC
                        foreach my $meta (@section_metadata) {
                            $meta =~ s/[\cB\cC]//g;
                            $self->{'num_processed_bytes'} += length ($meta);
                            $new_text .= "\cC" unless $first;
                            $new_text .= $meta if $self->{'store_text'};
                            $first = 0;
                        }
                    }

                    # filter the text (the hook receives the original field
                    # name, including any "top" prefix)
                    $self->filter_text ($field, $new_text);

                    $text .= "$new_text\cC";
                }
            }
        }

        # end of section: for document level the markers are held back
        # until all of the document's text has been output
        if ($level eq "document") { $text_extra .= "\cB"; }
        else { $text .= "\cB"; }

        $section = $doc_obj->get_next_section($section);
    }

    print $handle "$text$text_extra";
}
146
1471;
148
Note: See TracBrowser for help on using the repository browser.