source: gsdl/trunk/perllib/mgbuildproc.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.6 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29package mgbuildproc;
30
31
32use basebuildproc;
33use strict;
34
35
36BEGIN {
37 @mgbuildproc::ISA = ('basebuildproc');
38}
39
40sub new {
41 my $class = shift @_;
42 my $self = new basebuildproc (@_);
43 return bless $self, $class;
44}
45
46
47sub find_paragraphs {
48 $_[1] =~ s/(<p\b)/\cC$1/gi;
49}
50
51sub text {
52 my $self = shift (@_);
53 my ($doc_obj) = @_;
54 my $handle = $self->{'output_handle'};
55
56 # only output this document if it is one to be indexed
57 return if ($doc_obj->get_doc_type() ne "indexed_doc");
58
59 # see if this document belongs to this subcollection
60 my $indexed_doc = $self->is_subcollection_doc($doc_obj);
61
62 # this is another document
63 $self->{'num_docs'} += 1;
64
65 # get the parameters for the output
66 my ($level, $fields) = split (/:/, $self->{'index'});
67 $fields =~ s/\ball\b/Title,Creator,text/;
68 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
69
70 my $doc_section = 0; # just for this document
71 my $text = "";
72 my $text_extra = "";
73
74 # get the text for this document
75 my $section = $doc_obj->get_top_section();
76 while (defined $section) {
77 # update a few statistics
78 $doc_section++;
79 $self->{'num_sections'} += 1;
80
81 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
82 if (($indexed_doc) && ($indexed_section eq "indexed_section" || $indexed_section eq "indexed_doc")) {
83 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
84 foreach my $field (split (/,/, $fields)) {
85 # only deal with this field if it doesn't start with top or
86 # this is the first section
87 my $real_field = $field;
88 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
89 my $new_text = "";
90 if ($level eq "dummy") {
91 # a dummy index is a special case used when no
92 # indexes are specified (since there must always be
93 # at least one index or we can't retrieve the
94 # compressed text) - we add a small amount of text
95 # to these dummy indexes which will never be seen
96 # but will overcome mg's problems with building
97 # empty indexes
98 $new_text = "this is dummy text to stop mg barfing";
99 $self->{'num_processed_bytes'} += length ($new_text);
100
101 } elsif ($real_field eq "text") {
102 $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
103 $self->{'num_processed_bytes'} += length ($new_text);
104 $new_text =~ s/[\cB\cC]//g;
105 $self->find_paragraphs($new_text);
106
107 } else {
108 my $first = 1;
109 my @section_metadata = @{$doc_obj->get_metadata ($section, $real_field)};
110 if ($level eq "section" && $section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
111 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
112 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $real_field)});
113 }
114 }
115 foreach my $meta (@section_metadata) {
116 $meta =~ s/[\cB\cC]//g;
117 $self->{'num_processed_bytes'} += length ($meta);
118 $new_text .= "\cC" unless $first;
119 $new_text .= $meta if $self->{'store_text'};
120 $first = 0;
121 }
122 }
123
124 # filter the text
125 $new_text = $self->filter_text ($field, $new_text);
126
127 $text .= "$new_text\cC";
128 }
129 }
130 }
131
132 if ($level eq "document") { $text_extra .= "\cB"; }
133 else { $text .= "\cB"; }
134
135 $section = $doc_obj->get_next_section($section);
136 }
137
138 print $handle "$text$text_extra";
139}
140
1411;
142
Note: See TracBrowser for help on using the repository browser.