source: trunk/gsdl/perllib/lucenebuildproc.pm@ 12401

Last change on this file since 12401 was 12371, checked in by mdewsnip, 18 years ago

If sections_index_document_metadata is on, top level sections no longer inherit metadata from themselves (!?).

  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35
# Set up inheritance at compile time: lucenebuildproc derives from
# mgppbuildproc, so any method not overridden in this package is
# looked up there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
39
40
# new -- constructor.
#
# Every argument after the class name is handed unchanged to the
# mgppbuildproc constructor.  The object that constructor returns is then
# re-blessed into $class, so method calls on it reach the overrides in
# this package first.
#
# Returns the re-blessed object.
sub new {
    my $class = shift @_;

    # Use an explicit class-method call rather than the indirect-object
    # form ("new mgppbuildproc (...)"): how the indirect form is parsed
    # depends on what perl has seen at compile time, which is fragile in a
    # package that defines its own "new".
    my $self = mgppbuildproc->new(@_);

    return bless $self, $class;
}
47
48
# is_incremental_capable -- report whether this build processor can build
# incrementally.  Always returns 1: unlike MG and MGPP, Lucene supports
# incremental building.
sub is_incremental_capable
{
    my ($self) = @_;

    return 1;
}
56
57
# preprocess_text -- run the inherited preprocess_text and then strip
# everything entity-like from its result.
#
# Arguments ($text, $strip_html, $para) are passed straight through to the
# superclass method; only the first value it returns is used.
#
# Returns the text with "&name;" entities (1 to 10 word characters between
# the "&" and the ";") deleted, and with every remaining "&" deleted.
sub preprocess_text {
    my ($self, $text, $strip_html, $para) = @_;

    # let the mgpp implementation do its work first
    my ($cleaned) = $self->SUPER::preprocess_text($text, $strip_html, $para);

    for ($cleaned) {
        s/&\w{1,10};//g;    # drop named entities
        s/&//g;             # drop any ampersand that is left
    }

    return $cleaned;
}
72
73
# text -- write one document to the stream that Lucene will index.
#
# Arguments:
#   $doc_obj - the document object; it is only read, through its get_*
#              accessor methods
#   $file    - copied verbatim into the file="..." attribute of the
#              document tag (it is not escaped)
#
# Documents whose doc type is not "indexed_doc" are ignored entirely.
# For any other document the following is built and printed to
# $self->{'output_handle'}:
#
#   <DOC xmlns:gs2=... file=... gs2:id=...>
#     <SEC gs2:id=...>                 (only if the section level is on)
#       <SHORTNAME index="1">...</SHORTNAME>   (one per indexed value)
#     </SEC>
#   </DOC>
#
# where DOC, SEC and the paragraph tag come from %mgppbuildproc::level_map
# and SHORTNAME comes from $self->{'indexfieldmap'} (entries are created on
# demand with create_shortname).
#
# Side effects on $self: num_docs, num_sections, num_bytes and
# num_processed_bytes are advanced; indexfieldmap and indexfields may gain
# entries.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    # The document id is either the running document count or the number
    # the first section of this document is about to receive, depending on
    # the level the gdbm database is keyed on.
    my $docid = "";
    if ($ldoc_level) {
        my $doc_sec_num;
        if ($self->{'gdbm_level'} eq 'document') {
            $doc_sec_num = $self->{'num_docs'};
        }
        else {
            # default is section level
            $doc_sec_num = $self->{'num_sections'} + 1;
        }
        $docid = "gs2:id=\"$doc_sec_num\"";
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my $sectiontag = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    # Return the short tag name for an index field, creating and recording
    # it (in both directions) the first time the field is seen.
    my $shortname_for = sub {
        my ($name) = @_;
        my $known = $self->{'indexfieldmap'}->{$name};
        return $known if (defined $known);

        my $created = $self->create_shortname($name);
        $self->{'indexfieldmap'}->{$name} = $created;
        $self->{'indexfieldmap'}->{$created} = 1;
        return $created;
    };

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # get the text for this document
    # NOTE(review): get_top_section() is asked once and reused below; this
    # assumes it is a plain accessor whose answer does not change while the
    # document is being walked.
    my $top_section = $doc_obj->get_top_section();
    my $section = $top_section;
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with gdbm db
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (!$indexed_doc || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";
            my $tmp_text = "";

            # we get allfields by default - do nothing
            if ($real_field eq "allfields") {

            }
            # metadata - output all metadata we know about except gsdl stuff
            elsif ($real_field eq "metadata") {
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    # check fields here, maybe others don't want - change to use dontindex!!
                    next if ($mfield eq "Identifier"
                             || $mfield =~ /^gsdl/
                             || $mfield eq "classifytype"
                             || $mfield eq "assocfilepath"
                             || !defined $mvalue
                             || $mvalue eq "");

                    my $shortname = $shortname_for->($mfield);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                # (the whole list shares one shortname)
                my $shortname = $shortname_for->($real_field);

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            $tmp_text .= $doc_obj->get_text ($section);
                            if ($parastarttag ne "") {
                                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "");
                            }
                            $new_text .= "$tmp_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and don't add Paragraph tags - never retrieve paras at the moment
                            $tmp_text .= $doc_obj->get_text ($section);
                            # the return value is ignored, so htmlsafe is presumably
                            # expected to rewrite its argument in place - confirm in ghtml
                            &ghtml::htmlsafe($tmp_text);
                            $new_text .= $tmp_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # Sections below the top one may also pick up the
                        # document-level values of this metadata field.
                        if ($section ne $top_section && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($top_section, $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }

                # remove entities
                $new_text =~ s/&\w{1,10};//g;
                # remove &
                $new_text =~ s/&//g;
            }

            # filter the text
            # ($new_text itself is passed, not a copy, so an overriding
            # filter_text can still edit it through @_)
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
264
2651;
266
Note: See TracBrowser for help on using the repository browser.