source: trunk/gsdl/perllib/lucenebuildproc.pm@ 12581

Last change on this file since 12581 was 12426, checked in by mdewsnip, 18 years ago

Deleted the code for removing entities, since it seemed to be negatively helpful (and done twice in many situations). When compressing the text, htmlsafe is called on the section text, so the XML will be valid in this case. When indexing the text, the HTML tags are stripped out ('strip_html' is always set for Lucene), so there is no problem in this case either.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.3 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use the same basic XML structure set up by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
# Set up inheritance at compile time: lucenebuildproc is a subclass of
# mgppbuildproc, so any method not defined in this package is looked up
# there.
BEGIN {
    @lucenebuildproc::ISA = ('mgppbuildproc');
}
42
43
# Constructor.
#
# Builds the object with the mgppbuildproc constructor, forwarding every
# argument after the class name unchanged, and then re-blesses the result
# into the class this constructor was invoked on.
#
# Returns the blessed object.
sub new {
    my $class = shift @_;

    # Explicit class-method call rather than the indirect object form
    # ("new mgppbuildproc (...)"), which Perl can mis-parse.
    my $self = mgppbuildproc->new(@_);

    return bless $self, $class;
}
50
51
# Report whether this build processor supports incremental building.
#
# Always returns 1: unlike MG and MGPP, Lucene supports incremental
# building.  The invocant is not consulted.
sub is_incremental_capable {
    return 1;
}
59
60
# Write the XML representation of one document to the output handle so
# that Lucene can index it.
#
# Arguments:
#   $doc_obj - the document object; queried for its type, sections,
#              section text and metadata
#   $file    - value written verbatim into the file="..." attribute of
#              the document tag
#
# Nothing is written unless the document's type is "indexed_doc".
#
# Side effects on $self:
#   - increments num_docs (once) and num_sections (once per section)
#   - adds to num_bytes and num_processed_bytes
#   - records newly seen fields in indexfieldmap / indexfields
#   - prints the generated XML to $self->{'output_handle'}; a warning goes
#     to $self->{'outhandle'} if paragraph level is requested without
#     strip_html
sub text {
    my ($self, $doc_obj, $file) = @_;
    my $handle    = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns     = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels     = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    my $docid = "";
    if ($ldoc_level) {
        # the id is the document count when gdbm_level is 'document';
        # otherwise (the default, section level) it is the number of the
        # document's first section
        my $doc_sec_num = ($self->{'gdbm_level'} eq 'document')
            ? $self->{'num_docs'}
            : $self->{'num_sections'} + 1;
        $docid = "gs2:id=\"$doc_sec_num\"";
    }
    my $documenttag    = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my $sectiontag = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag   = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            my $para_level = $mgppbuildproc::level_map{'paragraph'};
            $parastarttag = "<$para_level>";
            $paraendtag   = "</$para_level>";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    # Return the short name for an index field.  A field not yet present
    # in indexfieldmap gets a name from create_shortname(); both the
    # field => shortname mapping and a shortname => 1 marker are stored.
    my $shortname_for = sub {
        my ($name) = @_;

        my $known = $self->{'indexfieldmap'}->{$name};
        return $known if (defined $known);

        my $created = $self->create_shortname($name);
        $self->{'indexfieldmap'}->{$name}    = $created;
        $self->{'indexfieldmap'}->{$created} = 1;
        return $created;
    };

    my $doc_section = 0; # just for this document

    my $text = $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"" . $self->{'num_sections'} . "\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with gdbm db
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (!$indexed_doc || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            if ($real_field eq "allfields") {
                # we get allfields by default - do nothing
            }
            elsif ($real_field eq "metadata") {
                # metadata - output all metadata we know about except gsdl stuff
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);

                    # check fields here, maybe others dont want - change to use dontindex!!
                    next if ($mfield eq "Identifier"
                             || $mfield =~ /^gsdl/
                             || $mfield eq "classifytype"
                             || $mfield eq "assocfilepath");
                    next if (!defined $mvalue || $mvalue eq "");

                    my $shortname = $shortname_for->($mfield);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a
                # comma separated list; the whole list shares one short name
                my $shortname = $shortname_for->($real_field);

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";

                            # when paragraph indexing is on, hand preprocess_text
                            # the close/reopen tag sequence; otherwise an empty
                            # string, since we don't want to individually tag
                            # each paragraph if not doing para indexing
                            my $para_break = ($parastarttag ne "")
                                ? "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">"
                                : "";
                            $section_text = $self->preprocess_text($section_text, 1, $para_break);
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and don't add Paragraph tags - never retrieve paras at the moment
                            # NOTE(review): htmlsafe's return value is unused, so it
                            # presumably escapes $section_text in place - confirm in ghtml.pm
                            ghtml::htmlsafe($section_text);
                            $new_text .= $section_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};

                        # for non-top sections, optionally add the top section's
                        # values for this field, as selected by the
                        # sections_index_document_metadata setting
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }

            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= $new_text;
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
245
2461;
247
Note: See TracBrowser for help on using the repository browser.