source: main/tags/2.52/gsdl/perllib/lucenebuildproc.pm@ 25422

Last change on this file since 25422 was 8072, checked in by davidb, 20 years ago

Support for building collections with lucene.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34
35
# Establish inheritance at compile time: this package is a subclass of
# mgppbuildproc, so any method not defined here is looked up there.
BEGIN {
    @ISA = qw(mgppbuildproc);
}
39
40
# new($class, @args)
#
# Constructor.  Builds the object with the mgppbuildproc constructor,
# forwarding every remaining argument unchanged, then re-blesses the
# result into $class so that the methods defined in this package are used.
#
# Returns the blessed object.
sub new {
    my $class = shift @_;

    # Explicit class-method call.  The indirect-object form
    # ("new mgppbuildproc (...)") is resolved by parser heuristics and is
    # easy to misparse inside a package that defines its own new().
    my $self = mgppbuildproc->new(@_);

    return bless $self, $class;
}
47
# text($doc_obj, $file)
#
# Writes one document, as a block of XML, to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj - document object; this method calls get_doc_type,
#              get_source_filename, get_top_section, get_next_section,
#              get_metadata_element, get_metadata, get_all_metadata,
#              get_text and get_text_length on it.
#   $file    - written verbatim into the file="..." attribute of the
#              document tag.
#
# Object state read:
#   indexexparr   - list of "field/exp/options" subcollection expressions.
#                   A leading "!" on the field negates the test, the field
#                   name "filename" tests the source filename, and options
#                   "i" makes the match case-insensitive.
#   index         - comma separated field list (anything after the first
#                   ":" is ignored).  "text", "metadata", "allfields" and
#                   a "top" prefix are treated specially.
#   levels, strip_html, indexing_text, store_text
#
# Object state updated:
#   num_docs, num_sections, num_bytes, num_processed_bytes,
#   indexfieldmap (metadata name -> short tag name), indexfields.
#
# Documents whose type is not "indexed_doc" are skipped entirely.  A
# document that fails the subcollection test still gets its document and
# section tags written (and is still counted), but with no content.
sub text {
    my $self = shift (@_);
    my ($doc_obj,$file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};
    my $indexed_doc = 1;

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        if (defined ($field) && defined ($exp)) {
            # the options part is optional ("field/exp"); treat a missing
            # one as "no options" instead of matching against undef
            $options = "" unless defined $options;

            # a leading '!' on the field name negates the test
            my ($bool) = $field =~ /^(.)/;
            $bool = "" unless defined $bool; # empty field name
            $field =~ s/^.// if $bool eq '!';

            # from here on $field holds the value to test, not the name
            if ($field =~ /^filename$/i) {
                $field = $doc_obj->get_source_filename();
            } else {
                $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
            }
            next unless defined $field;
            if ($bool eq '!') {
                if ($options =~ /^i$/i) {
                    if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
                } else {
                    if ($field !~ /$exp/) {$indexed_doc = 1; last;}
                }
            } else {
                if ($options =~ /^i$/i) {
                    if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
                } else {
                    if ($field =~ /$exp/) {$indexed_doc = 1; last;}
                }
            }
        }
    }

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};
    my $lpar_level = $levels->{'paragraph'};

    # when indexing at document level the document tag carries the number
    # that the document's first section is about to receive
    my $docid="";
    if ($ldoc_level) {
        my $doc_sec_num = $self->{'num_sections'}+1;
        $docid = "gs2:id=\"$doc_sec_num\"";
    }

    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    # section tags are only written when section level is enabled
    my ($sectiontag) = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    # paragraph tags are only available when html is being stripped
    my ($paratag) = "";
    if ($lpar_level) {
        if ($self->{'strip_html'}) {
            $paratag = "<". $mgppbuildproc::level_map{'paragraph'} . ">";
        } else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

##    my $text = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'; ## ****
    my $text = "";
    $text .= $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        if ($indexed_doc) {
            if ($self->{'indexing_text'}) {
                $text .= "$paratag"; # only add para tags for indexing
                # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
            }
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            foreach my $field (split (/,/, $fields)) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
                    my $new_text = "";
                    my $tmp_text = "";
                    if ($real_field eq "text") {
                        if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
                            $new_text .= "$paratag<TX index=\"1\">\n";
                            $tmp_text .= $doc_obj->get_text ($section);
                            $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX index=\"1\">");

                            $new_text .= "$tmp_text</TX>\n";
                            #if (!defined $self->{'indexfields'}->{'TextOnly'}) {
                            #$self->{'indexfields'}->{'TextOnly'} = 1;
                            #}
                        }
                        else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
                            $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
                        }
                    } else { # metadata field
                        if ($real_field eq "allfields") { #ignore
                        }
                        elsif ($real_field eq "metadata") { # insert all metadata
                            #except gsdl stuff
                            my $shortname = "";
                            my $metadata = $doc_obj->get_all_metadata ($section);
                            # lexical loop variable: do not touch a package global
                            foreach my $pair (@$metadata) {
                                my ($mfield, $mvalue) = (@$pair);
                                # check fields here, maybe others dont want - change to use dontindex!!
                                if ($mfield ne "Identifier"
                                    && $mfield !~ /^gsdl/
                                    && $mfield ne "classifytype"
                                    && $mfield ne "assocfilepath"
                                    && defined $mvalue && $mvalue ne "") {

                                    # reuse the short tag name for this metadata
                                    # field, or create and remember a new one
                                    if (defined $self->{'indexfieldmap'}->{$mfield}) {
                                        $shortname = $self->{'indexfieldmap'}->{$mfield};
                                    }
                                    else {
                                        $shortname = $self->create_shortname($mfield);
                                        $self->{'indexfieldmap'}->{$mfield} = $shortname;
                                        $self->{'indexfieldmap'}->{$shortname} = 1;
                                    }
                                    $new_text .= "$paratag<$shortname index=\"1\">$mvalue</$shortname>\n";
                                    if (!defined $self->{'indexfields'}->{$mfield}) {
                                        $self->{'indexfields'}->{$mfield} = 1;
                                    }
                                }
                            }

                        }
                        else { #individual metadata specified
                            my $shortname="";
                            #if (!defined $self->{'indexfields'}->{$real_field}) {
                            #$self->{'indexfields'}->{$real_field} = 1;
                            #}
                            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                                $shortname = $self->{'indexfieldmap'}->{$real_field};
                            }
                            else {
                                $shortname = $self->create_shortname($real_field);
                                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                                $self->{'indexfieldmap'}->{$shortname} = 1;
                            }
                            # lexical loop variable: do not touch a package global
                            foreach my $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
                                $new_text .= "$paratag<$shortname index=\"1\">$item</$shortname>\n";
                            }
                        }

                    }

                    # filter the text
                    $self->filter_text ($field, $new_text);

                    $self->{'num_processed_bytes'} += length ($new_text);
                    $text .= "$new_text";
                }
            }
        } # if (indexed_doc)

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } #while defined section
    print $handle "$text\n$documentendtag";

}
241
2421;
243
Note: See TracBrowser for help on using the repository browser.