source: gsdl/trunk/perllib/lucenebuildproc.pm@ 14934

Last change on this file since 14934 was 14934, checked in by davidb, 16 years ago

Changes to allow statistic calculations for metadata coverage, i.e. for each document, which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then aggregated over all the documents and the summary is stored as collection-level metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.2 KB
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Establish the inheritance chain at compile time: lucenebuildproc is a
# specialisation of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = ('mgppbuildproc');
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# passed through untouched), initialises the counter used to number
# incrementally added documents, and re-blesses the result into $class.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Direct method-call syntax: the indirect form ("new mgppbuildproc (...)")
    # can be parsed as a plain function call when a sub called "new" is
    # already known in the current package.
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Capability query: always answers true (1), because unlike MG and MGPP,
# Lucene supports incremental building.
sub is_incremental_capable {
    my ($self) = @_;

    return 1;
}
63
64
# Serialise one document as XML and print it to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj - the document object to output
#   $file    - file name, written into the document tag's file="" attribute
#
# Nothing is written unless the document's type is "indexed_doc".
#
# The fields to output come from $self->{'index'} (the part before the first
# ':'), separated by ';'.  Each field is one of:
#   allfields - section text plus every indexable metadata value, all as <ZZ>
#   metadata  - every indexable metadata value under its own short name
#   otherwise - a comma separated list of metadata names and/or "text", all
#               output under one short name
# A field prefixed with "top" is only output for the first section.
#
# Side effects on $self: num_docs, num_sections, num_bytes and
# num_processed_bytes are updated; indexfields and indexfieldmap gain entries
# for newly seen fields.
sub text {
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;
    my $handle = $self->{'output_handle'};
    my $outhandle = $self->{'outhandle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    my $doc_level = $mgppbuildproc::level_map{'document'};
    my $gs2ns = 'xmlns:gs2="http://www.greenstone.org/gs2"';

    my $levels = $self->{'levels'};
    my $ldoc_level = $levels->{'document'};
    my $lsec_level = $levels->{'section'};

    # The document id is the document count when the database is at
    # document level, otherwise (section level, the default) the number
    # the document's first section is about to receive.
    my $docid = "";
    if ($ldoc_level) {
        my $doc_sec_num;
        if ($self->{'gdbm_level'} eq 'document') {
            $doc_sec_num = $self->{'num_docs'};
        }
        else {
            $doc_sec_num = $self->{'num_sections'} + 1;
        }
        $docid = "gs2:id=\"$doc_sec_num\"";
    }
    my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n";
    my $documentendtag = "\n</$doc_level>\n";

    my $sectiontag = "";
    if ($lsec_level) {
        $sectiontag = $mgppbuildproc::level_map{'section'};
    }

    my $parastarttag = "";
    my $paraendtag = "";
    if ($levels->{'paragraph'}) {
        if ($self->{'strip_html'}) {
            $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
            $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
        }
        else {
            print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
        }
    }

    my $doc_section = 0; # just for this document

    my $text = "";
    $text .= $documenttag;

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        if ($sectiontag ne "") {
            my $secid = "gs2:id=\"".$self->{'num_sections'}."\"";
            $text .= "\n<$sectiontag $secid >\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with gdbm db
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");
            $section = $doc_obj->get_next_section($section);
            next;
        }

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        foreach my $field (split (/;/, $fields)) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            my $new_text = "";

            # If allfields is requested add all metadata fields and text as
            # belonging to the ZZ field
            if ($real_field eq "allfields") {
                # Text first - no html nor paragraph tags
                $new_text .= "$parastarttag<ZZ index=\"1\">\n";
                my $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
                &ghtml::htmlsafe($tmp_text);
                $new_text .= "$tmp_text</ZZ>$paraendtag\n";
                # Then Metadata
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {
                        $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
                    }
                    # every field seen is registered, indexed or not
                    if (!defined $self->{'indexfields'}->{$mfield}) {
                        $self->{'indexfields'}->{$mfield} = 1;
                    }
                }
            }
            # metadata - output all metadata we know about except gsdl stuff
            # ("allfields" has already been dealt with by the branch above)
            elsif ($real_field eq "metadata") {
                my $shortname = "";
                my $metadata = $doc_obj->get_all_metadata ($section);
                foreach my $pair (@$metadata) {
                    my ($mfield, $mvalue) = (@$pair);
                    &ghtml::htmlsafe($mvalue);
                    # check fields here, maybe others dont want - change to use dontindex!!
                    if ($mfield ne "Identifier"
                        && $mfield !~ /^gsdl/
                        && $mfield ne "classifytype"
                        && $mfield ne "assocfilepath"
                        && defined $mvalue && $mvalue ne "") {

                        if (defined $self->{'indexfieldmap'}->{$mfield}) {
                            $shortname = $self->{'indexfieldmap'}->{$mfield};
                        }
                        else {
                            $shortname = $self->create_shortname($mfield);
                            $self->{'indexfieldmap'}->{$mfield} = $shortname;
                            $self->{'indexfieldmap'}->{$shortname} = 1;
                        }
                        $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
                        if (!defined $self->{'indexfields'}->{$mfield}) {
                            $self->{'indexfields'}->{$mfield} = 1;
                        }
                    }
                }
            }
            else {
                # individual metadata and or text specified - could be a comma separated list
                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$real_field}) {
                    $shortname = $self->{'indexfieldmap'}->{$real_field};
                }
                else {
                    $shortname = $self->create_shortname($real_field);
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                my @metadata_list = ();
                foreach my $submeta (split /,/, $real_field) {
                    if ($submeta eq "text") {
                        my $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
                            $new_text .= "$parastarttag<$shortname index=\"1\">\n";
                            if ($parastarttag ne "") {
                                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
                            }
                            else {
                                # we don't want to individually tag each paragraph if not doing para indexing
                                $section_text = $self->preprocess_text($section_text, 1, "");
                            }
                            $new_text .= "$section_text</$shortname>$paraendtag\n";
                        }
                        else {
                            # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
                            # A fresh variable is used each time so that a
                            # repeated "text" entry cannot re-escape and
                            # duplicate text that was already appended.
                            my $escaped_text = $doc_obj->get_text ($section);
                            &ghtml::htmlsafe($escaped_text);
                            $new_text .= $escaped_text;
                        }
                    }
                    else {
                        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                        # Sub-sections may additionally take the value(s) from
                        # the top section, depending on the
                        # sections_index_document_metadata setting.
                        if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                            }
                        }
                        push (@metadata_list, @section_metadata);
                    }
                }
                foreach my $item (@metadata_list) {
                    # Escape the value exactly as the allfields and metadata
                    # branches do; a raw '&' or '<' would otherwise produce
                    # malformed XML.  @metadata_list holds copies, so the
                    # document object itself is not modified.
                    &ghtml::htmlsafe($item);
                    $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
                }
            }

            # filter the text
            $self->filter_text ($field, $new_text);
            $self->{'num_processed_bytes'} += length ($new_text);

            $text .= "$new_text";
        } # foreach field

        $text .= "\n</$sectiontag>\n" if ($sectiontag ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print $handle "$text\n$documentendtag";
    #print STDOUT "$text\n$documentendtag";
}
285
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In "incinfodb" mode the document is given the per-section metadata
#  *  (doctype, hastxt, archivedir, thistype, childtype, contains, docnum)
#  *  and then handed to IncrementalBuildUtils::addDocument. In every other
#  *  mode the call is passed straight through to mgppbuildproc::process.
#  *
#  *  @param $self    A reference to this Lucene builder
#  *  @param $doc_obj A reference to a document object representing what was
#  *                  parsed by the GAPlug
#  *  @param $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() eq "incinfodb")
    {
        print STDERR "*** Processing a document added using INCINFODB ***\n";

        # The archive directory is everything before the last path separator,
        # normalised to forward slashes with no leading or trailing slash.
        my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = "" unless defined $archivedir;
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        # Number of files
        # NOTE(review): get_assoc_files is defined outside this file. If it
        # hands back an array reference, scalar() on it would print the
        # reference rather than a count, so count the elements in that case.
        my $assoc_files = $doc_obj->get_assoc_files();
        my $num_assoc_files = (ref ($assoc_files) eq 'ARRAY') ? scalar (@$assoc_files) : $assoc_files;
        print STDERR "There are " . $num_assoc_files . " associated documents...\n";

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);

        # is this a paged or a hierarchical document
        my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

        # Determine the actual docnum by checking if we've processed any
        # previous incrementally added documents. If so, carry on from there.
        # Otherwise we set the counter to be the same as the number of
        # sections encountered during the previous build
        if ($self->{'numincdocs'} == 0)
        {
            $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
        }

        my $section = $doc_obj->get_top_section ();
        print STDERR "+ top section: '$section'\n";
        # Value currently unused; the call is kept because get_OID is defined
        # outside this file.
        my $doc_OID = $doc_obj->get_OID();
        while (defined $section)
        {
            print STDERR "+ processing section: '$section'\n";
            # Attach all the other metadata to this document
            # output the fact that this document is a document (unless doctype
            # has been set to something else from within a plugin
            my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
            if (!defined $dtype || $dtype !~ /\w/)
            {
                # The missing/blank value itself used to be stored here, which
                # recorded nothing useful; store the default doctype instead.
                $doc_obj->add_utf8_metadata($section, "doctype", "doc");
            }

            # output whether this node contains text
            if ($doc_obj->get_text_length($section) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 1);
            }
            else
            {
                $doc_obj->add_utf8_metadata($section, "hastxt", 0);
            }

            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section())
            {
                $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
                $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
            }

            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0)
            {
                $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
                my @contains = ();
                foreach my $child (@$children)
                {
                    # Each entry is a double quote, a dot, and the child's
                    # final numeric component (or the whole child id when it
                    # has no such component).
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        push (@contains, "\".$1");
                    }
                    else
                    {
                        push (@contains, "\".$child");
                    }
                }
                $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
            }

            #output the matching doc number
            print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
            $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

            $self->{'numincdocs'}++;
            $section = $doc_obj->get_next_section($section);
            # if no sections wanted, only gdbm the docs
            last if ($self->{'gdbm_level'} eq "document");
        }
        print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
        &IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
    }
    else
    {
        $self->mgppbuildproc::process(@_);
    }
}
# /** process() **/
399
400
401# Following methods seem to be no different to those defined in basebuildproc.pm
402# From inspection, it looks like these ones can be removed
403
404
# Accessor: the value currently stored in $self->{'num_docs'}.
sub get_num_docs {
    my ($self) = @_;
    # print STDERR "get_num_docs(): $self->{'num_docs'}\n";
    return $self->{'num_docs'};
}
410
# Accessor: the value currently stored in $self->{'num_sections'}.
sub get_num_sections {
    my ($self) = @_;
    # print STDERR "get_num_sections(): $self->{'num_sections'}\n";
    return $self->{'num_sections'};
}
416
# Accessor for $self->{'num_bytes'}: the actual number of bytes in the
# collection, which is normally the same as what's processed during text
# compression.
sub get_num_bytes {
    my ($self) = @_;
    # print STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
    return $self->{'num_bytes'};
}
424
425
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
# "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# Parameters:
#   $text       - the text to clean up
#   $strip_html - must be true; when false nothing is done and an empty
#                 return (undef in scalar context) is given back
#   $para       - replacement handed to process_tags for paragraph tags, or
#                 "" to turn every tag into a single space
#
# Returns the processed copy of $text; the argument itself is not modified.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A look-ahead is used so that the character after the '&' is not
    # consumed: that way "&&" loses both ampersands and a '&' at the very
    # end of the text is removed as well.
    $new_text =~ s/&(?!\#(?:[0-9]+|x[0-9a-fA-F]+);)/ /g;

    return $new_text;
}
461
462
4631;
464
Note: See TracBrowser for help on using the repository browser.