Context Navigation

source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 27329

Last change on this file since 27329 was 27329, checked in by kjdon, 11 years ago
changes to match mgpp's shortname mapping stuff. Changed some util functions to FileUtil functions
Property svn:keywords set to `Author Date Id Revision`
File size: 17.8 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
41	sub BEGIN {
42	@lucenebuildproc::ISA = ('mgppbuildproc');
43	}
44
45
46	sub new {
47	my $class = shift @_;
48	my $self = new mgppbuildproc (@_);
49
50	$self->{'numincdocs'} = 0;
51
52	return bless $self, $class;
53	}
54
55
56	sub is_incremental_capable
57	{
58	my $self = shift (@_);
59
60	# Unlike MG and MGPP, Lucene supports incremental building
61	return 1;
62	}
63
64
65	sub textedit {
66	my $self = shift (@_);
67	my ($doc_obj,$file,$edit_mode) = @_;
68
69	my $lucenehandle = $self->{'output_handle'};
70	my $outhandle = $self->{'outhandle'};
71
72	# only output this document if it is one to be indexed
73	return if ($doc_obj->get_doc_type() ne "indexed_doc");
74
75	# skip this document if in "compress-text" mode and asked to delete it
76	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
77
78	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
79
80	# this is another document
81	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
82	$self->{'num_docs'} += 1;
83	}
84	else {
85	$self->{'num_docs'} -= 1;
86	}
87
88	# get the parameters for the output
89	# split on : just in case there is subcoll and lang stuff
90	my ($fields) = split (/:/, $self->{'index'});
91
92	my $doc_tag_name = $mgppbuildproc::level_map{'document'};
93
94	my $levels = $self->{'levels'};
95	my $ldoc_level = $levels->{'document'};
96	my $lsec_level = $levels->{'section'};
97
98	my $gs2_docOID = $doc_obj->get_OID();
99	my $documenttag = undef;
100	my $documentendtag = undef;
101
102	$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
103	$documentendtag = "\n</$doc_tag_name>\n";
104
105	my $sec_tag_name = "";
106	if ($lsec_level)
107	{
108	$sec_tag_name = $mgppbuildproc::level_map{'section'};
109	}
110
111	my $doc_section = 0; # just for this document
112
113	my $text = "";
114	$text .= $documenttag;
115	# get the text for this document
116	my $section = $doc_obj->get_top_section();
117	while (defined $section)
118	{
119	# update a few statistics
120	$doc_section++;
121	$self->{'num_sections'}++;
122
123	my $sec_gs2_id = $self->{'num_sections'};
124	my $sec_gs2_docOID = $gs2_docOID;
125	$sec_gs2_docOID .= ".$section" if ($section ne "");
126
127	# if we are doing subcollections, then some docs shouldn't be indexed.
128	# but we need to put the section tag placeholders in there so the
129	# sections match up with database
130	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
131	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
132	if ($sec_tag_name ne "") {
133	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
134	$text .= "\n</$sec_tag_name>\n"
135	}
136	$section = $doc_obj->get_next_section($section);
137	next;
138	}
139
140	if ($sec_tag_name ne "")
141	{
142	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
143	}
144
145	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
146	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
147	}
148	else {
149	# delete
150	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
151	}
152
153
154	# has the user added a 'metadata' index?
155	my $all_metadata_specified = 0;
156	# which fields have already been indexed? (same as fields, but in a map)
157	my $specified_fields = {};
158
159	# do we have an allfields index??
160	my $allfields_index = 0;
161	# collect up all the text for it in here
162	my $allfields_text = "";
163	foreach my $field (split (/;/, $fields)) {
164	if ($field eq "allfields") {
165	$allfields_index = 1;
166	} elsif ($field eq "metadata") {
167	$all_metadata_specified = 1;
168	}
169	}
170
171	foreach my $field (split (/;/, $fields)) {
172
173	# only deal with this field if it doesn't start with top or
174	# this is the first section
175	my $real_field = $field;
176	next if (($real_field =~ s/^top//) && ($doc_section != 1));
177
178	# process these two later
179	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
180
181	#individual metadata and or text specified - could be a comma separated list
182	$specified_fields->{$real_field} = 1;
183	my $shortname="";
184	my $new_field = 0; # have we found a new field name?
185	if (defined $self->{'fieldnamemap'}->{$real_field}) {
186	$shortname = $self->{'fieldnamemap'}->{$real_field};
187	} else {
188	$shortname = $self->create_shortname($real_field);
189	$self->{'fieldnamemap'}->{$real_field} = $shortname;
190	$self->{'fieldnamemap'}->{$shortname} = 1;
191	}
192	my @metadata_list = (); # put any metadata values in here
193	my $section_text = ""; # put the text in here
194	foreach my $submeta (split /,/, $real_field) {
195	if ($submeta eq "text") {
196	# no point in indexing text more than once
197	if ($section_text eq "") {
198	$section_text = $doc_obj->get_text($section);
199	if ($self->{'indexing_text'}) {
200	# we always strip html
201	$section_text = $self->preprocess_text($section_text, 1, "");
202	}
203	else {
204	# leave html stuff in, but escape the tags
205	&ghtml::htmlsafe($section_text);
206	}
207	}
208	}
209	else {
210	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
211
212	# its a metadata element
213	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
214	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
215	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
216	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
217	}
218	}
219	push (@metadata_list, @section_metadata);
220	}
221	} # for each field in this one index
222
223
224	# now we add the text and/or metadata into new_text
225	if ($section_text ne "" \|\| scalar(@metadata_list)) {
226	my $new_text = "";
227
228	if ($section_text ne "") {
229	$new_text .= "$section_text ";
230	}
231
232	foreach my $item (@metadata_list) {
233	&ghtml::htmlsafe($item);
234	$new_text .= "$item ";
235	}
236
237	if ($allfields_index) {
238	$allfields_text .= $new_text;
239	}
240
241	if ($self->{'indexing_text'}) {
242	# add the tag
243	$new_text = "<$shortname index=\"1\">$new_text</$shortname>";
244	$self->{'allindexfields'}->{$real_field} = 1;
245	}
246	# filter the text
247	$new_text = $self->filter_text ($field, $new_text);
248
249	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
250	$self->{'num_processed_bytes'} += length ($new_text);
251	$text .= "$new_text";
252	}
253	else {
254	# delete
255	$self->{'num_processed_bytes'} -= length ($new_text);
256	}
257	}
258
259	} # foreach field
260
261	if ($all_metadata_specified) {
262
263	my $new_text = "";
264	my $shortname = "";
265	my $metadata = $doc_obj->get_all_metadata ($section);
266	foreach my $pair (@$metadata) {
267	my ($mfield, $mvalue) = (@$pair);
268	# no value
269	next unless defined $mvalue && $mvalue ne "";
270	# we have already indexed this
271	next if defined ($specified_fields->{$mfield});
272	# check fields here, maybe others dont want - change to use dontindex!!
273	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
274	next if ($mfield =~ /^gsdl/);
275
276	&ghtml::htmlsafe($mvalue);
277
278	if (defined $self->{'fieldnamemap'}->{$mfield}) {
279	$shortname = $self->{'fieldnamemap'}->{$mfield};
280	}
281	else {
282	$shortname = $self->create_shortname($mfield);
283	$self->{'fieldnamemap'}->{$mfield} = $shortname;
284	$self->{'fieldnamemap'}->{$shortname} = 1;
285	}
286	$self->{'allindexfields'}->{$mfield} = 1;
287	$new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
288	if ($allfields_index) {
289	$allfields_text .= "$mvalue ";
290	}
291
292	if (!defined $self->{'extraindexfields'}->{$mfield}) {
293	$self->{'extraindexfields'}->{$mfield} = 1;
294	}
295
296	}
297	# filter the text
298	$new_text = $self->filter_text ("metadata", $new_text);
299
300	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
301	$self->{'num_processed_bytes'} += length ($new_text);
302	$text .= "$new_text";
303	}
304	else {
305	# delete
306	$self->{'num_processed_bytes'} -= length ($new_text);
307	}
308	}
309
310	if ($allfields_index) {
311
312	my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
313	# filter the text
314	$new_text = $self->filter_text ("allfields", $new_text);
315
316	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
317	$self->{'num_processed_bytes'} += length ($new_text);
318	$text .= "$new_text";
319	}
320	else {
321	# delete
322	$self->{'num_processed_bytes'} -= length ($new_text);
323	}
324	}
325
326	$text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
327
328	$section = $doc_obj->get_next_section($section);
329	} # while defined section
330
331	print $lucenehandle "$text\n$documentendtag";
332
333	## if ($edit_mode eq "delete") {
334	## print STDERR "$text\n$documentendtag";
335	## }
336
337	}
338
339	sub text {
340	my $self = shift (@_);
341	my ($doc_obj,$file) = @_;
342
343	$self->textedit($doc_obj,$file,"add");
344	}
345
346	sub textreindex
347	{
348	my $self = shift (@_);
349	my ($doc_obj,$file) = @_;
350
351	$self->textedit($doc_obj,$file,"update");
352	}
353
354	sub textdelete
355	{
356	my $self = shift (@_);
357	my ($doc_obj,$file) = @_;
358
359	$self->textedit($doc_obj,$file,"delete");
360	}
361
362
363
364
365
366	# /** We make this builder pretend to be a document processor so we can get
367	# * information back from the plugins.
368	# *
369	# * @param $self A reference to this Lucene builder
370	# * @param $doc_obj A reference to a document object representing what was
371	# * parsed by the GAPlug
372	# * @param $file The name of the file parsed as a string
373	# *
374	# * @author John Thompson, DL Consulting Ltd
375	# */
376	sub process()
377	{
378	my $self = shift (@_);
379	my ($doc_obj, $file) = @_;
380
381	# If this is called from any stage other than an incremental infodb we want
382	# to pass through to the superclass of build
383	if ($self->get_mode() eq "incinfodb")
384	{
385	print STDERR "* Processing a document added using INCINFODB *\n" if ($self->{'verbosity'} > 3);
386	my ($archivedir) = $file =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
387	$archivedir = "" unless defined $archivedir;
388	$archivedir =~ s/\\/\//g;
389	$archivedir =~ s/^\/+//;
390	$archivedir =~ s/\/+$//;
391
392	# Number of files
393	print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
394
395	# resolve the final filenames of the files associated with this document
396	$self->assoc_files ($doc_obj, $archivedir);
397
398	# is this a paged or a hierarchical document
399	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
400
401	# Determine the actual docnum by checking if we've processed any
402	# previous incrementally added documents. If so, carry on from there.
403	# Otherwise we set the counter to be the same as the number of
404	# sections encountered during the previous build
405	if ($self->{'numincdocs'} == 0)
406	{
407	$self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
408	}
409
410	my $section = $doc_obj->get_top_section ();
411	print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
412	my $doc_OID = $doc_obj->get_OID();
413	my $url = "";
414	while (defined $section)
415	{
416	print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
417	# Attach all the other metadata to this document
418	# output the fact that this document is a document (unless doctype
419	# has been set to something else from within a plugin
420	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
421	if (!defined $dtype \|\| $dtype !~ /\w/)
422	{
423	#$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
424	$doc_obj->add_utf8_metadata($section, "doctype", "doc");
425	}
426	# output whether this node contains text
427	if ($doc_obj->get_text_length($section) > 0)
428	{
429	$doc_obj->add_utf8_metadata($section, "hastxt", 1);
430	}
431	else
432	{
433	$doc_obj->add_utf8_metadata($section, "hastxt", 0);
434	}
435
436	# output archivedir if at top level
437	if ($section eq $doc_obj->get_top_section())
438	{
439	$doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
440	$doc_obj->add_utf8_metadata($section, "thistype", $thistype);
441	}
442
443	# output a list of children
444	my $children = $doc_obj->get_children ($section);
445	if (scalar(@$children) > 0)
446	{
447	$doc_obj->add_utf8_metadata($section, "childtype", $childtype);
448	my @contains = ();
449	foreach my $child (@$children)
450	{
451	if ($child =~ /^.*?\.(\d+)$/)
452	{
453	push (@contains, "\".$1");
454	}
455	else
456	{
457	push (@contains, "\".$child");
458	}
459	}
460	$doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
461	}
462	#output the matching doc number
463	print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
464	$doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
465
466	$self->{'numincdocs'}++;
467	$section = $doc_obj->get_next_section($section);
468	# if no sections wanted, only add the docs
469	last if ($self->{'db_level'} eq "document");
470	}
471	print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
472	&IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
473	}
474	else
475	{
476	$self->mgppbuildproc::process(@_);
477	}
478	}
479	# / process() /
480
481
482	# Following methods seem to be no different to those defined in basebuildproc.pm
483	# From inspection, it looks like these ones can be removed
484
485
486	sub get_num_docs {
487	my $self = shift (@_);
488	#rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
489	return $self->{'num_docs'};
490	}
491
492	sub get_num_sections {
493	my $self = shift (@_);
494	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
495	return $self->{'num_sections'};
496	}
497
498	# num_bytes is the actual number of bytes in the collection
499	# this is normally the same as what's processed during text compression
500	sub get_num_bytes {
501	my $self = shift (@_);
502	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
503	return $self->{'num_bytes'};
504	}
505
506
507	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
508	# Otherwise the removal of tags below might lead to Lucene turning
509	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
510	# (example from demo collection b20cre)
511	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
512	sub preprocess_text
513	{
514	my $self = shift (@_);
515	my ($text, $strip_html, $para) = @_;
516	# at this stage, we do not do paragraph tags unless have strip_html -
517	# it will result in a huge mess of non-xml
518	return unless $strip_html;
519
520	my $new_text = $text;
521
522	# if we have <pre> tags, we can have < > inside them, need to delete
523	# the <> before stripping tags
524	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
525
526	if ($para eq "") {
527	# just remove all tags
528	$new_text =~ s/<[^>]*>/ /gs;
529	} else {
530	# strip all tags except <p> tags which get turned into $para
531	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
532	}
533
534	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
535	# may not be valid XML (eg. if HTML-only entities like   are used)
536	$new_text =~ s/&\w{1,10};//g;
537	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
538	$new_text =~ s/&([^\#])/ $1/g;
539
540	return $new_text;
541	}
542
543	sub delete_assoc_files
544	{
545	my $self = shift (@_);
546	my ($archivedir, $edit_mode) = @_;
547
548	$self->basebuildproc::delete_assoc_files(@_);
549
550	if ($edit_mode eq "delete") {
551	# if we are deleting the doc, then also delete the lucene text version
552	my $assoc_dir = &util::filename_cat($self->{'build_dir'},"text", $archivedir);
553	if (-d $assoc_dir) {
554	&util::rm_r($assoc_dir);
555	}
556	}
557	}
558	1;
559
560

Note: See TracBrowser for help on using the repository browser.

Download in other formats: