Context Navigation

source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 23181

Last change on this file since 23181 was 23181, checked in by sjm84, 14 years ago
The end brace of the delete_assoc_files sub in the lucenebuildproc.pm file was accidentally left off in the previous commit
Property svn:keywords set to `Author Date Id Revision`
File size: 18.4 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
41	sub BEGIN {
42	@lucenebuildproc::ISA = ('mgppbuildproc');
43	}
44
45
46	sub new {
47	my $class = shift @_;
48	my $self = new mgppbuildproc (@_);
49
50	$self->{'numincdocs'} = 0;
51
52	return bless $self, $class;
53	}
54
55
56	sub is_incremental_capable
57	{
58	my $self = shift (@_);
59
60	# Unlike MG and MGPP, Lucene supports incremental building
61	return 1;
62	}
63
64
65	sub textedit {
66	my $self = shift (@_);
67	my ($doc_obj,$file,$edit_mode) = @_;
68
69	my $lucenehandle = $self->{'output_handle'};
70	my $outhandle = $self->{'outhandle'};
71
72	# only output this document if it is one to be indexed
73	return if ($doc_obj->get_doc_type() ne "indexed_doc");
74
75	# skip this document if in "compress-text" mode and asked to delete it
76	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
77
78	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
79
80	# this is another document
81	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
82	$self->{'num_docs'} += 1;
83	}
84	else {
85	$self->{'num_docs'} -= 1;
86	}
87
88	# get the parameters for the output
89	# split on : just in case there is subcoll and lang stuff
90	my ($fields) = split (/:/, $self->{'index'});
91
92	my $doc_tag_name = $mgppbuildproc::level_map{'document'};
93
94	my $levels = $self->{'levels'};
95	my $ldoc_level = $levels->{'document'};
96	my $lsec_level = $levels->{'section'};
97
98	# gs2_id should be depricated #####
99	my $gs2_id = "";
100	if ($ldoc_level)
101	{
102	if ($self->{'db_level'} eq 'document')
103	{
104	$gs2_id = $self->{'num_docs'};
105	}
106	else
107	{
108	# default is section level
109	$gs2_id = $self->{'num_sections'} + 1;
110	}
111	}
112	my $gs2_docOID = $doc_obj->get_OID();
113	my $documenttag = undef;
114	my $documentendtag = undef;
115
116	#$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:id=\"$gs2_id\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
117	$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
118	$documentendtag = "\n</$doc_tag_name>\n";
119
120	my $sec_tag_name = "";
121	if ($lsec_level)
122	{
123	$sec_tag_name = $mgppbuildproc::level_map{'section'};
124	}
125
126	my $doc_section = 0; # just for this document
127
128	my $text = "";
129	$text .= $documenttag;
130	# get the text for this document
131	my $section = $doc_obj->get_top_section();
132	while (defined $section)
133	{
134	# update a few statistics
135	$doc_section++;
136	$self->{'num_sections'}++;
137
138	my $sec_gs2_id = $self->{'num_sections'};
139	my $sec_gs2_docOID = $gs2_docOID;
140	$sec_gs2_docOID .= ".$section" if ($section ne "");
141
142	# if we are doing subcollections, then some docs shouldn't be indexed.
143	# but we need to put the section tag placeholders in there so the
144	# sections match up with database
145	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
146	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
147	if ($sec_tag_name ne "") {
148	#$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
149	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
150	$text .= "\n</$sec_tag_name>\n"
151	}
152	$section = $doc_obj->get_next_section($section);
153	next;
154	}
155
156	if ($sec_tag_name ne "")
157	{
158	#$text .= "\n<$sec_tag_name gs2:id=\"$sec_gs2_id\" gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
159	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
160	}
161
162	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
163	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
164	}
165	else {
166	# delete
167	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
168	}
169
170
171	# has the user added a 'metadata' index?
172	my $all_metadata_specified = 0;
173	# which fields have already been indexed? (same as fields, but in a map)
174	my $specified_fields = {};
175
176	# do we have an allfields index??
177	my $allfields_index = 0;
178	# collect up all the text for it in here
179	my $allfields_text = "";
180	foreach my $field (split (/;/, $fields)) {
181	if ($field eq "allfields") {
182	$allfields_index = 1;
183	} elsif ($field eq "metadata") {
184	$all_metadata_specified = 1;
185	}
186	}
187
188	foreach my $field (split (/;/, $fields)) {
189
190	# only deal with this field if it doesn't start with top or
191	# this is the first section
192	my $real_field = $field;
193	next if (($real_field =~ s/^top//) && ($doc_section != 1));
194
195	# process these two later
196	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
197
198	#individual metadata and or text specified - could be a comma separated list
199	$specified_fields->{$real_field} = 1;
200	my $shortname="";
201	my $new_field = 0; # have we found a new field name?
202	if (defined $self->{'indexfieldmap'}->{$real_field}) {
203	$shortname = $self->{'indexfieldmap'}->{$real_field};
204	}
205	else {
206	$shortname = $self->create_shortname($real_field);
207	$new_field = 1;
208	}
209
210	my @metadata_list = (); # put any metadata values in here
211	my $section_text = ""; # put the text in here
212	foreach my $submeta (split /,/, $real_field) {
213	if ($submeta eq "text") {
214	# no point in indexing text more than once
215	if ($section_text eq "") {
216	$section_text = $doc_obj->get_text($section);
217	if ($self->{'indexing_text'}) {
218	# we always strip html
219	$section_text = $self->preprocess_text($section_text, 1, "");
220	}
221	else {
222	# leave html stuff in, but escape the tags
223	&ghtml::htmlsafe($section_text);
224	}
225	}
226	}
227	else {
228	$submeta =~ s/^ex\.//; #strip off ex.
229
230	# its a metadata element
231	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
232	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
233	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
234	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
235	}
236	}
237	push (@metadata_list, @section_metadata);
238	}
239	} # for each field in this one index
240
241	# now we add the text and/or metadata into new_text
242	if ($section_text ne "" \|\| scalar(@metadata_list)) {
243	my $new_text = "";
244
245	if ($section_text ne "") {
246	$new_text .= "$section_text ";
247	}
248
249	foreach my $item (@metadata_list) {
250	&ghtml::htmlsafe($item);
251	$new_text .= "$item ";
252	}
253
254	if ($allfields_index) {
255	$allfields_text .= $new_text;
256	}
257
258	if ($self->{'indexing_text'}) {
259	# add the tag
260	$new_text = "<$shortname index=\"1\">$new_text</$shortname>";
261	}
262	# filter the text
263	$new_text = $self->filter_text ($field, $new_text);
264
265	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
266	$self->{'num_processed_bytes'} += length ($new_text);
267	$text .= "$new_text";
268	}
269	else {
270	# delete
271	$self->{'num_processed_bytes'} -= length ($new_text);
272	}
273
274
275	if ($self->{'indexing_text'} && $new_field) {
276	# we need to add to the list in indexfields
277
278	$self->{'indexfieldmap'}->{$real_field} = $shortname;
279	$self->{'indexfieldmap'}->{$shortname} = 1;
280	}
281
282	}
283
284	} # foreach field
285
286
287	if ($all_metadata_specified) {
288
289	my $new_text = "";
290	my $shortname = "";
291	my $metadata = $doc_obj->get_all_metadata ($section);
292	foreach my $pair (@$metadata) {
293	my ($mfield, $mvalue) = (@$pair);
294	# no value
295	next unless defined $mvalue && $mvalue ne "";
296	# we have already indexed this
297	next if defined ($specified_fields->{$mfield});
298	# check fields here, maybe others dont want - change to use dontindex!!
299	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
300	next if ($mfield =~ /^gsdl/);
301
302	&ghtml::htmlsafe($mvalue);
303
304	if (defined $self->{'indexfieldmap'}->{$mfield}) {
305	$shortname = $self->{'indexfieldmap'}->{$mfield};
306	}
307	else {
308	$shortname = $self->create_shortname($mfield);
309	$self->{'indexfieldmap'}->{$mfield} = $shortname;
310	$self->{'indexfieldmap'}->{$shortname} = 1;
311	}
312	$new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
313	if ($allfields_index) {
314	$allfields_text .= "$mvalue ";
315	}
316
317	if (!defined $self->{'indexfields'}->{$mfield}) {
318	$self->{'indexfields'}->{$mfield} = 1;
319	}
320
321	}
322	# filter the text
323	$new_text = $self->filter_text ("metadata", $new_text);
324
325	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
326	$self->{'num_processed_bytes'} += length ($new_text);
327	$text .= "$new_text";
328	}
329	else {
330	# delete
331	$self->{'num_processed_bytes'} -= length ($new_text);
332	}
333	}
334
335	if ($allfields_index) {
336	# add the index name mapping
337	$self->{'indexfieldmap'}->{"allfields"} = "ZZ";
338	$self->{'indexfieldmap'}->{"ZZ"} = 1;
339
340	my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
341	# filter the text
342	$new_text = $self->filter_text ("allfields", $new_text);
343
344	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
345	$self->{'num_processed_bytes'} += length ($new_text);
346	$text .= "$new_text";
347	}
348	else {
349	# delete
350	$self->{'num_processed_bytes'} -= length ($new_text);
351	}
352	}
353
354	$text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
355
356	$section = $doc_obj->get_next_section($section);
357	} # while defined section
358
359	print $lucenehandle "$text\n$documentendtag";
360
361	## if ($edit_mode eq "delete") {
362	## print STDERR "$text\n$documentendtag";
363	## }
364
365	}
366
367	sub text {
368	my $self = shift (@_);
369	my ($doc_obj,$file) = @_;
370
371	$self->textedit($doc_obj,$file,"add");
372	}
373
374	sub textreindex
375	{
376	my $self = shift (@_);
377	my ($doc_obj,$file) = @_;
378
379	$self->textedit($doc_obj,$file,"update");
380	}
381
382	sub textdelete
383	{
384	my $self = shift (@_);
385	my ($doc_obj,$file) = @_;
386
387	$self->textedit($doc_obj,$file,"delete");
388	}
389
390
391
392
393
394	# /** We make this builder pretend to be a document processor so we can get
395	# * information back from the plugins.
396	# *
397	# * @param $self A reference to this Lucene builder
398	# * @param $doc_obj A reference to a document object representing what was
399	# * parsed by the GAPlug
400	# * @param $file The name of the file parsed as a string
401	# *
402	# * @author John Thompson, DL Consulting Ltd
403	# */
404	sub process()
405	{
406	my $self = shift (@_);
407	my ($doc_obj, $file) = @_;
408
409	# If this is called from any stage other than an incremental infodb we want
410	# to pass through to the superclass of build
411	if ($self->get_mode() eq "incinfodb")
412	{
413	print STDERR "* Processing a document added using INCINFODB *\n" if ($self->{'verbosity'} > 3);
414	my ($archivedir) = $file =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
415	$archivedir = "" unless defined $archivedir;
416	$archivedir =~ s/\\/\//g;
417	$archivedir =~ s/^\/+//;
418	$archivedir =~ s/\/+$//;
419
420	# Number of files
421	print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
422
423	# resolve the final filenames of the files associated with this document
424	$self->assoc_files ($doc_obj, $archivedir);
425
426	# is this a paged or a hierarchical document
427	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
428
429	# Determine the actual docnum by checking if we've processed any
430	# previous incrementally added documents. If so, carry on from there.
431	# Otherwise we set the counter to be the same as the number of
432	# sections encountered during the previous build
433	if ($self->{'numincdocs'} == 0)
434	{
435	$self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
436	}
437
438	my $section = $doc_obj->get_top_section ();
439	print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
440	my $doc_OID = $doc_obj->get_OID();
441	my $url = "";
442	while (defined $section)
443	{
444	print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
445	# Attach all the other metadata to this document
446	# output the fact that this document is a document (unless doctype
447	# has been set to something else from within a plugin
448	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
449	if (!defined $dtype \|\| $dtype !~ /\w/)
450	{
451	#$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
452	$doc_obj->add_utf8_metadata($section, "doctype", "doc");
453	}
454	# output whether this node contains text
455	if ($doc_obj->get_text_length($section) > 0)
456	{
457	$doc_obj->add_utf8_metadata($section, "hastxt", 1);
458	}
459	else
460	{
461	$doc_obj->add_utf8_metadata($section, "hastxt", 0);
462	}
463
464	# output archivedir if at top level
465	if ($section eq $doc_obj->get_top_section())
466	{
467	$doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
468	$doc_obj->add_utf8_metadata($section, "thistype", $thistype);
469	}
470
471	# output a list of children
472	my $children = $doc_obj->get_children ($section);
473	if (scalar(@$children) > 0)
474	{
475	$doc_obj->add_utf8_metadata($section, "childtype", $childtype);
476	my @contains = ();
477	foreach my $child (@$children)
478	{
479	if ($child =~ /^.*?\.(\d+)$/)
480	{
481	push (@contains, "\".$1");
482	}
483	else
484	{
485	push (@contains, "\".$child");
486	}
487	}
488	$doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
489	}
490	#output the matching doc number
491	print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
492	$doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
493
494	$self->{'numincdocs'}++;
495	$section = $doc_obj->get_next_section($section);
496	# if no sections wanted, only add the docs
497	last if ($self->{'db_level'} eq "document");
498	}
499	print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
500	&IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
501	}
502	else
503	{
504	$self->mgppbuildproc::process(@_);
505	}
506	}
507	# / process() /
508
509
510	# Following methods seem to be no different to those defined in basebuildproc.pm
511	# From inspection, it looks like these ones can be removed
512
513
514	sub get_num_docs {
515	my $self = shift (@_);
516	#rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
517	return $self->{'num_docs'};
518	}
519
520	sub get_num_sections {
521	my $self = shift (@_);
522	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
523	return $self->{'num_sections'};
524	}
525
526	# num_bytes is the actual number of bytes in the collection
527	# this is normally the same as what's processed during text compression
528	sub get_num_bytes {
529	my $self = shift (@_);
530	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
531	return $self->{'num_bytes'};
532	}
533
534
535	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
536	# Otherwise the removal of tags below might lead to Lucene turning
537	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
538	# (example from demo collection b20cre)
539	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
540	sub preprocess_text
541	{
542	my $self = shift (@_);
543	my ($text, $strip_html, $para) = @_;
544	# at this stage, we do not do paragraph tags unless have strip_html -
545	# it will result in a huge mess of non-xml
546	return unless $strip_html;
547
548	my $new_text = $text;
549
550	# if we have <pre> tags, we can have < > inside them, need to delete
551	# the <> before stripping tags
552	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
553
554	if ($para eq "") {
555	# just remove all tags
556	$new_text =~ s/<[^>]*>/ /gs;
557	} else {
558	# strip all tags except <p> tags which get turned into $para
559	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
560	}
561
562	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
563	# may not be valid XML (eg. if HTML-only entities like   are used)
564	$new_text =~ s/&\w{1,10};//g;
565	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
566	$new_text =~ s/&([^\#])/ $1/g;
567
568	return $new_text;
569	}
570
571	sub delete_assoc_files
572	{
573	my $self = shift (@_);
574	my ($archivedir) = @_;
575
576	$self->basebuildproc::delete_assoc_files(@_);
577
578	my $assoc_dir = &util::filename_cat($self->{'build_dir'},"text", $archivedir);
579	if (-d $assoc_dir) {
580	&util::rm_r($assoc_dir);
581	}
582	}
583
584	1;
585
586

Note: See TracBrowser for help on using the repository browser.

Download in other formats: