Context Navigation

source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 28355

Last change on this file since 28355 was 28035, checked in by kjdon, 11 years ago
handle sort field none as well as rank
Property svn:keywords set to `Author Date Id Revision`
File size: 21.3 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Set up inheritance at compile time: lucenebuildproc derives from
# mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = ('mgppbuildproc');
}
44
45
# Constructor.
#
# All arguments are handed unchanged to the mgppbuildproc constructor; the
# resulting object is then given the Lucene-specific bookkeeping fields and
# re-blessed into $class.
sub new {
    my $class = shift @_;
    my $self  = mgppbuildproc->new(@_);

    # running document number used by process() for incrementally added docs
    $self->{'numincdocs'} = 0;
    # set (hash ref) of the fields explicitly named in the index specification
    $self->{'specified_fields'} = {};
    # do we need an allfields index?
    $self->{'allfields_index'} = 0;
    # are we indexing all metadata?
    $self->{'all_metadata_specified'} = 0;
    # sort fields that have actually been used
    $self->{'actualsortfields'} = {};
    # mapping between field name and field shortname, eg dc.Title->byTI
    $self->{'sortfieldnamemap'} = {};

    return bless $self, $class;
}
58
# Record the index specification (via mgppbuildproc::set_index) and then
# note which kinds of field it names: the special "allfields" and "metadata"
# entries set flags, every other entry is remembered in 'specified_fields'
# with any leading "top" removed.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->mgppbuildproc::set_index($index, $indexexparr);

    # keep only what precedes the first ':' - no subcoll stuff
    my ($field_spec) = split(/:/, $self->{'index'});

    for my $name (split(/;/, $field_spec)) {
        if ($name eq "allfields") {
            $self->{'allfields_index'} = 1;
            next;
        }
        if ($name eq "metadata") {
            $self->{'all_metadata_specified'} = 1;
            next;
        }
        $name =~ s/^top//;
        $self->{'specified_fields'}->{$name} = 1;
    }
}
79
# Store the setting that textedit consults when deciding whether a
# non-top section should also pick up the top section's values for a sort
# field ("always" / "unless_section_metadata_exists").
sub set_sections_sort_on_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_sort_on_document_metadata'} = $index_type;
}
86
# Store the list of sort fields.
#
# $sortfields is an array ref of field names.  "text", "allfields" and
# "metadata" are only valid for indexes, not for sort fields, so they are
# dropped.  'sortfields' is always left as an array ref (possibly empty),
# even when $sortfields is undef or every entry is filtered out.
sub set_sortfields {
    my ($self, $sortfields) = @_;

    $self->{'sortfields'} = [
        grep { $_ !~ /^(?:text|allfields|metadata)$/ } @{ $sortfields || [] }
    ];
    return;
}
99
# Unlike MG and MGPP, Lucene supports incremental building, so this
# always answers true (1).
sub is_incremental_capable {
    my ($self) = @_;

    return 1;
}
107
108
# Write the XML describing one document to the Lucene output handle and
# update the build statistics.
#
#   $doc_obj   - the document object to output
#   $file      - file name, copied into the document tag's file attribute
#   $edit_mode - "add", "update" or "delete"; add/update increase the
#                statistics, any other value decreases them and leaves the
#                index/metadata text out of the output
#
# For every section the configured index fields, the "metadata" pseudo field
# (all metadata not otherwise specified), the "allfields" pseudo field and
# finally the sort fields are emitted, in that order.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    # 0/1 to indicate whether this doc is part of the specified subcollection
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # add/update count up, everything else (delete) counts down
    my $adding = (($edit_mode eq "add") || ($edit_mode eq "update"));

    # this is another document
    if ($adding) {
        $self->{'num_docs'} += 1;
    }
    else {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    # section tags are only written when a section level has been configured
    my $levels = $self->{'levels'};
    my $sec_tag_name = "";
    if ($levels->{'section'}) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $gs2_docOID = $doc_obj->get_OID();
    my $top_section = $doc_obj->get_top_section();

    # NOTE(review): $file and the OID are interpolated into XML attributes
    # without escaping - confirm they can never contain '"', '<' or '&'.
    my $text = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    # Account for a chunk of processed text: when adding it is counted and
    # appended to the output, when deleting it is only un-counted.
    my $emit = sub {
        my ($chunk) = @_;
        if ($adding) {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= "$chunk";
        }
        else {
            # delete
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # Look up the short name of an index field, creating and registering one
    # the first time the field is seen.
    my $shortname_for = sub {
        my ($name) = @_;
        my $short = $self->{'fieldnamemap'}->{$name};
        if (!defined $short) {
            $short = $self->create_shortname($name);
            $self->{'fieldnamemap'}->{$name} = $short;
            $self->{'fieldnamemap'}->{$short} = 1;
        }
        return $short;
    };

    # Fetch the values of one metadata element for a section.  For sections
    # other than the top one, $policy ("always" or
    # "unless_section_metadata_exists") may add the top section's values.
    my $metadata_for = sub {
        my ($sec, $name, $policy) = @_;
        my @values = @{$doc_obj->get_metadata ($sec, $name)};
        if ($sec ne $top_section && defined ($policy)) {
            if ($policy eq "always" || (scalar(@values) == 0 && $policy eq "unless_section_metadata_exists")) {
                push (@values, @{$doc_obj->get_metadata ($top_section, $name)});
            }
        }
        return @values;
    };

    my $doc_section = 0; # just for this document

    # get the text for this document
    my $section = $top_section;
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n";
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        if ($sec_tag_name ne "") {
            $text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        if ($adding) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # collect up all the text for allfields index in here (if there is one)
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            my $shortname = $shortname_for->($real_field);

            # document level metadata is only inherited when indexing text
            my $inherit_policy = $self->{'indexing_text'} ? $self->{'sections_index_document_metadata'} : undef;

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    # strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    # its a metadata element
                    push (@metadata_list, $metadata_for->($section, $submeta, $inherit_policy));
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($self->{'allfields_index'}) {
                    $allfields_text .= $new_text;
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
                    $self->{'allindexfields'}->{$real_field} = 1;
                }
                # filter the text (note: keyed on the original name, "top" prefix included)
                $new_text = $self->filter_text ($field, $new_text);

                $emit->($new_text);
            }

        } # foreach field

        if ($self->{'all_metadata_specified'}) {

            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($self->{'specified_fields'}->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname = $shortname_for->($mfield);
                $self->{'allindexfields'}->{$mfield} = 1;
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($self->{'allfields_index'}) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'extraindexfields'}->{$mfield}) {
                    $self->{'extraindexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($self->{'allfields_index'}) {

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        # only add sort fields for this section if we are indexing this section,
        # we are doing section level indexing or this is the top section
        if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1)) {
            # add sort fields if there are any

            foreach my $sfield (@{$self->{'sortfields'}}) {
                # ignore special fields rank and none
                next if ($sfield eq "rank" || $sfield eq "none");

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    # strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    push (@metadata_list, $metadata_for->($section, $submeta, $self->{'sections_sort_on_document_metadata'}));
                }

                # sort values are concatenated with no separator
                my $new_text = "";
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
                    # filter the text???
                    $text .= "$new_text"; # add it to the main text block
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }
        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # for each section

    print {$lucenehandle} "$text\n$documentendtag";
}
414
# Output a document that is being added: textedit in "add" mode.
sub text {
    my ($self, $doc_obj, $file) = @_;

    $self->textedit($doc_obj, $file, "add");
}
421
# Output a document that is being reindexed: textedit in "update" mode.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    $self->textedit($doc_obj, $file, "update");
}
429
# Output a document that is being removed: textedit in "delete" mode.
sub textdelete {
    my ($self, $doc_obj, $file) = @_;

    $self->textedit($doc_obj, $file, "delete");
}
437
438
439
440
441
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In every mode other than "incinfodb" the call is passed straight through
#  *  to mgppbuildproc::process.  In "incinfodb" mode each section of the
#  *  document is given the bookkeeping metadata (doctype, hastxt, archivedir,
#  *  thistype, childtype, contains, docnum) and the document is then handed
#  *  to IncrementalBuildUtils::addDocument.
#  *
#  *  @param $self    A reference to this Lucene builder
#  *  @param $doc_obj A reference to a document object representing what was
#  *                  parsed by the GAPlug
#  *  @param $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we want
    # to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb")
    {
        return $self->mgppbuildproc::process(@_);
    }

    my $verbose = ($self->{'verbosity'} > 3);

    print STDERR "* Processing a document added using INCINFODB *\n" if ($verbose);

    # the archive directory is everything before the last path separator
    # (either kind of slash), normalised to forward slashes with no leading
    # or trailing slash
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($verbose);

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
    {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if ($verbose);

    while (defined $section)
    {
        print STDERR "+ processing section: '$section'\n" if ($verbose);

        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
        {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        if ($doc_obj->get_text_length($section) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 1);
        }
        else
        {
            $doc_obj->add_utf8_metadata($section, "hastxt", 0);
        }

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
        {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children)
            {
                # each entry is a double quote, a dot, then the child's final
                # numeric component (or the whole name if it has none)
                if ($child =~ /^.*?\.(\d+)$/)
                {
                    push (@contains, "\".$1");
                }
                else
                {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($verbose);
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($verbose);
    &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
556
557
558	# Following methods seem to be no different to those defined in basebuildproc.pm
559	# From inspection, it looks like these ones can be removed
560
561
# Accessor for the running document count maintained by textedit.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
567
# Accessor for the running section count maintained by textedit.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
573
# num_bytes is the actual number of bytes in the collection
# this is normally the same as what's processed during text compression
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
581
582
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
# "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
#   $text       - the text to clean up
#   $strip_html - must be true; when false nothing is returned
#   $para       - "" to replace every tag with a space, otherwise the value
#                 handed to process_tags for each tag
#
# Returns the cleaned text.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;

    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    # may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A look-ahead is used so that no following character is consumed: this
    # also catches an '&' at the very end of the text and each '&' of "&&".
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
618
# Remove the files associated with a document.  The work is done by
# basebuildproc::delete_assoc_files; in addition, when the document itself
# is being deleted ($edit_mode is "delete"), the lucene text version under
# <build_dir>/text/<archivedir> is removed too, if that directory exists.
sub delete_assoc_files
{
    my ($self, @args) = @_;
    my ($archivedir, $edit_mode) = @args;

    $self->basebuildproc::delete_assoc_files(@args);

    if ($edit_mode eq "delete") {
        my $text_dir = &util::filename_cat($self->{'build_dir'}, "text", $archivedir);
        &util::rm_r($text_dir) if (-d $text_dir);
    }
}
634
# Build the short name of a sort field: "by" followed by the short name of
# the index on the same field.  If an index short name has already been
# recorded in 'fieldnamemap' it is reused, otherwise one is obtained from
# create_shortname (without being recorded here).
sub create_sortfield_shortname {
    my ($self, $realname) = @_;

    my $known = $self->{'fieldnamemap'}->{$realname};
    my $index_shortname = defined $known
        ? $known
        : $self->create_shortname($realname);

    return "by" . $index_shortname;
}
649
650
651	1;
652
653

Note: See TracBrowser for help on using the repository browser.

Download in other formats: