Context Navigation

source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 27565

Last change on this file since 27565 was 27565, checked in by kjdon, 11 years ago
ignore special keywords which should be only in indexes list, and ignore sort special keyword 'rank'
Property svn:keywords set to `Author Date Id Revision`
File size: 21.3 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40
# Compile-time setup: make lucenebuildproc a subclass of mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.
#
# Builds the object with the mgppbuildproc constructor (all arguments are
# passed through unchanged), adds the Lucene-specific bookkeeping fields and
# re-blesses the result into $class.
#
# Returns the new object.
sub new {
    my $class = shift @_;
    my $self  = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;

    # Fields actually specified in the index, as a map (field name => 1).
    # This has to be a hash ref: assigning the empty list "()" to a scalar
    # slot stores undef, which only worked because later writes autovivified.
    $self->{'specified_fields'} = {};

    $self->{'allfields_index'}        = 0;  # do we need allfields index?
    $self->{'all_metadata_specified'} = 0;  # are we indexing all metadata?
    $self->{'actualsortfields'}       = {}; # sort fields that have actually been used
    $self->{'sortfieldnamemap'}       = {}; # field name <-> sort shortname, eg dc.Title->byTI

    return bless $self, $class;
}
58
# Records the index specification and derives the per-index flags.
#
# The specification is first handed to mgppbuildproc::set_index.  Then the
# part of $self->{'index'} before the first ':' is split on ';' and each
# entry is classified:
#   "allfields" -> sets 'allfields_index'
#   "metadata"  -> sets 'all_metadata_specified'
#   anything else is stored (minus a leading "top") in 'specified_fields'
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->mgppbuildproc::set_index($index, $indexexparr);

    # drop any subcollection/language part that follows the first ':'
    my ($field_spec) = split(/:/, $self->{'index'});

    for my $entry (split(/;/, $field_spec)) {
        if ($entry eq "allfields") {
            $self->{'allfields_index'} = 1;
            next;
        }
        if ($entry eq "metadata") {
            $self->{'all_metadata_specified'} = 1;
            next;
        }
        (my $plain = $entry) =~ s/^top//;
        $self->{'specified_fields'}->{$plain} = 1;
    }
}
79
# Stores $index_type under 'sections_sort_on_document_metadata'.  textedit
# compares this setting against "always" and "unless_section_metadata_exists"
# when gathering sort-field metadata for non-top sections.
sub set_sections_sort_on_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_sort_on_document_metadata'} = $index_type;
}
86
# Stores the list of sort fields in $self->{'sortfields'}.
#
# $sortfields is an array ref of field names.  The special names "text",
# "allfields" and "metadata" are only valid in the index list, not as sort
# fields, so they are dropped here.
#
# 'sortfields' is always left as an array ref (possibly empty), even when
# every entry is filtered out or $sortfields is undefined; previously it
# was left undefined in those cases.
sub set_sortfields {
    my ($self, $sortfields) = @_;

    $self->{'sortfields'} = [
        grep { $_ !~ /^(?:text|allfields|metadata)$/ } @{ $sortfields || [] }
    ];

    return;
}
99
# Capability flag: always returns 1, because (unlike MG and MGPP) Lucene
# supports incremental building.  The invocant is not consulted.
sub is_incremental_capable {
    return 1;
}
107
108
# textedit($doc_obj, $file, $edit_mode)
#
# Serialises one document into a tagged text block and prints it to
# $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj   - document object; nothing is output unless its
#                get_doc_type() is "indexed_doc"
#   $file      - written into the file="..." attribute of the document tag
#   $edit_mode - "add" and "update" increment the statistics counters and
#                append the index/metadata/allfields text; any other value
#                (the wrappers in this file pass "delete") decrements the
#                counters and does not append that text
#
# Side effects: updates num_docs, num_sections, num_bytes and
# num_processed_bytes; adds newly created shortnames to fieldnamemap and
# sortfieldnamemap; records fields in allindexfields, extraindexfields and
# actualsortfields.
#
# NOTE(review): $outhandle, $ldoc_level, $sec_gs2_id and $new_field are
# assigned below but never read in this sub.
# NOTE(review): $file and the OID are interpolated into XML attributes
# without escaping - confirm they can never contain '"', '<' or '&'.
109	sub textedit {
110	my $self = shift (@_);
111	my ($doc_obj,$file,$edit_mode) = @_;
112
113	my $lucenehandle = $self->{'output_handle'};
114	my $outhandle = $self->{'outhandle'};
115
116	# only output this document if it is one to be indexed
117	return if ($doc_obj->get_doc_type() ne "indexed_doc");
118
119	# skip this document if in "compress-text" mode and asked to delete it
120	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
121
122	# 0/1 to indicate whether this doc is part of the specified subcollection
123	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
124
125	# this is another document
126	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
127	$self->{'num_docs'} += 1;
128	}
129	else {
130	$self->{'num_docs'} -= 1;
131	}
132
133
134	# get the parameters for the output
135	# split on : just in case there is subcoll and lang stuff
136	my ($fields) = split (/:/, $self->{'index'});
137
138	my $doc_tag_name = $mgppbuildproc::level_map{'document'};
139
140	my $levels = $self->{'levels'};
141	my $ldoc_level = $levels->{'document'};
142	my $lsec_level = $levels->{'section'};
143
144	my $gs2_docOID = $doc_obj->get_OID();
145	my $documenttag = undef;
146	my $documentendtag = undef;
147
148	$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
149	$documentendtag = "\n</$doc_tag_name>\n";
150
# section tags are only emitted when a 'section' level is configured;
# an empty $sec_tag_name means "document level only"
151	my $sec_tag_name = "";
152	if ($lsec_level)
153	{
154	$sec_tag_name = $mgppbuildproc::level_map{'section'};
155	}
156
157	my $doc_section = 0; # just for this document
158
159	my $text = "";
160	$text .= $documenttag;
161	# get the text for this document
162	my $section = $doc_obj->get_top_section();
163	while (defined $section)
164	{
165	# update a few statistics
166	$doc_section++;
167	$self->{'num_sections'}++;
168
169	my $sec_gs2_id = $self->{'num_sections'};
170	my $sec_gs2_docOID = $gs2_docOID;
171	$sec_gs2_docOID .= ".$section" if ($section ne "");
172
173	# if we are doing subcollections, then some docs shouldn't be indexed.
174	# but we need to put the section tag placeholders in there so the
175	# sections match up with database
176	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
177	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
178	if ($sec_tag_name ne "") {
179	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
180	$text .= "\n</$sec_tag_name>\n"
181	}
182	$section = $doc_obj->get_next_section($section);
183	next;
184	}
185
186	if ($sec_tag_name ne "")
187	{
188	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
189	}
190
191	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
192	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
193	}
194	else {
195	# delete
196	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
197	}
198
199
200	# collect up all the text for allfields index in here (if there is one)
201	my $allfields_text = "";
202
# ---- pass 1: the explicitly listed index fields (';'-separated) ----
203	foreach my $field (split (/;/, $fields)) {
204
205	# only deal with this field if it doesn't start with top or
206	# this is the first section
207	my $real_field = $field;
208	next if (($real_field =~ s/^top//) && ($doc_section != 1));
209
210	# process these two later
211	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
212
213	#individual metadata and or text specified - could be a comma separated list
214	#$specified_fields->{$real_field} = 1;
215	my $shortname="";
216	my $new_field = 0; # have we found a new field name?
# fieldnamemap is keyed both ways: real field -> shortname, and
# shortname -> 1 to mark the shortname as taken
217	if (defined $self->{'fieldnamemap'}->{$real_field}) {
218	$shortname = $self->{'fieldnamemap'}->{$real_field};
219	} else {
220	$shortname = $self->create_shortname($real_field);
221	$self->{'fieldnamemap'}->{$real_field} = $shortname;
222	$self->{'fieldnamemap'}->{$shortname} = 1;
223	}
224	my @metadata_list = (); # put any metadata values in here
225	my $section_text = ""; # put the text in here
226	foreach my $submeta (split /,/, $real_field) {
227	if ($submeta eq "text") {
228	# no point in indexing text more than once
229	if ($section_text eq "") {
230	$section_text = $doc_obj->get_text($section);
231	if ($self->{'indexing_text'}) {
232	# we always strip html
233	$section_text = $self->preprocess_text($section_text, 1, "");
234	}
235	else {
236	# leave html stuff in, but escape the tags
237	&ghtml::htmlsafe($section_text);
238	}
239	}
240	}
241	else {
242	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
243
244	# its a metadata element
245	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
# NOTE(review): 'sections_index_document_metadata' is not assigned anywhere
# in this file; presumably it is set by the superclass - confirm
246	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
247	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
248	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
249	}
250	}
251	push (@metadata_list, @section_metadata);
252	}
253	} # for each field in this one index
254
255
256	# now we add the text and/or metadata into new_text
257	if ($section_text ne "" \|\| scalar(@metadata_list)) {
258	my $new_text = "";
259
260	if ($section_text ne "") {
261	$new_text .= "$section_text ";
262	}
263
264	foreach my $item (@metadata_list) {
265	&ghtml::htmlsafe($item);
266	$new_text .= "$item ";
267	}
268
269	if ($self->{'allfields_index'}) {
270	$allfields_text .= $new_text;
271	}
272
273	if ($self->{'indexing_text'}) {
274	# add the tag
275	$new_text = "<$shortname index=\"1\">$new_text</$shortname>";
276	$self->{'allindexfields'}->{$real_field} = 1;
277	}
278	# filter the text
279	$new_text = $self->filter_text ($field, $new_text);
280
281	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
282	$self->{'num_processed_bytes'} += length ($new_text);
283	$text .= "$new_text";
284	}
285	else {
286	# delete
287	$self->{'num_processed_bytes'} -= length ($new_text);
288	}
289	}
290
291	} # foreach field
292
# ---- pass 2: "metadata" index - every metadata pair of this section that
# was not already covered by an explicitly specified field ----
293	if ($self->{'all_metadata_specified'}) {
294
295	my $new_text = "";
296	my $shortname = "";
297	my $metadata = $doc_obj->get_all_metadata ($section);
298	foreach my $pair (@$metadata) {
299	my ($mfield, $mvalue) = (@$pair);
300	# no value
301	next unless defined $mvalue && $mvalue ne "";
302	# we have already indexed this
303	next if defined ($self->{'specified_fields'}->{$mfield});
304	# check fields here, maybe others dont want - change to use dontindex!!
305	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
306	next if ($mfield =~ /^gsdl/);
307
308	&ghtml::htmlsafe($mvalue);
309
310	if (defined $self->{'fieldnamemap'}->{$mfield}) {
311	$shortname = $self->{'fieldnamemap'}->{$mfield};
312	}
313	else {
314	$shortname = $self->create_shortname($mfield);
315	$self->{'fieldnamemap'}->{$mfield} = $shortname;
316	$self->{'fieldnamemap'}->{$shortname} = 1;
317	}
318	$self->{'allindexfields'}->{$mfield} = 1;
319	$new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
320	if ($self->{'allfields_index'}) {
321	$allfields_text .= "$mvalue ";
322	}
323
324	if (!defined $self->{'extraindexfields'}->{$mfield}) {
325	$self->{'extraindexfields'}->{$mfield} = 1;
326	}
327
328	}
329	# filter the text
330	$new_text = $self->filter_text ("metadata", $new_text);
331
332	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
333	$self->{'num_processed_bytes'} += length ($new_text);
334	$text .= "$new_text";
335	}
336	else {
337	# delete
338	$self->{'num_processed_bytes'} -= length ($new_text);
339	}
340	}
341
# ---- pass 3: "allfields" index - everything gathered in passes 1 and 2,
# wrapped in the fixed ZZ tag ----
342	if ($self->{'allfields_index'}) {
343
344	my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
345	# filter the text
346	$new_text = $self->filter_text ("allfields", $new_text);
347
348	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
349	$self->{'num_processed_bytes'} += length ($new_text);
350	$text .= "$new_text";
351	}
352	else {
353	# delete
354	$self->{'num_processed_bytes'} -= length ($new_text);
355	}
356	}
357	# only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
358	if ($self->{'indexing_text'} && ($sec_tag_name ne "" \|\| $doc_section == 1 )) {
359	# add sort fields if there are any
360
361	foreach my $sfield (@{$self->{'sortfields'}}) {
362	# ignore special field rank
363	next if $sfield eq "rank";
364	my $sf_shortname;
365	if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
366	$sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
367	}
368	else {
369	$sf_shortname = $self->create_sortfield_shortname($sfield);
370	$self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
371	$self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
372	}
373	my @metadata_list = (); # put any metadata values in here
374	foreach my $submeta (split /,/, $sfield) {
375	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
376
377	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
378	if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
379	if ($self->{'sections_sort_on_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
380	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
381	}
382	}
383	push (@metadata_list, @section_metadata);
384	}
# unlike the index fields above, the values are concatenated with no
# separator between them
385	my $new_text = "";
386	foreach my $item (@metadata_list) {
387	&ghtml::htmlsafe($item);
388	$new_text .= "$item";
389	}
390	if ($new_text =~ /\S/) {
391	$new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
392	# filter the text???
# NOTE(review): appended regardless of $edit_mode and not counted in
# num_processed_bytes - confirm this is intended for "delete"
393	$text .= "$new_text"; # add it to the main text block
394	$self->{'actualsortfields'}->{$sfield} = 1;
395	}
396	}
397	}
398	$text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
399
400	$section = $doc_obj->get_next_section($section);
401	} # for each section
402
403	#open (TEXTOUT, ">text.out");
404	#print TEXTOUT "$text\n$documentendtag";
405	#close TEXTOUT;
406
407	print $lucenehandle "$text\n$documentendtag";
408
409	## if ($edit_mode eq "delete") {
410	## print STDERR "$text\n$documentendtag";
411	## }
412
413	}
414
# Passes the document to textedit in "add" mode.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
421
# Passes the document to textedit in "update" mode.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
429
# Passes the document to textedit in "delete" mode.
sub textdelete {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "delete");
}
437
438
439
440
441
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In any mode other than "incinfodb" the call is passed straight through
#  *  to mgppbuildproc::process.  In "incinfodb" mode the document's sections
#  *  are given the metadata doctype, hastxt, archivedir, thistype, childtype,
#  *  contains and docnum, and the document is then handed to
#  *  IncrementalBuildUtils::addDocument.
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the GAPlug
#  *  @param  $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we
    # want to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb")
    {
        return $self->mgppbuildproc::process(@_);
    }

    my $verbose = ($self->{'verbosity'} > 3);
    print STDERR "* Processing a document added using INCINFODB *\n" if $verbose;

    # The archive directory is everything before the last path separator
    # ('/' or '\').  Both quantifiers matter: without them the pattern only
    # matches two- or three-character paths and archivedir is always empty.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;   # normalise to forward slashes
    $archivedir =~ s/^\/+//;    # no leading separators
    $archivedir =~ s/\/+$//;    # no trailing separators

    # Number of files
    print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if $verbose;

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
    {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if $verbose;

    while (defined $section)
    {
        print STDERR "+ processing section: '$section'\n" if $verbose;

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
        {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $has_text = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $has_text);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
        {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children: each entry is '".' followed by the
        # child's trailing number when it has one, else the whole child id
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = map { ($_ =~ /^.*?\.(\d+)$/) ? "\".$1" : "\".$_" } @$children;
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if $verbose;
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);

        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if $verbose;
    &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
556
557
558	# Following methods seem to be no different to those defined in basebuildproc.pm
559	# From inspection, it looks like these ones can be removed
560
561
# Accessor for the 'num_docs' counter.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
567
# Accessor for the 'num_sections' counter.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
573
574	# num_bytes is the actual number of bytes in the collection
575	# this is normally the same as what's processed during text compression
# Accessor for the 'num_bytes' counter.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
581
582
583	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
584	# Otherwise the removal of tags below might lead to Lucene turning
585	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
586	# (example from demo collection b20cre)
587	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
# Parameters:
#   $text       - the text to clean up
#   $strip_html - must be true; otherwise nothing is done and nothing is
#                 returned (undef in scalar context)
#   $para       - replacement for <p> tags; "" means "just remove all tags"
#
# Returns the cleaned text: tags replaced, named entities removed, and every
# '&' that does not start a numeric entity replaced by a space.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags (each becomes a space so words stay apart)
        $new_text =~ s/<[^>]*>/ /gs;
    } else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text
    # passed to Lucene for indexing may not be valid XML (eg. if HTML-only
    # entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities
    # (which are valid XML).  A lookahead is used rather than consuming the
    # following character, so that an '&' at the very end of the text and
    # the second of two adjacent '&' characters are removed as well.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
618
# Removes the files associated with an archive directory.
#
# The work is first delegated to basebuildproc::delete_assoc_files with the
# same arguments.  When $edit_mode is "delete", the Lucene text copy of the
# document (<build_dir>/text/<archivedir>) is removed too, if that
# directory exists.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    $self->basebuildproc::delete_assoc_files(@_);

    my $deleting_doc = ($edit_mode eq "delete");
    if ($deleting_doc) {
        # the document is going away, so its lucene text version must go as well
        my $text_dir = &util::filename_cat($self->{'build_dir'}, "text", $archivedir);
        &util::rm_r($text_dir) if (-d $text_dir);
    }
}
634
# Returns the shortname to use for a sort field: "by" followed by the index
# shortname of $realname.  A shortname already recorded in 'fieldnamemap'
# for this field is reused; otherwise one is obtained from create_shortname.
sub create_sortfield_shortname {
    my ($self, $realname) = @_;

    my $existing = $self->{'fieldnamemap'}->{$realname};
    my $index_shortname = defined($existing)
        ? $existing
        : $self->create_shortname($realname);

    return "by" . $index_shortname;
}
649
650
651	1;
652
653

Note: See TracBrowser for help on using the repository browser.

Download in other formats: