Context Navigation

source: main/trunk/greenstone2/perllib/lucenebuildproc.pm@ 29820

Last change on this file since 29820 was 28566, checked in by kjdon, 11 years ago
changed some util:: to FIleUtils:: methods to avoid warnings
Property svn:keywords set to `Author Date Id Revision`
File size: 21.4 KB

Line
1	###########################################################################
2	#
3	# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package lucenebuildproc;
27
28	# This document processor outputs a document
29	# for lucene to process
30
31	# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33	use mgppbuildproc;
34	use ghtml;
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39	use IncrementalBuildUtils;
40	use FileUtils;
41
42	sub BEGIN {
43	@lucenebuildproc::ISA = ('mgppbuildproc');
44	}
45
46
47	sub new {
48	my $class = shift @_;
49	my $self = new mgppbuildproc (@_);
50
51	$self->{'numincdocs'} = 0;
52	$self->{'specified_fields'} = (); # list of fields actually specified in the index, in a map
53	$self->{'allfields_index'} = 0; # do we need allfields index?
54	$self->{'all_metadata_specified'} = 0; # are we indexing all metadata?
55	$self->{'actualsortfields'} = {}; # sort fields that have actually been used
56	$self->{'sortfieldnamemap'} = {}; # mapping between field name and field shortname, eg dc.Title->byTI
57	return bless $self, $class;
58	}
59
60	sub set_index {
61	my $self = shift (@_);
62	my ($index, $indexexparr) = @_;
63
64	$self->mgppbuildproc::set_index($index, $indexexparr);
65
66	# just get the list of index fields without any subcoll stuff
67	my ($fields) = split (/:/, $self->{'index'});
68
69	foreach my $field (split (/;/, $fields)) {
70	if ($field eq "allfields") {
71	$self->{'allfields_index'} = 1;
72	} elsif ($field eq "metadata") {
73	$self->{'all_metadata_specified'} = 1;
74	} else {
75	$field =~ s/^top//;
76	$self->{'specified_fields'} ->{$field} = 1;
77	}
78	}
79	}
80
81	sub set_sections_sort_on_document_metadata {
82	my $self= shift (@_);
83	my ($index_type) = @_;
84
85	$self->{'sections_sort_on_document_metadata'} = $index_type;
86	}
87
88	sub set_sortfields {
89	my $self = shift (@_);
90
91	my ($sortfields) = @_;
92	$self->{'sortfields'} = ();
93	# lets just go through and check for text, allfields, metadata which are only valid for indexes, not for sortfields
94	foreach my $s (@$sortfields) {
95	if ($s !~ /^(text\|allfields\|metadata)$/) {
96	push (@{$self->{'sortfields'}}, $s);
97	}
98	}
99	}
100
101	sub is_incremental_capable
102	{
103	my $self = shift (@_);
104
105	# Unlike MG and MGPP, Lucene supports incremental building
106	return 1;
107	}
108
109
110	sub textedit {
111	my $self = shift (@_);
112	my ($doc_obj,$file,$edit_mode) = @_;
113
114	my $lucenehandle = $self->{'output_handle'};
115	my $outhandle = $self->{'outhandle'};
116
117	# only output this document if it is one to be indexed
118	return if ($doc_obj->get_doc_type() ne "indexed_doc");
119
120	# skip this document if in "compress-text" mode and asked to delete it
121	return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
122
123	# 0/1 to indicate whether this doc is part of the specified subcollection
124	my $indexed_doc = $self->is_subcollection_doc($doc_obj);
125
126	# this is another document
127	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
128	$self->{'num_docs'} += 1;
129	}
130	else {
131	$self->{'num_docs'} -= 1;
132	}
133
134
135	# get the parameters for the output
136	# split on : just in case there is subcoll and lang stuff
137	my ($fields) = split (/:/, $self->{'index'});
138
139	my $doc_tag_name = $mgppbuildproc::level_map{'document'};
140
141	my $levels = $self->{'levels'};
142	my $ldoc_level = $levels->{'document'};
143	my $lsec_level = $levels->{'section'};
144
145	my $gs2_docOID = $doc_obj->get_OID();
146	my $documenttag = undef;
147	my $documentendtag = undef;
148
149	$documenttag = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\" gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
150	$documentendtag = "\n</$doc_tag_name>\n";
151
152	my $sec_tag_name = "";
153	if ($lsec_level)
154	{
155	$sec_tag_name = $mgppbuildproc::level_map{'section'};
156	}
157
158	my $doc_section = 0; # just for this document
159
160	my $text = "";
161	$text .= $documenttag;
162	# get the text for this document
163	my $section = $doc_obj->get_top_section();
164	while (defined $section)
165	{
166	# update a few statistics
167	$doc_section++;
168	$self->{'num_sections'}++;
169
170	my $sec_gs2_id = $self->{'num_sections'};
171	my $sec_gs2_docOID = $gs2_docOID;
172	$sec_gs2_docOID .= ".$section" if ($section ne "");
173
174	# if we are doing subcollections, then some docs shouldn't be indexed.
175	# but we need to put the section tag placeholders in there so the
176	# sections match up with database
177	my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") \|\| "indexed_section";
178	if (($indexed_doc == 0) \|\| ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
179	if ($sec_tag_name ne "") {
180	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
181	$text .= "\n</$sec_tag_name>\n"
182	}
183	$section = $doc_obj->get_next_section($section);
184	next;
185	}
186
187	if ($sec_tag_name ne "")
188	{
189	$text .= "\n<$sec_tag_name gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
190	}
191
192	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
193	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
194	}
195	else {
196	# delete
197	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
198	}
199
200
201	# collect up all the text for allfields index in here (if there is one)
202	my $allfields_text = "";
203
204	foreach my $field (split (/;/, $fields)) {
205
206	# only deal with this field if it doesn't start with top or
207	# this is the first section
208	my $real_field = $field;
209	next if (($real_field =~ s/^top//) && ($doc_section != 1));
210
211	# process these two later
212	next if ($real_field eq "allfields" \|\| $real_field eq "metadata");
213
214	#individual metadata and or text specified - could be a comma separated list
215	#$specified_fields->{$real_field} = 1;
216	my $shortname="";
217	my $new_field = 0; # have we found a new field name?
218	if (defined $self->{'fieldnamemap'}->{$real_field}) {
219	$shortname = $self->{'fieldnamemap'}->{$real_field};
220	} else {
221	$shortname = $self->create_shortname($real_field);
222	$self->{'fieldnamemap'}->{$real_field} = $shortname;
223	$self->{'fieldnamemap'}->{$shortname} = 1;
224	}
225	my @metadata_list = (); # put any metadata values in here
226	my $section_text = ""; # put the text in here
227	foreach my $submeta (split /,/, $real_field) {
228	if ($submeta eq "text") {
229	# no point in indexing text more than once
230	if ($section_text eq "") {
231	$section_text = $doc_obj->get_text($section);
232	if ($self->{'indexing_text'}) {
233	# we always strip html
234	$section_text = $self->preprocess_text($section_text, 1, "");
235	}
236	else {
237	# leave html stuff in, but escape the tags
238	&ghtml::htmlsafe($section_text);
239	}
240	}
241	}
242	else {
243	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
244
245	# its a metadata element
246	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
247	if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
248	if ($self->{'sections_index_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
249	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
250	}
251	}
252	push (@metadata_list, @section_metadata);
253	}
254	} # for each field in this one index
255
256
257	# now we add the text and/or metadata into new_text
258	if ($section_text ne "" \|\| scalar(@metadata_list)) {
259	my $new_text = "";
260
261	if ($section_text ne "") {
262	$new_text .= "$section_text ";
263	}
264
265	foreach my $item (@metadata_list) {
266	&ghtml::htmlsafe($item);
267	$new_text .= "$item ";
268	}
269
270	if ($self->{'allfields_index'}) {
271	$allfields_text .= $new_text;
272	}
273
274	if ($self->{'indexing_text'}) {
275	# add the tag
276	$new_text = "<$shortname index=\"1\">$new_text</$shortname>";
277	$self->{'allindexfields'}->{$real_field} = 1;
278	}
279	# filter the text
280	$new_text = $self->filter_text ($field, $new_text);
281
282	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
283	$self->{'num_processed_bytes'} += length ($new_text);
284	$text .= "$new_text";
285	}
286	else {
287	# delete
288	$self->{'num_processed_bytes'} -= length ($new_text);
289	}
290	}
291
292	} # foreach field
293
294	if ($self->{'all_metadata_specified'}) {
295
296	my $new_text = "";
297	my $shortname = "";
298	my $metadata = $doc_obj->get_all_metadata ($section);
299	foreach my $pair (@$metadata) {
300	my ($mfield, $mvalue) = (@$pair);
301	# no value
302	next unless defined $mvalue && $mvalue ne "";
303	# we have already indexed this
304	next if defined ($self->{'specified_fields'}->{$mfield});
305	# check fields here, maybe others dont want - change to use dontindex!!
306	next if ($mfield eq "Identifier" \|\| $mfield eq "classifytype" \|\| $mfield eq "assocfilepath");
307	next if ($mfield =~ /^gsdl/);
308
309	&ghtml::htmlsafe($mvalue);
310
311	if (defined $self->{'fieldnamemap'}->{$mfield}) {
312	$shortname = $self->{'fieldnamemap'}->{$mfield};
313	}
314	else {
315	$shortname = $self->create_shortname($mfield);
316	$self->{'fieldnamemap'}->{$mfield} = $shortname;
317	$self->{'fieldnamemap'}->{$shortname} = 1;
318	}
319	$self->{'allindexfields'}->{$mfield} = 1;
320	$new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
321	if ($self->{'allfields_index'}) {
322	$allfields_text .= "$mvalue ";
323	}
324
325	if (!defined $self->{'extraindexfields'}->{$mfield}) {
326	$self->{'extraindexfields'}->{$mfield} = 1;
327	}
328
329	}
330	# filter the text
331	$new_text = $self->filter_text ("metadata", $new_text);
332
333	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
334	$self->{'num_processed_bytes'} += length ($new_text);
335	$text .= "$new_text";
336	}
337	else {
338	# delete
339	$self->{'num_processed_bytes'} -= length ($new_text);
340	}
341	}
342
343	if ($self->{'allfields_index'}) {
344
345	my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
346	# filter the text
347	$new_text = $self->filter_text ("allfields", $new_text);
348
349	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
350	$self->{'num_processed_bytes'} += length ($new_text);
351	$text .= "$new_text";
352	}
353	else {
354	# delete
355	$self->{'num_processed_bytes'} -= length ($new_text);
356	}
357	}
358	# only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
359	if ($self->{'indexing_text'} && ($sec_tag_name ne "" \|\| $doc_section == 1 )) {
360	# add sort fields if there are any
361
362	foreach my $sfield (@{$self->{'sortfields'}}) {
363	# ignore special field rank
364	next if ($sfield eq "rank" \|\| $sfield eq "none");
365	my $sf_shortname;
366	if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
367	$sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
368	}
369	else {
370	$sf_shortname = $self->create_sortfield_shortname($sfield);
371	$self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
372	$self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
373	}
374	my @metadata_list = (); # put any metadata values in here
375	foreach my $submeta (split /,/, $sfield) {
376	$submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
377
378	my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
379	if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
380	if ($self->{'sections_sort_on_document_metadata'} eq "always" \|\| ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
381	push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
382	}
383	}
384	push (@metadata_list, @section_metadata);
385	}
386	my $new_text = "";
387	foreach my $item (@metadata_list) {
388	&ghtml::htmlsafe($item);
389	$new_text .= "$item";
390	}
391	if ($new_text =~ /\S/) {
392	$new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
393	# filter the text???
394	$text .= "$new_text"; # add it to the main text block
395	$self->{'actualsortfields'}->{$sfield} = 1;
396	}
397	}
398	}
399	$text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
400
401	$section = $doc_obj->get_next_section($section);
402	} # for each section
403
404	#open (TEXTOUT, ">text.out");
405	#print TEXTOUT "$text\n$documentendtag";
406	#close TEXTOUT;
407
408	print $lucenehandle "$text\n$documentendtag";
409
410	## if ($edit_mode eq "delete") {
411	## print STDERR "$text\n$documentendtag";
412	## }
413
414	}
415
416	sub text {
417	my $self = shift (@_);
418	my ($doc_obj,$file) = @_;
419
420	$self->textedit($doc_obj,$file,"add");
421	}
422
423	sub textreindex
424	{
425	my $self = shift (@_);
426	my ($doc_obj,$file) = @_;
427
428	$self->textedit($doc_obj,$file,"update");
429	}
430
431	sub textdelete
432	{
433	my $self = shift (@_);
434	my ($doc_obj,$file) = @_;
435
436	$self->textedit($doc_obj,$file,"delete");
437	}
438
439
440
441
442
443	# /** We make this builder pretend to be a document processor so we can get
444	# * information back from the plugins.
445	# *
446	# * @param $self A reference to this Lucene builder
447	# * @param $doc_obj A reference to a document object representing what was
448	# * parsed by the GAPlug
449	# * @param $file The name of the file parsed as a string
450	# *
451	# * @author John Thompson, DL Consulting Ltd
452	# */
453	sub process()
454	{
455	my $self = shift (@_);
456	my ($doc_obj, $file) = @_;
457
458	# If this is called from any stage other than an incremental infodb we want
459	# to pass through to the superclass of build
460	if ($self->get_mode() eq "incinfodb")
461	{
462	print STDERR "* Processing a document added using INCINFODB *\n" if ($self->{'verbosity'} > 3);
463	my ($archivedir) = $file =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
464	$archivedir = "" unless defined $archivedir;
465	$archivedir =~ s/\\/\//g;
466	$archivedir =~ s/^\/+//;
467	$archivedir =~ s/\/+$//;
468
469	# Number of files
470	print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);
471
472	# resolve the final filenames of the files associated with this document
473	$self->assoc_files ($doc_obj, $archivedir);
474
475	# is this a paged or a hierarchical document
476	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
477
478	# Determine the actual docnum by checking if we've processed any
479	# previous incrementally added documents. If so, carry on from there.
480	# Otherwise we set the counter to be the same as the number of
481	# sections encountered during the previous build
482	if ($self->{'numincdocs'} == 0)
483	{
484	$self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
485	}
486
487	my $section = $doc_obj->get_top_section ();
488	print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
489	my $doc_OID = $doc_obj->get_OID();
490	my $url = "";
491	while (defined $section)
492	{
493	print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);
494	# Attach all the other metadata to this document
495	# output the fact that this document is a document (unless doctype
496	# has been set to something else from within a plugin
497	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
498	if (!defined $dtype \|\| $dtype !~ /\w/)
499	{
500	#$doc_obj->add_utf8_metadata($section, "doctype", $dtype);
501	$doc_obj->add_utf8_metadata($section, "doctype", "doc");
502	}
503	# output whether this node contains text
504	if ($doc_obj->get_text_length($section) > 0)
505	{
506	$doc_obj->add_utf8_metadata($section, "hastxt", 1);
507	}
508	else
509	{
510	$doc_obj->add_utf8_metadata($section, "hastxt", 0);
511	}
512
513	# output archivedir if at top level
514	if ($section eq $doc_obj->get_top_section())
515	{
516	$doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
517	$doc_obj->add_utf8_metadata($section, "thistype", $thistype);
518	}
519
520	# output a list of children
521	my $children = $doc_obj->get_children ($section);
522	if (scalar(@$children) > 0)
523	{
524	$doc_obj->add_utf8_metadata($section, "childtype", $childtype);
525	my @contains = ();
526	foreach my $child (@$children)
527	{
528	if ($child =~ /^.*?\.(\d+)$/)
529	{
530	push (@contains, "\".$1");
531	}
532	else
533	{
534	push (@contains, "\".$child");
535	}
536	}
537	$doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
538	}
539	#output the matching doc number
540	print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
541	$doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});
542
543	$self->{'numincdocs'}++;
544	$section = $doc_obj->get_next_section($section);
545	# if no sections wanted, only add the docs
546	last if ($self->{'db_level'} eq "document");
547	}
548	print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
549	&IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
550	}
551	else
552	{
553	$self->mgppbuildproc::process(@_);
554	}
555	}
556	# / process() /
557
558
559	# Following methods seem to be no different to those defined in basebuildproc.pm
560	# From inspection, it looks like these ones can be removed
561
562
563	sub get_num_docs {
564	my $self = shift (@_);
565	#rint STDERR "get_num_docs(): $self->{'num_docs'}\n";
566	return $self->{'num_docs'};
567	}
568
569	sub get_num_sections {
570	my $self = shift (@_);
571	#rint STDERR "get_num_sections(): $self->{'num_sections'}\n";
572	return $self->{'num_sections'};
573	}
574
575	# num_bytes is the actual number of bytes in the collection
576	# this is normally the same as what's processed during text compression
577	sub get_num_bytes {
578	my $self = shift (@_);
579	#rint STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
580	return $self->{'num_bytes'};
581	}
582
583
584	# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
585	# Otherwise the removal of tags below might lead to Lucene turning
586	# "...farming</p>\n<p>EDWARD.." into "farmingedward"
587	# (example from demo collection b20cre)
588	# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
589	sub preprocess_text
590	{
591	my $self = shift (@_);
592	my ($text, $strip_html, $para) = @_;
593	# at this stage, we do not do paragraph tags unless have strip_html -
594	# it will result in a huge mess of non-xml
595	return unless $strip_html;
596
597	my $new_text = $text;
598
599	# if we have <pre> tags, we can have < > inside them, need to delete
600	# the <> before stripping tags
601	$new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;
602
603	if ($para eq "") {
604	# just remove all tags
605	$new_text =~ s/<[^>]*>/ /gs;
606	} else {
607	# strip all tags except <p> tags which get turned into $para
608	$new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
609	}
610
611	# It's important that we remove name entities because otherwise the text passed to Lucene for indexing
612	# may not be valid XML (eg. if HTML-only entities like   are used)
613	$new_text =~ s/&\w{1,10};//g;
614	# Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML)
615	$new_text =~ s/&([^\#])/ $1/g;
616
617	return $new_text;
618	}
619
620	sub delete_assoc_files
621	{
622	my $self = shift (@_);
623	my ($archivedir, $edit_mode) = @_;
624
625	$self->basebuildproc::delete_assoc_files(@_);
626
627	if ($edit_mode eq "delete") {
628	# if we are deleting the doc, then also delete the lucene text version
629	my $assoc_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"text", $archivedir);
630	if (-d $assoc_dir) {
631	&FileUtils::removeFilesRecursive($assoc_dir);
632	}
633	}
634	}
635
636	sub create_sortfield_shortname {
637	my $self = shift(@_);
638
639	my ($realname) = @_;
640
641	my $index_shortname;
642	# if we have created a shortname for an index on this field, then use it.
643	if (defined $self->{'fieldnamemap'}->{$realname}) {
644	$index_shortname = $self->{'fieldnamemap'}->{$realname};
645	} else {
646	$index_shortname = $self->create_shortname($realname);
647	}
648	return "by".$index_shortname;
649	}
650
651
652	1;
653
654

Note: See TracBrowser for help on using the repository browser.

Download in other formats: