Context Navigation

source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 25846

Last change on this file since 25846 was 25846, checked in by sjm84, 12 years ago
Some fixes and additions to the Solr perl code
File size: 16.1 KB

Line
1	###########################################################################
2	#
3	# solrbuildproc.pm -- perl wrapper for building index with Solr
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package solrbuildproc;
27
28	# This document processor outputs a document for solr to process
29
30	# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31	# whose use was then extended to Lucene, Solr has its own XML syntax:
32	#
33	# http://wiki.apache.org/solr/UpdateXmlMessages
34	#
35	# Using this means we don't need to write SolrWrapper.jar, as had to be
36	# done for Lucene, translating the XML syntax piped to it into appropriate
37	# calls to the Lucene API
38
39
40	use lucenebuildproc;
41	use ghtml;
42	use strict;
43	no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46	use IncrementalBuildUtils;
47
# Establish inheritance at compile time: solrbuildproc derives from
# lucenebuildproc, so any method not overridden here resolves there.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor (all arguments
# after the class name are forwarded unchanged) and then re-blesses the
# result into the requested class.
#
# Params:  $class - class name to bless into; remaining args go to
#                   lucenebuildproc->new
# Returns: the blessed build-processor object
sub new {
    my $class = shift @_;

    # Use an explicit class-method call rather than indirect object
    # syntax ("new lucenebuildproc (...)"), which Perl can mis-parse
    # when a sub named 'new' or 'lucenebuildproc' is in scope.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
60
61	#----
62
# Populate $self->{'indexfieldmap'} (and $self->{'indexfields'}) with the
# short names of every index field that applies to this document, without
# emitting any output.
#
# Params:
#   $doc_obj   - document object; this sub calls get_doc_type,
#                get_top_section, get_next_section, get_metadata_element
#                and get_all_metadata on it
#   $file      - not used here
#   $edit_mode - not used here; only add/update reach this sub and they
#                are currently treated the same
#
# Side effects: for each index field specified in $self->{'index'} (and,
# when a 'metadata' index is specified, for each eligible metadata field
# found in an indexed section) records
#   indexfieldmap{field} = shortname and indexfieldmap{shortname} = 1.
# When an 'allfields' index is specified, records the fixed short name "ZZ".
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The index specification does not change from section to section,
    # so parse it once up front.
    my @index_fields = split (/;/, $fields);

    # do we have an allfields index, and has the user added a
    # 'metadata' index?
    my $allfields_index        = 0;
    my $all_metadata_specified = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    # Walk every section; the loop header advances to the next section,
    # including when a section is skipped with 'next'.
    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
              || "indexed_section";

        next if (($indexed_doc == 0)
                 || ($indexed_section ne "indexed_section"
                     && $indexed_section ne "indexed_doc"));

        # which fields have already been indexed?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled after this loop
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname}  = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {
            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                # register a short name the first time this field is seen
                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield}    = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"}        = 1;
        }

    } # for each section
}
192
# Record the index-field mappings for a document that is being added.
# Delegates to index_field_mapping_edit with edit mode "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# Record the index-field mappings for a document that is being reindexed.
# Delegates to index_field_mapping_edit with edit mode "update".
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# Deleting a document requires no change to the index-field mappings,
# so this is deliberately a no-op. The arguments are accepted but unused.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
215
216
217	#----
218
# Emit the Solr update XML for one document and maintain build statistics.
#
# Params:
#   $doc_obj   - document object; this sub calls get_doc_type, get_OID,
#                get_top_section, get_next_section, get_metadata_element,
#                get_text_length, get_text, get_metadata and
#                get_all_metadata on it
#   $file      - not used here except when forwarded to the super-class
#   $edit_mode - "add", "update" or "delete"
#
# Behaviour:
#   * When get_indexing_text() is false (text-compress mode) the whole
#     call is handed to SUPER::textedit and its result returned.
#   * Otherwise the generated XML is printed to $self->{'output_handle'}.
#     When a 'section' level is configured, one wrapper element is written
#     per section (docOID is "<doc OID>.<section>"); otherwise a single
#     wrapper is written for the whole document.
#   * "add" and "update" both write <add><doc>..</doc></add>, relying on
#     Solr replacing an existing document that has the same id. "delete"
#     writes <delete><id>..</id></delete> with no field content.
#   * Statistics: num_docs, num_bytes and num_processed_bytes are
#     increased for "add", decreased for "delete" and left untouched for
#     "update" (inaccurate for updates, but considered better than letting
#     the values climb steadily). num_sections is incremented for every
#     section visited, in every mode.
#   * New short names are recorded in $self->{'indexfieldmap'} and
#     $self->{'indexfields'} as fields are encountered.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        #    done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # Text-compress mode has already returned above, so no further
    # indexing-text check is needed before handling a delete.

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The index specification is the same for every section: parse once.
    my @index_fields = split (/;/, $fields);

    # do we have an allfields index, and has the user added a
    # 'metadata' index?
    my $allfields_index        = 0;
    my $all_metadata_specified = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $levels     = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    # Build the (opening, closing) XML wrapper for a given OID.
    # "update" shares the <add> wrapper with "add": field content is
    # emitted for updates, and <field> elements inside <delete> would not
    # be a valid Solr update message. Any other mode gets <delete>.
    my $wrappers_for = sub {
        my ($oid) = @_;

        if ($edit_mode eq "add" || $edit_mode eq "update") {
            my $open = " <add>\n"
                     . " <doc>\n"
                     . " <field name=\"docOID\">$oid</field>\n";
            my $close = " </doc>\n"
                      . " </add>\n";
            return ($open, $close);
        }

        my $open = " <delete>\n"
                 . " <id>$oid</id>\n";
        return ($open, " </delete>\n");
    };

    my ($start_doc, $end_doc) = $wrappers_for->($gs2_docOID);

    # add/update, delete

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    # the document-level wrapper is only output if working with doc level
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    # Append a generated chunk to the output and keep the processed-byte
    # count in step: add counts and emits, update only emits, delete only
    # un-counts.
    my $emit = sub {
        my ($chunk) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= $chunk;
        }
        elsif ($edit_mode eq "update") {
            $text .= $chunk;
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # get the text for this document; the loop header advances to the
    # next section, including when a section is skipped with 'next'
    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = $wrappers_for->($sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
              || "indexed_section";
        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section"
                && $indexed_section ne "indexed_doc"))
        {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled after this loop
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text  = ""; # put the text in here

            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};

                    # Non-top sections may also pick up the top section's
                    # values for this element, depending on the
                    # 'sections_index_document_metadata' setting.
                    if ($section ne $doc_obj->get_top_section()
                        && $self->{'indexing_text'}
                        && defined ($self->{'sections_index_document_metadata'}))
                    {
                        if ($self->{'sections_index_document_metadata'} eq "always"
                            || (scalar(@section_metadata) == 0
                                && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists"))
                        {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing to output for this index in this section
            next unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";

            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }

            foreach my $item (@metadata_list) {
                # htmlsafe is called for its effect on $item
                &ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            # Remove any leading or trailing white space
            $new_text =~ s/\s+$//;
            $new_text =~ s/^\s+//;

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
            }

            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $emit->($new_text);

            if ($self->{'indexing_text'} && $new_field) {
                # we need to add to the list in indexfields
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname}  = 1;
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text  = "";
            my $shortname = "";
            my $metadata  = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield}    = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }

            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"}        = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";

            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

    } # for each section

    # the document-level wrapper is only output if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

    ## $text .= "<commit/>\n";

    print {$solrhandle} $text;
}
571
572
573
574
# Reindex a document: delegates to textedit with edit mode "update".
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
582
583
584	1;
585
586

Note: See TracBrowser for help on using the repository browser.

Download in other formats: