Context Navigation

source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 28127

Last change on this file since 28127 was 28127, checked in by davidb, 11 years ago
Commenting out code that looks like it was meant only for debugging
File size: 18.7 KB

Line
1	###########################################################################
2	#
3	# solrbuildproc.pm -- perl wrapper for building index with Solr
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package solrbuildproc;
27
28	# This document processor outputs a document for solr to process
29
30	# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31	# whose use was then extended to Lucene, Solr has its own XML syntax:
32	#
33	# http://wiki.apache.org/solr/UpdateXmlMessages
34	#
35	# Using this means we don't need to write SolrWrapper.jar, as had to be
36	# done for Lucene, translating the XML syntax piped to it into appropriate
37	# calls to the Lucene API
38
39
40	use lucenebuildproc;
41	use ghtml;
42	use strict;
43	no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46	use IncrementalBuildUtils;
47
# Establish the inheritance chain at compile time: solrbuildproc derives
# from lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor, forwarding every
# argument after the class name unchanged, then re-blesses the result into
# the requested class.
#
# Returns the blessed object.
sub new {
    my $class = shift @_;

    # Explicit class-method call: the indirect-object form
    # ("new lucenebuildproc (...)") relies on parser heuristics, which is
    # especially fragile inside a sub that is itself called "new".
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
# Record the list of facet fields on this object.
#
#   $facetfields - reference to an array of field names (undef is treated
#                  as an empty list)
#
# The names "text", "allfields" and "metadata" are only valid as indexes,
# not as facet fields, so they are dropped.  $self->{'facetfields'} is
# always left holding an array reference, even when nothing survives the
# filter.
sub set_facetfields {
    my $self = shift (@_);

    my ($facetfields) = @_;

    # Start from a real (empty) array reference; assigning an empty list
    # to a scalar slot would store undef instead.
    $self->{'facetfields'} = [];

    foreach my $s (@{ $facetfields || [] }) {
        # Alternation between the three reserved names
        if ($s !~ /^(?:text|allfields|metadata)$/) {
            push (@{$self->{'facetfields'}}, $s);
        }
    }
}
72
73	#----
74
# Walk the sections of a document and make sure every field that will be
# indexed has an entry in $self->{'indexfieldmap'} (field name -> short
# name, and short name -> 1).  No output is produced here.
#
#   $doc_obj   - document object to examine
#   $file      - not used by this routine
#   $edit_mode - "add" or "update"; the two are currently treated alike
#
# Documents whose doc type is not "indexed_doc" are ignored, as are
# sections excluded by the subcollection test or by their gsdldoctype
# metadata.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    # Only add/update gets to here
    # Currently there is no need to distinguish between these edit modes

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    # The index specification does not change from section to section, so
    # work out once whether the special 'allfields' and 'metadata' indexes
    # were requested.
    my $allfields_index = 0;
    my $all_metadata_specified = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # which fields have been explicitly named for this section?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled separately below
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {

            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # already covered by an explicitly named field
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);

    } # while defined section
}
204
# Register index field mappings for a document that is being added.
# Thin wrapper: forwards to index_field_mapping_edit with edit mode "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
211
# Register index field mappings for a document that is being reindexed.
# Thin wrapper: forwards to index_field_mapping_edit with edit mode "update".
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
219
# Counterpart of index_field_mapping for a document that is being deleted.
# Deliberately a no-op: the arguments are accepted and ignored.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
227
228
229	#----
230
# Build the opening and closing markup of one Solr update message.
#
#   $edit_mode - "add", "update" or anything else (treated as a delete)
#   $oid       - identifier placed in the docOID field / <id> element
#
# "update" produces the same <add> wrapper as "add": Solr is used in its
# default mode of replacing an existing document that has the same id.
#
# Returns the two-element list ($start_markup, $end_markup).
sub _solr_update_wrappers {
    my ($edit_mode, $oid) = @_;

    if ($edit_mode eq "add" || $edit_mode eq "update") {
        my $start = " <add>\n";
        $start .= " <doc>\n";
        $start .= " <field name=\"docOID\">$oid</field>\n";

        my $end = " </doc>\n";
        $end .= " </add>\n";

        return ($start, $end);
    }

    my $start = " <delete>\n";
    $start .= " <id>$oid</id>\n";

    return ($start, " </delete>\n");
}

# Generate the Solr update XML for one document and print it to
# $self->{'output_handle'}.
#
#   $doc_obj   - document object to process
#   $file      - not used by this routine (passed on to the superclass in
#                text-compress mode)
#   $edit_mode - "add", "update" or "delete"
#
# When get_indexing_text() is false (text-compress mode) the work is
# delegated entirely to the superclass implementation.
#
# "update" is near identical to "add", as Solr replaces an existing
# document if the new one has the same doc id.  The main difference is
# that "update" leaves the statistics (documents and bytes processed)
# untouched; this is inaccurate, but considered better than letting the
# values steadily climb.  "delete" emits only the <delete>/<id> markup and
# decrements the statistics.
#
# As a side effect the routine also maintains 'num_sections',
# 'indexfieldmap', 'indexfields', 'sortfieldnamemap' and
# 'actualsortfields' on $self.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    # The index specification is the same for every section, so decide
    # once whether the special 'allfields' and 'metadata' indexes exist.
    my $allfields_index = 0;
    my $all_metadata_specified = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    my ($start_doc, $end_doc) = _solr_update_wrappers($edit_mode, $gs2_docOID);

    # A non-empty section tag name means we are indexing at section level:
    # each section then gets its own update message instead of one message
    # for the whole document.
    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    # only open the document-level message if working with doc level
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    # Fold one chunk of generated field markup into the output text and
    # the processed-bytes statistic, according to the edit mode.
    my $emit = sub {
        my ($chunk) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= $chunk;
        }
        elsif ($edit_mode eq "update") {
            $text .= $chunk;
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = _solr_update_wrappers($edit_mode, $sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};

                    # For sub-sections, optionally pull in the metadata of
                    # the top section as well, depending on the
                    # 'sections_index_document_metadata' setting.
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                # Remove any leading or trailing white space
                $new_text =~ s/\s+$//;
                $new_text =~ s/^\s+//;

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $emit->($new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
        if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
            # add sort fields if there are any
            my $seenfields = {};
            foreach my $sfield (@{$self->{'sortfields'}}, @{$self->{'facetfields'}}) {
                # ignore special field rank/none
                next if $sfield eq "rank" || $sfield eq "none";
                # ignore any we have already done - we may have duplicates in the sort and facet lists
                next if (defined $seenfields->{$sfield});
                $seenfields->{$sfield} = 1;

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    $submeta =~ s/^ex\.([^.]+)$/$1/; # strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
                    # filter the text???
                    # A delete message carries only the <id>, so field
                    # markup is added for add/update only, in line with
                    # the index fields above.
                    $text .= "$new_text" if ($edit_mode ne "delete");
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only close the document-level message if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

    ## $text .= "<commit/>\n";

    print {$solrhandle} $text;
}
635
636
637
638
# Produce the Solr output for a document that is being reindexed.
# Thin wrapper: forwards to textedit with edit mode "update".
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
646
647
648	1;
649
650

Note: See TracBrowser for help on using the repository browser.

Download in other formats: