Context Navigation

source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 27815

Last change on this file since 27815 was 27815, checked in by kjdon, 11 years ago
adding in facets
File size: 18.7 KB

Line
1	###########################################################################
2	#
3	# solrbuildproc.pm -- perl wrapper for building index with Solr
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package solrbuildproc;
27
28	# This document processor outputs a document for solr to process
29
30	# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31	# whose use was then extended to Lucene, Solr has its own XML syntax:
32	#
33	# http://wiki.apache.org/solr/UpdateXmlMessages
34	#
35	# Using this means we don't need to write SolrWrapper.jar, as had to be
36	# done for Lucene, translating the XML syntax piped to it into appropriate
37	# calls to the Lucene API
38
39
40	use lucenebuildproc;
41	use ghtml;
42	use strict;
43	no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46	use IncrementalBuildUtils;
47
# Establish the inheritance chain at compile time: this build processor
# specialises lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object through the lucenebuildproc constructor, forwarding all
# remaining arguments untouched, and re-blesses the result into $class.
#
# Uses the explicit Class->method form rather than indirect-object syntax
# ("new lucenebuildproc (...)"), which Perl can mis-parse as a call to the
# "new" sub being defined in this very package.
sub new {
    my $class = shift @_;

    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
# Record the list of fields to facet on.
#
# $facetfields is a reference to an array of field names.  The special names
# "text", "allfields" and "metadata" are only meaningful as indexes, not as
# facets, so they are dropped.  The surviving names are stored, in their
# original order, as an array reference in $self->{'facetfields'}.
#
# The slot is always set to an array reference (possibly empty), even when
# no names survive or when $facetfields is undefined.
sub set_facetfields {
    my $self = shift (@_);

    my ($facetfields) = @_;

    # Note: assigning "()" here would store undef, not an empty list.
    $self->{'facetfields'} = [];

    foreach my $s (@{ $facetfields || [] }) {
        if ($s !~ /^(text|allfields|metadata)$/) {
            push (@{$self->{'facetfields'}}, $s);
        }
    }
}
72
73	#----
74
# Register index-field name mappings for one document without emitting any
# text.
#
# Parameters:
#   $doc_obj   - document object; only documents whose get_doc_type() is
#                "indexed_doc" are examined
#   $file      - source file name (unused here)
#   $edit_mode - "add" or "update"; the two are currently treated the same
#
# For every indexed section this walks the fields named in $self->{'index'}
# (the part before the first ':', split on ';') and makes sure each has an
# entry in $self->{'indexfieldmap'} (field => shortname, shortname => 1).
# If the special "metadata" index is present, every remaining metadata
# field of the section is mapped as well and noted in $self->{'indexfields'}.
# If the special "allfields" index is present it is mapped to "ZZ".
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    # Only add/update gets to here
    # Currently there is no need to distinguish between these edit modes

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    # The index specification does not change from section to section, so
    # look for the two special indexes once, up front.
    my $allfields_index = 0;         # do we have an allfields index?
    my $all_metadata_specified = 0;  # has the user added a 'metadata' index?
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # which fields have already been indexed?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {

            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);

    } # while defined section

    return;
}
204
# Register index-field mappings for a newly added document.
# Thin wrapper: delegates to index_field_mapping_edit in "add" mode.
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
211
# Register index-field mappings for a document that is being re-indexed.
# Thin wrapper: delegates to index_field_mapping_edit in "update" mode.
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
219
# Deleting a document never introduces new index fields, so there are no
# mappings to register: this is deliberately a no-op.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    return; # nothing to be done
}
227
228
229	#----
230
# Build the opening and closing XML of one Solr update message for the
# document (or section) identified by $oid.
#
# When $emits_doc is true the message is an <add><doc> carrying the docOID
# field; otherwise it is a <delete> carrying the id.
# Returns the list ($start, $end).
sub _solr_message_wrapper {
    my ($emits_doc, $oid) = @_;

    if ($emits_doc) {
        my $start = " <add>\n";
        $start .= " <doc>\n";
        $start .= " <field name=\"docOID\">$oid</field>\n";

        my $end = " </doc>\n";
        $end .= " </add>\n";

        return ($start, $end);
    }

    my $start = " <delete>\n";
    $start .= " <id>$oid</id>\n";

    return ($start, " </delete>\n");
}

# Emit the Solr update XML for one document and print it to
# $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj   - document object to process
#   $file      - source file name (only forwarded to the super-class)
#   $edit_mode - "add", "update" or "delete"
#
# When get_indexing_text() is false (text-compress mode) the work is handed
# to the super-class unchanged.  Documents whose get_doc_type() is not
# "indexed_doc" are skipped.
#
# "add" and "update" both produce <add><doc>...</doc></add> messages: Solr
# is used in its default mode of replacing an existing document when a new
# one arrives with the same doc id.  The only difference is that "update"
# leaves the num_docs / num_bytes / num_processed_bytes statistics alone;
# that is inaccurate, but considered better than letting the values climb
# steadily.  "delete" produces <delete><id>...</id></delete> messages that
# carry no fields, and decrements those statistics.
#
# If section-level indexing is configured ($self->{'levels'}->{'section'})
# one message is produced per section; otherwise a single message covers the
# whole document.
#
# Side effects besides the output: updates num_docs, num_sections,
# num_bytes, num_processed_bytes, indexfieldmap, indexfields,
# sortfieldnamemap and actualsortfields on $self.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # add and update both send document content; anything else is sent as
    # a bare delete
    my $emits_doc = ($edit_mode eq "add" || $edit_mode eq "update");

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    # The index specification is the same for every section, so look for
    # the two special indexes once, up front.
    my $allfields_index = 0;         # do we have an allfields index?
    my $all_metadata_specified = 0;  # has the user added a 'metadata' index?
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    my ($start_doc, $end_doc) = _solr_message_wrapper($emits_doc, $gs2_docOID);

    # non-empty only when indexing at the section level
    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    # the document-level wrapper is only used when working at doc level
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = _solr_message_wrapper($emits_doc, $sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                # Remove any leading or trailing white space
                $new_text =~ s/\s+$//;
                $new_text =~ s/^\s+//;

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                if ($edit_mode eq "add") {
                    $self->{'num_processed_bytes'} += length ($new_text);
                    $text .= "$new_text";
                }
                elsif ($edit_mode eq "update") {
                    $text .= "$new_text";
                }
                elsif ($edit_mode eq "delete") {
                    $self->{'num_processed_bytes'} -= length ($new_text);
                }

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            if ($edit_mode eq "add") {
                $self->{'num_processed_bytes'} += length ($new_text);
                $text .= "$new_text";
            }
            elsif ($edit_mode eq "update") {
                $text .= "$new_text";
            }
            elsif ($edit_mode eq "delete") {
                $self->{'num_processed_bytes'} -= length ($new_text);
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            if ($edit_mode eq "add") {
                $self->{'num_processed_bytes'} += length ($new_text);
                $text .= "$new_text";
            }
            elsif ($edit_mode eq "update") {
                $text .= "$new_text";
            }
            elsif ($edit_mode eq "delete") {
                $self->{'num_processed_bytes'} -= length ($new_text);
            }
        }

        # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
        if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
            # add sort fields if there are any
            my $seenfields = {};
            foreach my $sfield (@{$self->{'sortfields'}}, @{$self->{'facetfields'}}) {
                # ignore special field rank
                next if $sfield eq "rank";
                # ignore any we have already done - we may have duplicates in the sort and facet lists
                next if (defined $seenfields->{$sfield});
                $seenfields->{$sfield} = 1;

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    ghtml::htmlsafe($item);
                    $new_text .= "$item";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
                    # A <delete> message may not contain <field> elements,
                    # so only add/update get the sort text.
                    $text .= "$new_text" if $emits_doc;
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only output if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

    ## $text .= "<commit/>\n";

    # NOTE: a leftover debugging aid used to dump $text to "text.out" in the
    # current directory for every document (unchecked, overwriting each
    # time).  It has been removed; print $text to a handle of your choice
    # here if the generated XML needs inspecting.

    print {$solrhandle} $text;
}
635
636
637
638
# Re-index a document that already exists in the index.
# Thin wrapper: delegates to textedit in "update" mode.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
646
647
648	1;
649
650

Note: See TracBrowser for help on using the repository browser.

Download in other formats: