Context Navigation

solrbuildproc.pm@ 29945

Last change on this file since 29945 was 29945, checked in by ak19, 9 years ago

Incremental building now works (again) for solr. The changes were 1. deleting a document should only contain the delete xml tags and an inner tag of the doc id, not the entire text and metadata as is necessary when adding a document. 2. Reindexing in solrbuildproc used to call textedit with the mode set to update, but update is not implemented in solrbuildproc, instead it now calls textedit first with the mode set to delete and then with the mode set to add.

File size: 19.3 KB

Line
1	###########################################################################
2	#
3	# solrbuildproc.pm -- perl wrapper for building index with Solr
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package solrbuildproc;
27
28	# This document processor outputs a document for solr to process
29
30	# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31	# whose use was then extended to Lucene, Solr has its own XML syntax:
32	#
33	# http://wiki.apache.org/solr/UpdateXmlMessages
34	#
35	# Using this means we don't need to write SolrWrapper.jar, as had to be
36	# done for Lucene, translating the XML syntax piped to it into appropriate
37	# calls to the Lucene API
38
39
40	use lucenebuildproc;
41	use ghtml;
42	use strict;
43	no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46	use IncrementalBuildUtils;
47
# Compile-time setup: declare lucenebuildproc as the parent class of
# solrbuildproc so that method lookup falls back to it.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor, forwarding every
# argument after the class name unchanged, and then re-blesses the result
# into the requested class.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Use an explicit class-method call; the indirect-object form
    # ("new lucenebuildproc (...)") is parsed ambiguously by perl.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
# Store the list of facet fields on the object.
#
# Parameters:
#   $facetfields - reference to an array of field names (may be undef,
#                  which is treated as an empty list)
#
# The names "text", "allfields" and "metadata" are only valid as index
# names, not as facet fields, so they are dropped.  Any previously stored
# list is replaced; the stored value is always an array reference, even
# when no field survives the filter.
sub set_facetfields {
    my $self = shift (@_);
    my ($facetfields) = @_;

    # An empty array ref, not "()": assigning an empty list to a scalar
    # hash slot would store undef.
    $self->{'facetfields'} = [
        grep { $_ !~ /^(text|allfields|metadata)$/ } @{ $facetfields || [] }
    ];

    return;
}
72
73	#----
74
# Record index-field name mappings for one document without emitting any
# output.
#
# Parameters:
#   $doc_obj   - the document object to inspect
#   $file      - accepted for interface compatibility; not used here
#   $edit_mode - "add" or "update"; the two modes are not distinguished
#
# For every section of the document that is to be indexed this fills in:
#   $self->{'indexfieldmap'} - field name => shortname, and shortname => 1,
#                              for each field named in the index
#                              specification and, when a "metadata" index
#                              was requested, for each remaining metadata
#                              field of the section; "allfields" maps to
#                              "ZZ" when an "allfields" index was requested
#   $self->{'indexfields'}   - metadata field name => 1 for fields picked
#                              up through the "metadata" index
#
# Documents whose type is not "indexed_doc" are ignored.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @field_list = defined $fields ? split (/;/, $fields) : ();

    # These flags depend only on the index specification, so work them out
    # once rather than once per section.
    my $allfields_index        = 0; # do we have an allfields index?
    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # which fields have already been indexed?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@field_list) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled after the loop
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        }

        if ($all_metadata_specified) {
            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);
    }

    return;
}
204
# Record index-field mappings for a document that is being added.
# Thin wrapper around index_field_mapping_edit with the edit mode "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
211
# Record index-field mappings for a document that is being reindexed.
# Thin wrapper around index_field_mapping_edit with the edit mode "update".
sub index_field_mappingreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
219
# Field-mapping hook for a document that is being deleted.
# Deliberately a no-op: nothing is recorded for a deletion.
sub index_field_mappingdelete
{
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
227
228
229	#----
230
# Write the Solr update XML for one document to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj   - the document object to process
#   $file      - passed through to the super-class in text-compress mode;
#                otherwise unused
#   $edit_mode - "add" emits <add><doc>...</doc></add> message(s);
#                any other value emits a single <delete> message holding
#                only the document's OID
#
# Behaviour:
#   * When get_indexing_text() is false (text-compress mode) the whole job
#     is delegated to the super-class textedit.
#   * Documents whose type is not "indexed_doc" are skipped.
#   * "add" increments $self->{'num_docs'}, "delete" decrements it.
#   * In "add" mode, with section-level indexing each section becomes its
#     own <add><doc> whose docOID is "<OID>.<section>"; otherwise the whole
#     document is a single <add><doc>.  num_sections, num_bytes and
#     num_processed_bytes are increased as sections and fields are written.
#
# NOTE(review): "update" is not implemented here -- it takes the same path
# as "delete".  To reindex a document, call textreindex, which issues a
# "delete" followed by an "add".
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @field_list = defined $fields ? split (/;/, $fields) : ();

    my $levels     = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    if ($edit_mode ne "add") {
        # for delete mode, we need to specify just the docOID to delete and
        # we're done
        print {$solrhandle} " <delete>\n"
            . " <id>$gs2_docOID</id>\n"
            . " </delete>\n";
        return;
    }

    # ---- everything below runs in "add" mode only ----

    my $start_doc = " <add>\n"
        . " <doc>\n"
        . " <field name=\"docOID\">$gs2_docOID</field>\n";
    my $end_doc = " </doc>\n"
        . " </add>\n";

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    # true when each section is written as its own Solr document
    my $by_section = ($sec_tag_name ne "");

    # These flags depend only on the index specification, so work them out
    # once rather than once per section.
    my $allfields_index        = 0; # do we have an allfields index?
    my $all_metadata_specified = 0; # has the user added a 'metadata' index?
    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    # the document-level wrapper is only output when working at doc level
    my $text = $by_section ? "" : $start_doc;

    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my $start_sec = " <add>\n"
            . " <doc>\n"
            . " <field name=\"docOID\">$sec_gs2_docOID</field>\n";
        my $end_sec = " </doc>\n"
            . " </add>\n";

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";
        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($by_section) {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($by_section);

        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@field_list) {
            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled after the loop
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma
            # separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text  = ""; # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section()
                        && $self->{'indexing_text'}
                        && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always"
                            || (scalar(@section_metadata) == 0
                                && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                # Remove any leading or trailing white space
                $new_text =~ s/\s+$//;
                $new_text =~ s/^\s+//;

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $self->{'num_processed_bytes'} += length ($new_text);
                $text .= "$new_text";

                if ($self->{'indexing_text'} && $new_field) {
                    # remember the shortname chosen for this new field
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }
        } # foreach field

        if ($all_metadata_specified) {
            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname = "";
                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        }

        # only add sort fields for this section if we are indexing this
        # section, we are doing section level indexing or this is the top
        # section
        if ($self->{'indexing_text'} && ($by_section || $doc_section == 1)) {
            # add sort fields if there are any
            my $seenfields = {};
            foreach my $sfield (@{$self->{'sortfields'}}, @{$self->{'facetfields'}}) {
                # ignore special field rank/none
                next if $sfield eq "rank" || $sfield eq "none";
                # ignore any we have already done - we may have duplicates
                # in the sort and facet lists
                next if (defined $seenfields->{$sfield});
                $seenfields->{$sfield} = 1;

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    # strip off ex. iff it's the only metadata set prefix
                    # (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section()
                        && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always"
                            || (scalar(@section_metadata) == 0
                                && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }
                if ($new_text =~ /\S/) {
                    $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
                    # filter the text???
                    $text .= "$new_text"; # add it to the main text block
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($by_section);

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # the document-level wrapper is only output when working at doc level
    $text .= $end_doc if (!$by_section);

    print {$solrhandle} $text;
}
648
649
650
651
# Reindex a document.
#
# solrbuildproc has no "update" edit mode, so a reindex is expressed as
# two textedit calls on the same document: a "delete" followed by an "add".
# The result of the "add" call is returned.
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    # remove the existing entry first ...
    $self->textedit($doc_obj, $file, "delete");

    # ... then put the current version of the document back in
    return $self->textedit($doc_obj, $file, "add");
}
663
664
665	1;
666
667

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 29945

Download in other formats: