Context Navigation

source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 24483

Last change on this file since 24483 was 24447, checked in by davidb, 13 years ago
Tidy up of code (removing commented out redundant code), plus tweaking of code that starts and stops jetty to cope with situation where the server is already running
File size: 15.9 KB

Line
1	###########################################################################
2	#
3	# solrbuildproc.pm -- perl wrapper for building index with Solr
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	package solrbuildproc;
27
28	# This document processor outputs a document for solr to process
29
30	# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31	# whose use was then extended to Lucene, Solr has its own XML syntax:
32	#
33	# http://wiki.apache.org/solr/UpdateXmlMessages
34	#
35	# Using this means we don't need to write SolrWrapper.jar, as had to be
36	# done for Lucene, translating the XML syntax piped to it into appropriate
37	# calls to the Lucene API
38
39
40	use lucenebuildproc;
41	use ghtml;
42	use strict;
43	no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46	use IncrementalBuildUtils;
47
# Set up inheritance at compile time: solrbuildproc is a lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor, forwarding every
# remaining argument unchanged, and then re-blesses the result into $class.
#
# Parameters:
#   $class - the class name to bless the new object into
#   @_     - passed straight through to lucenebuildproc->new
#
# Returns the newly blessed object.
sub new {
    my $class = shift @_;

    # Use an explicit class-method call rather than the indirect object
    # form ("new lucenebuildproc (...)"): the indirect form is parsed
    # heuristically and is ambiguous in a package that defines its own
    # sub called "new".
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
60
61	#----
62
# Registers index-field short names for one document.
#
# Walks every section of $doc_obj and makes sure each field named in
# $self->{'index'} has an entry in $self->{'indexfieldmap'} (field name ->
# short name, and short name -> 1).  When the index specification contains
# the pseudo-field "metadata", every metadata element found on a section is
# registered as well and noted in $self->{'indexfields'}.  When it contains
# "allfields", the fixed mapping allfields -> "ZZ" is recorded.  Nothing is
# written to any output handle.
#
# Parameters:
#   $doc_obj   - the document object; ignored unless its doc type is
#                "indexed_doc"
#   $file      - not used by this routine
#   $edit_mode - only add/update reach here and they are treated identically,
#                so the value is not inspected
#
# Returns nothing.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list is the same for every section, so parse it once.
    # An empty index specification yields no fields at all.
    my @index_fields = defined $fields ? split (/;/, $fields) : ();

    # has the user added a 'metadata' index?
    my $all_metadata_specified = scalar grep { $_ eq "metadata" } @index_fields;

    # do we have an allfields index??
    my $allfields_index = scalar grep { $_ eq "allfields" } @index_fields;

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();

  SECTION:
    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        next SECTION
            if (($indexed_doc == 0)
                || ($indexed_section ne "indexed_section"
                    && $indexed_section ne "indexed_doc"));

        # which fields have been explicitly named for this section?
        # (depends on the section because "top" fields only apply to the
        # first one)
        my $specified_fields = {};

      FIELD:
        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next FIELD if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled after the loop
            next FIELD if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        }

        if ($all_metadata_specified) {
            my $metadata = $doc_obj->get_all_metadata ($section);

          PAIR:
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next PAIR unless defined $mvalue && $mvalue ne "";

                # already covered by an explicitly named field
                next PAIR if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next PAIR if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next PAIR if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }
    }
    continue {
        # runs for skipped sections too, so the walk always advances
        $section = $doc_obj->get_next_section($section);
    }

    return;
}
192
# Registers the index field mappings for a document that is being added.
# Thin wrapper: delegates to index_field_mapping_edit in "add" mode and
# hands back whatever that call returns.
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# Registers the index field mappings for a document that is being
# reindexed.  Thin wrapper: delegates to index_field_mapping_edit in
# "update" mode and hands back whatever that call returns.
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# Counterpart of index_field_mapping for a document that is being deleted.
# Deliberately a no-op: the arguments are accepted and ignored.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
215
216
217	#----
218
# Writes the Solr update XML for one document to $self->{'output_handle'}.
#
# When $self->get_indexing_text() is false (text-compress mode) the call is
# handed to the super-class unchanged.  Otherwise the document is emitted
# either as a single Solr message (document level) or as one message per
# section (when $self->{'levels'}->{'section'} is set).
#
# Parameters:
#   $doc_obj   - the document object; nothing is written unless its doc
#                type is "indexed_doc"
#   $file      - not used here; forwarded to the super-class in
#                text-compress mode
#   $edit_mode - "add", "update" or "delete"
#
# "update" is near identical to "add" as we use Solr in its default mode of
# replacing an existing document if the new document has the same doc id.
# The difference is that "update" does not touch the statistics for number
# of documents or number of bytes processed.  The latter is inaccurate, but
# considered better than allowing the value to steadily climb.  "delete"
# emits only the <delete><id>..</id></delete> wrapper and subtracts from the
# statistics.
#
# Side effects: updates num_docs, num_sections, num_bytes,
# num_processed_bytes, indexfieldmap and indexfields on $self.
#
# Returns the result of the final print (or of the super-class call).
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list is the same for every section, so parse it once.
    my @index_fields = defined $fields ? split (/;/, $fields) : ();

    # has the user added a 'metadata' index?
    my $all_metadata_specified = scalar grep { $_ eq "metadata" } @index_fields;

    # do we have an allfields index??
    my $allfields_index = scalar grep { $_ eq "allfields" } @index_fields;

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    # Builds the (opening, closing) XML wrapper for the given id.  Only a
    # genuine "delete" gets the <delete> form; "update" must use <add> just
    # like "add", otherwise the field content would end up inside a
    # <delete> element.
    my $wrapper_for = sub {
        my ($oid) = @_;

        if ($edit_mode eq "delete") {
            return (" <delete>\n" . " <id>$oid</id>\n",
                    " </delete>\n");
        }

        return (" <add>\n" . " <doc>\n" . " <field name=\"docOID\">$oid</field>\n",
                " </doc>\n" . " </add>\n");
    };

    my ($start_doc, $end_doc) = $wrapper_for->($gs2_docOID);

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }
    my $by_section = ($sec_tag_name ne "");

    # the document wrapper is only output when working at doc level
    my $text = $by_section ? "" : $start_doc;

    # Adds one generated chunk to the output (add/update) and keeps the
    # processed-bytes statistic in step (add/delete).
    my $record_chunk = sub {
        my ($chunk) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= $chunk;
        }
        elsif ($edit_mode eq "update") {
            $text .= $chunk;
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
        return;
    };

    my $doc_section = 0; # just for this document

    # get the text for this document
    my $section = $doc_obj->get_top_section();

  SECTION:
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = $wrapper_for->($sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($by_section) {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            next SECTION;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($by_section);

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

      FIELD:
        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next FIELD if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next FIELD if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing found for this index in this section
            next FIELD unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";

            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }

            foreach my $item (@metadata_list) {
                &ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
            }
            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $record_chunk->($new_text);

            if ($self->{'indexing_text'} && $new_field) {
                # we need to add to the list in indexfields
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {
            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);

          PAIR:
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next PAIR unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next PAIR if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next PAIR if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next PAIR if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname = $self->{'indexfieldmap'}->{$mfield};
                if (!defined $shortname) {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }

            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $record_chunk->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $record_chunk->($new_text);
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($by_section);
    }
    continue {
        # runs for skipped sections too, so the walk always advances
        $section = $doc_obj->get_next_section($section);
    }

    # only output if working with doc level
    $text .= $end_doc if (!$by_section);

    ## $text .= "<commit/>\n";

    print {$solrhandle} $text;
}
563
564
565
566
# Re-emits a document that is being reindexed.  Thin wrapper: delegates to
# textedit in "update" mode and hands back whatever that call returns.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
574
575
576	1;
577
578

Note: See TracBrowser for help on using the repository browser.

Download in other formats: