source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 25846

Last change on this file since 25846 was 25846, checked in by sjm84, 12 years ago

Some fixes and additions to the Solr perl code

File size: 16.1 KB
Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33# http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Set up inheritance at compile time: solrbuildproc derives from
# lucenebuildproc, so any method not overridden in this package is
# looked up there.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# All arguments after the class name are passed unchanged to the
# lucenebuildproc constructor; the object it returns is then re-blessed
# into $class so that the overrides in this package take effect.
sub new {
    my $class = shift @_;

    # Use explicit method-call syntax.  The indirect-object form
    # ("new lucenebuildproc (...)") is parsed by heuristic and can be
    # misread as a call to this package's own new().
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
60
61#----
62
# index_field_mapping_edit($doc_obj, $file, $edit_mode)
#
# Walks every section of $doc_obj and records, in $self->{'indexfieldmap'},
# a short name for each index field the document contributes to.  No text
# is output here; only the mapping tables are updated:
#
#   $self->{'indexfieldmap'}->{<field>}     = <shortname>
#   $self->{'indexfieldmap'}->{<shortname>} = 1
#   $self->{'indexfields'}->{<metadata field>} = 1   ('metadata' index only)
#
# Only add/update gets to here, and currently there is no need to
# distinguish between these edit modes, so $edit_mode (like $file) is
# accepted but not consulted.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two special-index flags are the same for
    # every section, so work them out once, before the section loop.
    my @index_fields = split (/;/, $fields);

    my $allfields_index        = 0;   # do we have an 'allfields' index?
    my $all_metadata_specified = 0;   # has the user added a 'metadata' index?
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # which fields have been explicitly named for this section?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled after this loop
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {
            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # already covered by an explicitly named field
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);

    } # while defined section
}
192
# Field-mapping entry point for a document being added: hands the document
# to index_field_mapping_edit() with the edit mode fixed to "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# Field-mapping entry point for a document being reindexed: hands the
# document to index_field_mapping_edit() with the edit mode fixed to
# "update".
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# Field-mapping entry point for a document being deleted.  Deliberately a
# no-op: the arguments are accepted for a signature parallel to the other
# index_field_mapping* methods, and nothing is returned.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    return; # nothing to be done
}
215
216
217#----
218
# textedit($doc_obj, $file, $edit_mode)
#
# Writes the Solr update XML for one document to $self->{'output_handle'}.
#
#   $doc_obj   - the document to process
#   $file      - accepted and forwarded to the super-class; not used here
#   $edit_mode - "add", "update" or "delete"
#
# When get_indexing_text() is false the call is handed to the super-class
# implementation unchanged.
#
# "update" is near identical to "add", as we use Solr in its default mode
# of replacing an existing document if the new document has the same doc
# id.  Both modes therefore emit an <add><doc>...</doc></add> message;
# only "delete" emits <delete><id>...</id></delete>.  The main difference
# between "add" and "update" is that "update" does not touch the 'stats'
# for number of documents or number of bytes processed.  The latter is
# inaccurate, but considered better than allowing the value to steadily
# climb.
#
# If a 'section' level is configured, one message is written per section;
# otherwise a single message covers the whole document.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two special-index flags are the same for
    # every section, so work them out once, before the section loop.
    my @index_fields = split (/;/, $fields);

    my $allfields_index        = 0;   # do we have an 'allfields' index?
    my $all_metadata_specified = 0;   # has the user added a 'metadata' index?
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $levels     = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    # Returns the (opening, closing) XML that wraps the fields of the
    # document or section identified by $oid.  "update" must use the same
    # <add> wrapper as "add": field elements are only valid inside
    # <add><doc>, never inside <delete>.
    my $wrapper_for = sub {
        my ($oid) = @_;

        if ($edit_mode eq "add" || $edit_mode eq "update") {
            my $open = " <add>\n";
            $open .= " <doc>\n";
            $open .= " <field name=\"docOID\">$oid</field>\n";

            my $close = " </doc>\n";
            $close .= " </add>\n";

            return ($open, $close);
        }

        my $open = " <delete>\n";
        $open .= " <id>$oid</id>\n";

        return ($open, " </delete>\n");
    };

    my ($start_doc, $end_doc) = $wrapper_for->($gs2_docOID);

    # non-empty only when indexing at the section level
    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    # the document-level wrapper is only used when working at doc level
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    # Accounts for one generated chunk of XML according to the edit mode:
    # "add" counts its bytes and outputs it, "update" only outputs it,
    # and "delete" only un-counts its bytes.
    my $emit = sub {
        my ($xml) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($xml);
            $text .= "$xml";
        }
        elsif ($edit_mode eq "update") {
            $text .= "$xml";
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($xml);
        }
    };

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = $wrapper_for->($sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the 'allfields' index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        # optionally let sub-sections pick up the top
                        # section's values for this metadata element
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                # Remove any leading or trailing white space
                $new_text =~ s/\s+$//;
                $new_text =~ s/^\s+//;

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $emit->($new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # remember the short name chosen for this new field
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only output if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

    print $solrhandle $text;
}
571
572
573
574
# Text-pass entry point for a document being reindexed: hands the document
# to textedit() with the edit mode fixed to "update".
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
582
583
5841;
585
586
Note: See TracBrowser for help on using the repository browser.