source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 24446

Last change on this file since 24446 was 24446, checked in by davidb, 13 years ago

Start of Solr extension for Greenstone3

File size: 16.0 KB
Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33# http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Compile-time setup: make solrbuildproc inherit from lucenebuildproc.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object with the lucenebuildproc constructor, passing every
# argument after the class name straight through, then re-blesses the
# result into the requested class.
#
# Parameters:
#   $class - class name to bless the new object into
#   @args  - forwarded unchanged to lucenebuildproc's constructor
# Returns: the object returned by lucenebuildproc->new, blessed into $class
sub new {
    my ($class, @args) = @_;

    # Explicit method-call syntax: the indirect-object form
    # "new lucenebuildproc (...)" is ambiguous to the parser, all the more
    # so inside a package that defines its own new().
    my $self = lucenebuildproc->new(@args);

    return bless $self, $class;
}
59
60
61#----
62
# Record index-field name -> shortname mappings for one document.
#
# Walks every section of $doc_obj and, for the fields named in
# $self->{'index'}, fills in $self->{'indexfieldmap'} (and, for the
# 'metadata' index, $self->{'indexfields'}).  Nothing is written to any
# output handle.
#
# Parameters:
#   $doc_obj   - document object to examine
#   $file      - accepted for interface compatibility; not consulted here
#   $edit_mode - accepted for interface compatibility; not consulted here
# Returns: nothing meaningful
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    # Only add/update gets to here
    # Currently there is no need to distinguish between these edit modes

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list is the same for every section, so split it once.
    my @index_fields = split (/;/, $fields);

    # do we have an allfields index??
    my $allfields_index = 0;

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;

    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # which fields have already been indexed?
        # (same as fields, but in a map); reset for every section
        my $specified_fields = {};

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled separately below
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {

            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);

    } # while defined section

    return;
}
192
# "add" entry point: delegates to index_field_mapping_edit with the
# edit mode fixed to "add" and hands back whatever that call returns.
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# "reindex" entry point: delegates to index_field_mapping_edit with the
# edit mode fixed to "update" and hands back whatever that call returns.
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# "delete" entry point: deliberately a no-op, since no field mapping
# needs recording when a document is removed.  Returns nothing.
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
215
216
217#----
218
# Emit the Solr update XML for one document and keep the build statistics
# in step.
#
# When indexing at document level the whole document becomes a single
# <add><doc>..</doc></add> (or <delete>..</delete>) message; when a section
# level is configured, each section gets its own message.  The text is
# printed to $self->{'output_handle'}.
#
# Parameters:
#   $doc_obj   - document object to output
#   $file      - not used here; forwarded to the super-class in
#                text-compress mode
#   $edit_mode - "add", "update" or "delete"
# Returns: nothing meaningful (in text-compress mode, whatever the
#          super-class textedit returns)
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    # "update" for $edit_mode near identical to "add" as we use Solr in its
    # default mode of replacing an existing document if the new document
    # has the same doc id.  Main area of difference between "add" and "update"
    # is that we do not update our 'stats' for number of documents or number
    # of bytes processed.  The latter is inaccurate, but considered better
    # than allowing the value to steadily climb.

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # (Text-compress mode has already returned above, so no separate
    # "compress-text + delete" check is needed here.)

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list is the same for every section, so split it once.
    my @index_fields = split (/;/, $fields);

    # do we have an allfields index??
    my $allfields_index = 0;

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;

    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    # "add" and "update" both send the document inside <add><doc>; Solr
    # replaces an existing document that has the same id.  Anything else
    # is wrapped as a <delete>, as before.
    my $writes_document = ($edit_mode eq "add" || $edit_mode eq "update");

    my $start_doc;
    my $end_doc;

    if ($writes_document) {
        $start_doc = " <add>\n";
        $start_doc .= " <doc>\n";
        $start_doc .= " <field name=\"docOID\">$gs2_docOID</field>\n";

        $end_doc = " </doc>\n";
        $end_doc .= " </add>\n";
    }
    else {
        $start_doc = " <delete>\n";
        $start_doc .= " <id>$gs2_docOID</id>\n";

        $end_doc = " </delete>\n";
    }

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $doc_section = 0; # just for this document

    # Document-level wrapper is only output if working with doc level.
    # Always initialise $text: a "my" guarded by a statement modifier has
    # undefined behaviour and can leak the previous call's value.
    my $text = ($sec_tag_name eq "") ? $start_doc : "";

    # Shared bookkeeping for a piece of generated field text: add/update
    # append it to the output, add/delete adjust the processed-byte count.
    my $record_text = sub {
        my ($new_text) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($new_text);
            $text .= "$new_text";
        }
        elsif ($edit_mode eq "update") {
            $text .= "$new_text";
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($new_text);
        }
    };

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my $start_sec;
        my $end_sec;

        if ($writes_document) {
            $start_sec = " <add>\n";
            $start_sec .= " <doc>\n";
            $start_sec .= " <field name=\"docOID\">$sec_gs2_docOID</field>\n";

            $end_sec = " </doc>\n";
            $end_sec .= " </add>\n";
        }
        else {
            $start_sec = " <delete>\n";
            $start_sec .= " <id>$sec_gs2_docOID</id>\n";

            $end_sec = " </delete>\n";
        }

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($sec_tag_name ne "");

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;
            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; # strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $record_text->($new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $record_text->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $record_text->($new_text);
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only output if working with doc level
    $text .= $end_doc if ($sec_tag_name eq "");

##    $text .= "<commit/>\n";

    print $solrhandle $text;

    # Debugging aid: keep a running copy of everything sent to Solr.
    # Best effort only -- skipped silently when the file cannot be opened
    # (for instance on a platform without /tmp).
    if (open(my $debug_out, '>>', '/tmp/solr.out')) {
        binmode($debug_out, ":utf8");
        print $debug_out $text;
        close($debug_out);
    }

    return;
}
567
568
569
570
# "reindex" entry point: delegates to textedit with the edit mode fixed
# to "update" and hands back whatever that call returns.
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
578
579
5801;
581
582
Note: See TracBrowser for help on using the repository browser.