source: gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm@ 27802

Last change on this file since 27802 was 27802, checked in by kjdon, 11 years ago

adding in code for sort fields. just copied from lucene build code

File size: 18.1 KB
RevLine 
[24446]1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33# http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Set up inheritance at compile time: solrbuildproc derives from
# lucenebuildproc.
sub BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# new -- construct a solrbuildproc.
#
# All arguments after the class name are passed unchanged to the
# lucenebuildproc constructor; the object it returns is then re-blessed
# into $class.
sub new {
    my $class = shift @_;

    # Direct method-call syntax instead of the indirect-object form
    # ("new lucenebuildproc (...)"), which perl parses heuristically.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
60
61#----
62
# index_field_mapping_edit -- record index field shortname mappings
#
# Visits every section of $doc_obj and makes sure that each index field
# named in $self->{'index'} -- and, when a 'metadata' index has been
# requested, each metadata field found on the section -- has an entry in
# $self->{'indexfieldmap'}.  Nothing is written to any output handle.
#
# Arguments:
#   $doc_obj   - the document object to inspect
#   $file      - not used by this routine
#   $edit_mode - only add/update gets to here; there is currently no need
#                to distinguish between them
sub index_field_mapping_edit {
    my ($self, $doc_obj, $file, $edit_mode) = @_;

    # documents that are not to be indexed contribute no mappings
    return unless ($doc_obj->get_doc_type() eq "indexed_doc");

    my $in_subcollection = $self->is_subcollection_doc($doc_obj);

    # anything after a ':' is subcollection and language detail
    my ($field_spec) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $field_spec);

    # has the user asked for the special 'metadata' / 'allfields' indexes?
    my $wants_metadata  = 0;
    my $wants_allfields = 0;
    foreach my $field (@index_fields) {
        $wants_allfields = 1 if ($field eq "allfields");
        $wants_metadata  = 1 if ($field eq "metadata");
    }

    my $section_count = 0; # just for this document

    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        $section_count++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $section_type
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
            || "indexed_section";
        my $section_is_indexed = ($section_type eq "indexed_section"
                                  || $section_type eq "indexed_doc");
        next if ($in_subcollection == 0 || !$section_is_indexed);

        # fields explicitly named in the index specification
        my %explicit_fields = ();

        foreach my $field (@index_fields) {
            # a field with a "top" prefix only applies to the first section
            my $real_field = $field;
            my $had_top_prefix = ($real_field =~ s/^top//);
            next if ($had_top_prefix && $section_count != 1);

            # these two are dealt with further down
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $explicit_fields{$real_field} = 1;

            next if (defined $self->{'indexfieldmap'}->{$real_field});

            my $shortname = $self->create_shortname($real_field);
            $self->{'indexfieldmap'}->{$real_field} = $shortname;
            $self->{'indexfieldmap'}->{$shortname} = 1;
        }

        if ($wants_metadata) {
            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = @$pair;

                # no value
                next unless (defined $mvalue && $mvalue ne "");

                # already covered by an explicitly named field
                next if (defined $explicit_fields{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                unless (defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                unless (defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($wants_allfields) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }
    }
}
192
# index_field_mapping -- record index field mappings for a document.
# Hands off to index_field_mapping_edit() with an edit mode of "add".
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# index_field_mappingreindex -- record index field mappings for a document
# that is being reindexed.
# Hands off to index_field_mapping_edit() with an edit mode of "update".
sub index_field_mappingreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# index_field_mappingdelete -- counterpart of index_field_mapping() for a
# document that is being deleted.  Deliberately a no-op: it returns
# immediately without touching any mapping.
sub index_field_mappingdelete
{
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
215
216
217#----
218
# textedit -- write the Solr update XML for one document
#
# When text is not being indexed (text-compress mode) the work is handed
# straight to the super-class, which outputs the simple <Doc>..</Doc> form.
# Otherwise an update message is built and printed to
# $self->{'output_handle'}:
#
#   "add" / "update"  =>  <add><doc> ...fields... </doc></add>
#   anything else     =>  <delete><id>...</id></delete>
#
# "update" is near identical to "add" as we use Solr in its default mode of
# replacing an existing document if the new document has the same doc id.
# The main difference is that "update" does not touch the 'stats' for
# number of documents or number of bytes processed.  The latter is
# inaccurate, but considered better than allowing the value to steadily
# climb.
#
# If a section level is configured, one message is produced per section;
# otherwise a single message covers the whole document.
#
# Arguments:
#   $doc_obj   - the document object to output
#   $file      - only passed on to the super-class in text-compress mode
#   $edit_mode - "add", "update" or "delete"
#
# Returns whatever the final print (or SUPER::textedit) returns; returns
# nothing for a document whose type is not "indexed_doc".
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        #   => want document to be output in the simple <Doc>..</Doc> as is
        #      done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # Both "add" and "update" send field content, so both must be wrapped in
    # <add><doc>.  Wrapping "update" content in <delete> (as a plain
    # "add or else delete" test would) puts <field> elements inside a
    # <delete> message.
    my $sends_content = ($edit_mode eq "add" || $edit_mode eq "update");

    # builds the (opening, closing) wrapper text for the given document id
    my $wrappers_for = sub {
        my ($oid) = @_;

        if ($sends_content) {
            my $open = " <add>\n";
            $open   .= " <doc>\n";
            $open   .= " <field name=\"docOID\">$oid</field>\n";

            my $close = " </doc>\n";
            $close   .= " </add>\n";

            return ($open, $close);
        }

        my $open = " <delete>\n";
        $open   .= " <id>$oid</id>\n";

        return ($open, " </delete>\n");
    };

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;
    # do we have an allfields index??
    my $allfields_index = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();
    my ($start_doc, $end_doc) = $wrappers_for->($gs2_docOID);

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }
    my $section_level = ($sec_tag_name ne "");

    # the document wrapper is only output if working with doc level
    my $text = $section_level ? "" : $start_doc;

    # Shared accounting for every chunk of field XML: "add" counts the bytes
    # and outputs the chunk, "update" only outputs it, "delete" only takes
    # the bytes back off the count.
    my $emit = sub {
        my ($chunk) = @_;

        if ($edit_mode eq "add") {
            $self->{'num_processed_bytes'} += length ($chunk);
            $text .= $chunk;
        }
        elsif ($edit_mode eq "update") {
            $text .= $chunk;
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_processed_bytes'} -= length ($chunk);
        }
    };

    my $doc_section = 0; # just for this document

    # get the text for this document
    for (my $section = $doc_obj->get_top_section();
         defined $section;
         $section = $doc_obj->get_next_section($section))
    {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = $wrappers_for->($sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($section_level) {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($section_level);

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing found for this index in this section
            next unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";

            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }

            foreach my $item (@metadata_list) {
                ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            # Remove any leading or trailing white space
            $new_text =~ s/\s+$//;
            $new_text =~ s/^\s+//;

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
            }
            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $emit->($new_text);

            if ($self->{'indexing_text'} && $new_field) {
                # we need to add to the list in indexfields
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                ghtml::htmlsafe($mvalue);

                my $shortname;
                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $emit->($new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $emit->($new_text);
        }

        # only add sort fields for this section if we are indexing this
        # section, we are doing section level indexing or this is the top
        # section
        if ($self->{'indexing_text'} && ($section_level || $doc_section == 1)) {
            # add sort fields if there are any

            foreach my $sfield (@{$self->{'sortfields'}}) {
                # ignore special field rank
                next if $sfield eq "rank";

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    # strip off ex. iff it's the only metadata set prefix
                    # (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && defined ($self->{'sections_sort_on_document_metadata'})) {
                        if ($self->{'sections_sort_on_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_sort_on_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }

                my $new_text = "";
                foreach my $item (@metadata_list) {
                    ghtml::htmlsafe($item);
                    $new_text .= "$item";
                }

                if ($new_text =~ /\S/) {
                    $new_text = "<field name=\"$sf_shortname\">$new_text</field>\n";
                    # filter the text???

                    # A <delete> message may only name ids, so the sort
                    # field is output for add/update only; the bookkeeping
                    # below is still done in every mode.
                    $text .= $new_text if ($sends_content);
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }

        # add in end tag if indexing at the section level
        $text .= $end_sec if ($section_level);
    } # for each section

    # only output if working with doc level
    $text .= $end_doc if (!$section_level);

##    $text .= "<commit/>\n";

    print $solrhandle $text;
}
618
619
620
621
# textreindex -- output a document that is being reindexed.
# Hands off to textedit() with an edit mode of "update".
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
629
630
6311;
632
633
Note: See TracBrowser for help on using the repository browser.