root/gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm @ 25846

Revision 25846, 16.1 KB (checked in by sjm84, 7 years ago)

Some fixes and additions to the Solr perl code

Line 
1###########################################################################
2#
3# solrbuildproc.pm -- perl wrapper for building index with Solr
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package solrbuildproc;
27
28# This document processor outputs a document for solr to process
29
30# Rather than use the XML structure developed for mgppbuilder/mgppbuildproc
31# whose use was then extended to Lucene, Solr has its own XML syntax:
32#
33#  http://wiki.apache.org/solr/UpdateXmlMessages
34#
35# Using this means we don't need to write SolrWrapper.jar, as had to be
36# done for Lucene, translating the XML syntax piped to it into appropriate
37# calls to the Lucene API
38
39
40use lucenebuildproc;
41use ghtml;
42use strict;
43no strict 'refs'; # allow filehandles to be variables and viceversa
44
45
46use IncrementalBuildUtils;
47
# Compile-time hook: declare lucenebuildproc as the parent class so that
# method lookups (including SUPER:: calls) fall through to it.
BEGIN {
    @solrbuildproc::ISA = qw(lucenebuildproc);
}
51
52
# Constructor.
#
# Builds the object through the lucenebuildproc constructor (all arguments
# after the class name are forwarded unchanged) and then re-blesses the
# result into the requested class.
#
# Parameters:
#   $class - name of the class to bless the new object into
#   @_     - remaining arguments, passed straight to lucenebuildproc->new
#
# Returns: the newly constructed object, blessed into $class.
sub new {
    my $class = shift @_;

    # Use an explicit class-method call rather than the indirect-object
    # form ("new lucenebuildproc (...)"): this package defines its own
    # "new", and the indirect form relies on the parser guessing correctly
    # between a method call and a plain function call.
    my $self = lucenebuildproc->new(@_);

    return bless $self, $class;
}
59
60
61#----
62
# Records index-field -> shortname mappings for one document, without
# producing any output text.
#
# For every section of $doc_obj that is eligible for indexing, each field
# named in $self->{'index'} is given a shortname (via create_shortname) if
# it does not already have one in $self->{'indexfieldmap'}.  When the
# special "metadata" index is present, every non-empty metadata element of
# the section is mapped as well and noted in $self->{'indexfields'}.  When
# the special "allfields" index is present, it is mapped to "ZZ".
#
# Parameters:
#   $doc_obj   - document object to examine
#   $file      - source file name (unused here)
#   $edit_mode - "add" or "update"; currently both are treated identically
#
# Returns: nothing.
sub index_field_mapping_edit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    # Only add/update gets to here
    # Currently there is no need to distinguish between these edit modes

    # only study this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two special-index flags do not depend on the
    # section being processed, so work them out once up front.
    my @field_list = split (/;/, $fields);

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;

    # do we have an allfields index??
    my $allfields_index = 0;

    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_section = 0; # just for this document

    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        $doc_section++;

        # if we are doing subcollections, then some docs shouldn't be
        # considered for indexing
        my $indexed_section
            = $doc_obj->get_metadata_element($section, "gsdldoctype")
              || "indexed_section";

        if (($indexed_doc == 0)
            || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # which fields have already been indexed?
        # (same as fields, but in a map)
        my $specified_fields = {};

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled separately below
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified
            # -- could be a comma separated list
            $specified_fields->{$real_field} = 1;

            if (!defined $self->{'indexfieldmap'}->{$real_field}) {
                my $shortname = $self->create_shortname($real_field);
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        if ($all_metadata_specified) {

            my $metadata = $doc_obj->get_all_metadata ($section);

            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                if (!defined $self->{'indexfieldmap'}->{$mfield}) {
                    my $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;
        }

        $section = $doc_obj->get_next_section($section);

    } # while defined section

    return;
}
192
# Field-mapping pass for a newly added document: hands the document to
# index_field_mapping_edit with the edit mode fixed to "add".
#
# Parameters:
#   $doc_obj - document object to examine
#   $file    - source file name
sub index_field_mapping {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "add");
}
199
# Field-mapping pass for a document being re-indexed: hands the document to
# index_field_mapping_edit with the edit mode fixed to "update".
#
# Parameters:
#   $doc_obj - document object to examine
#   $file    - source file name
sub index_field_mappingreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->index_field_mapping_edit($doc_obj, $file, "update");
}
207
# Field-mapping pass for a document being deleted.  Deliberately a no-op:
# removing a document requires no change to the field mappings.
#
# Parameters:
#   $doc_obj - document object (ignored)
#   $file    - source file name (ignored)
#
# Returns: nothing (empty list / undef).
sub index_field_mappingdelete {
    my ($self, $doc_obj, $file) = @_;

    # nothing to be done
    return;
}
215
216
217#----
218
# Private helper: build the opening and closing Solr update-XML fragments
# that wrap one document (or one section) identified by $oid.
#
# "add" and "update" both use an <add><doc>...</doc></add> envelope, since
# Solr replaces an existing document when a new one arrives with the same
# id.  Every other edit mode gets a <delete><id>...</id></delete> envelope.
#
# Returns: ($start_fragment, $end_fragment)
sub _solr_envelope_for {
    my ($edit_mode, $oid) = @_;

    if ($edit_mode eq "add" || $edit_mode eq "update") {
        my $start = "  <add>\n"
                  . "    <doc>\n"
                  . "      <field name=\"docOID\">$oid</field>\n";
        my $end   = "    </doc>\n"
                  . "  </add>\n";
        return ($start, $end);
    }

    my $start = "  <delete>\n"
              . "    <id>$oid</id>\n";
    return ($start, "  </delete>\n");
}

# Private helper: update the processed-byte statistic for $new_text and
# return the text that should be appended to the output for this edit mode.
#
#   add    - count the bytes, emit the text
#   update - emit the text, leave the statistic alone
#   delete - subtract the bytes, emit nothing
#   other  - emit nothing
sub _solr_account_text {
    my ($self, $edit_mode, $new_text) = @_;

    if ($edit_mode eq "add") {
        $self->{'num_processed_bytes'} += length ($new_text);
        return $new_text;
    }
    if ($edit_mode eq "update") {
        return $new_text;
    }
    if ($edit_mode eq "delete") {
        $self->{'num_processed_bytes'} -= length ($new_text);
    }
    return "";
}

# Writes the Solr update XML for one document to $self->{'output_handle'}.
#
# When the processor is not indexing text (text-compress mode) the call is
# passed to the super-class unchanged.  Otherwise the document is emitted
# either as a single Solr document (document-level indexing) or as one Solr
# document per section (when a 'section' level is configured), containing a
# field element for every configured index field, plus optional "metadata"
# and "allfields" (ZZ) content.
#
# "update" is near identical to "add", as Solr is used in its default mode
# of replacing an existing document when the new one has the same doc id.
# The difference is that "update" leaves the document/byte statistics
# untouched.  The latter is inaccurate, but considered better than allowing
# the values to steadily climb.
#
# Parameters:
#   $doc_obj   - document object to output
#   $file      - source file name (unused here)
#   $edit_mode - "add", "update" or "delete"
#
# Returns: the result of the final print to the output handle (or whatever
# the super-class returns in text-compress mode); nothing if the document
# is not of type "indexed_doc".
sub textedit {
    my $self = shift (@_);
    my ($doc_obj,$file,$edit_mode) = @_;

    if (!$self->get_indexing_text()) {
        # In text-compress mode:
        # => want document to be output in the simple <Doc>..</Doc> as is
        # done by its super-class
        return $self->SUPER::textedit(@_);
    }

    my $solrhandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    # this is another document
    if ($edit_mode eq "add") {
        $self->{'num_docs'} += 1;
    }
    elsif ($edit_mode eq "delete") {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});

    # The field list and the two special-index flags are the same for every
    # section, so compute them once.
    my @field_list = split (/;/, $fields);

    # has the user added a 'metadata' index?
    my $all_metadata_specified = 0;
    # do we have an allfields index??
    my $allfields_index = 0;
    foreach my $field (@field_list) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};

    my $gs2_docOID = $doc_obj->get_OID();

    my ($start_doc, $end_doc) = _solr_envelope_for($edit_mode, $gs2_docOID);

    # add/update, delete

    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    # A non-empty section tag name means each section becomes its own Solr
    # document; otherwise the whole document is a single Solr document.
    my $section_level = ($sec_tag_name ne "");

    my $doc_section = 0; # just for this document

    # only open the document envelope here if working at the doc level
    my $text = $section_level ? "" : $start_doc;

    # get the text for this document
    my $section = $doc_obj->get_top_section();

    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        my ($start_sec, $end_sec) = _solr_envelope_for($edit_mode, $sec_gs2_docOID);

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($section_level) {
                $text .= $start_sec;
                $text .= $end_sec;
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        # add in start section tag if indexing at the section level
        $text .= $start_sec if ($section_level);

        if ($edit_mode eq "add") {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        elsif ($edit_mode eq "delete") {
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed? (same as fields, but in a map)
        my $specified_fields = {};

        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@field_list) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # these two are handled after the loop
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            $specified_fields->{$real_field} = 1;

            my $new_field = 0; # have we found a new field name?
            my $shortname = $self->{'indexfieldmap'}->{$real_field};
            if (!defined $shortname) {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    $submeta =~ s/^ex\.//; #strip off ex.

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # now we add the text and/or metadata into new_text
            if ($section_text ne "" || scalar(@metadata_list)) {
                my $new_text = "";

                if ($section_text ne "") {
                    $new_text .= "$section_text ";
                }

                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    $new_text .= "$item ";
                }

                if ($allfields_index) {
                    $allfields_text .= $new_text;
                }

                # Remove any leading or trailing white space
                $new_text =~ s/\s+$//;
                $new_text =~ s/^\s+//;

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text = "<field name=\"$shortname\" >$new_text</field>\n";
                }
                # filter the text
                $new_text = $self->filter_text ($field, $new_text);

                $text .= _solr_account_text($self, $edit_mode, $new_text);

                if ($self->{'indexing_text'} && $new_field) {
                    # we need to add to the list in indexfields
                    $self->{'indexfieldmap'}->{$real_field} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
            }

        } # foreach field

        if ($all_metadata_specified) {

            my $new_text = "";
            my $shortname = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);

                # no value
                next unless defined $mvalue && $mvalue ne "";

                # we have already indexed this
                next if defined ($specified_fields->{$mfield});

                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<field name=\"$shortname\">$mvalue</field>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $text .= _solr_account_text($self, $edit_mode, $new_text);
        }

        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<field name=\"ZZ\">$allfields_text</field>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $text .= _solr_account_text($self, $edit_mode, $new_text);
        }

        # close the section envelope if indexing at the section level
        $text .= $end_sec if ($section_level);

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    # only close the document envelope if working at the doc level
    $text .= $end_doc if (!$section_level);

##    $text .= "<commit/>\n";

    print $solrhandle $text;

}
571
572
573
574
# Re-index entry point: outputs the document through textedit with the
# edit mode fixed to "update".
#
# Parameters:
#   $doc_obj - document object to output
#   $file    - source file name
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
582
583
5841;
585
586
Note: See TracBrowser for help on using the browser.