root/main/trunk/greenstone2/perllib/lucenebuildproc.pm @ 33327

Revision 33327, 22.1 KB (checked in by ak19, 2 months ago)

In order to get map coordinate metadata stored correctly in solr, changes were required. These changes revealed that the way in which some index fields were stored in solr, and also in lucene, was not exactly correct and required changing too. 1. Coordinate/CD, CoordShort/CS and GPSMapOverlayLabel/ML meta are now being stored. The schema elements created for these indexed fields notably need to say they're multivalued (multiple values per docOID) and are of type=string rather than type=text_en_splitting as the other meta have been so far. No term-related information is being stored for them, as that doesn't appear important for these indexed fields. 2. Changes to solrbuildproc were required, and these changes were also repeated in lucenebuildproc: in their code before this commit, a single <field name=... /> element was stored for all the meta elements in that field. It sort of worked out so far because of the type=text_en_splitting used for these fields. This, however, created the problem that, for example, all Coordinate meta for a docOID went into a single <field name=CD .../> element separated by spaces, rather than into one <field name=CD .../> element for each Coordinate meta. We wanted the latter behaviour for CD, CS and ML meta, but also for all other indexed meta fields such as TI for titles, and also for indexed fields that include multiple meta in one index, such as a hypothetical TT where TT would include dc.Title,ex.Title,text. In that case too we want a <field name=TT /> element for each title meta and for the text meta. 3. The num_processed_bytes calculation is left untouched and still includes the encapsulating <field name=.../> element; it has not been changed to be calculated over just the metadata value of each field. 
This is not only because it is calculated to include the field element in the super -buildproc.pm classes, but also because num_processed_bytes is defined in basebuilder.pm as "the number of bytes actually passed to (mg) for the current index", where the lucene and mgpp buildprocs both include the enclosing element in the calculation, which seems deliberate. Further, num_processed_bytes contrasts with num_bytes, declared and defined in basebuildproc.pm too as "The actual number of bytes in the collection, normally the same as what's processed during text compression". num_bytes seems to be what Dr Bainbridge had in mind today when he said that the enclosing <field/> element shouldn't actually be included in the calculation of num_processed_bytes. Since the definition of num_processed_bytes seems ambiguous to me now, I leave it alone until it has been discussed with Dr Bainbridge again, as there are many places where it would otherwise need changing.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40use FileUtils;
41
# Set up inheritance at compile time: lucenebuildproc derives from
# mgppbuildproc.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
45
46
# Constructor. Builds the object through mgppbuildproc's constructor
# (forwarding all remaining arguments), adds the Lucene-specific state and
# reblesses the result into $class.
#
# Returns the new object.
sub new {
    my $class = shift @_;

    # Explicit method-call syntax: the indirect form "new mgppbuildproc (...)"
    # is ambiguous inside a sub that is itself called "new".
    my $self = mgppbuildproc->new(@_);

    $self->{'numincdocs'} = 0;
    # fields actually specified in the index, as a map (field name => 1);
    # must be a hash reference, not an empty list (which would store undef)
    $self->{'specified_fields'} = {};
    $self->{'allfields_index'} = 0; # do we need allfields index?
    $self->{'all_metadata_specified'} = 0; # are we indexing all metadata?
    $self->{'actualsortfields'} = {}; # sort fields that have actually been used
    $self->{'sortfieldnamemap'} = {}; # mapping between field name and field shortname, eg dc.Title->byTI
    return bless $self, $class;
}
59
# Record the index specification and note which kinds of field it names.
#
# The specification is first handed to mgppbuildproc::set_index. The part of
# $self->{'index'} before the first ':' is then read as a ';'-separated list:
#   "allfields" sets $self->{'allfields_index'},
#   "metadata"  sets $self->{'all_metadata_specified'},
#   any other entry is stored (minus a leading "top") as a key of
#   $self->{'specified_fields'}.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->mgppbuildproc::set_index($index, $indexexparr);

    # drop everything after the first ':' (subcollection etc.)
    my ($field_spec) = split (/:/, $self->{'index'});

    for my $entry (split (/;/, $field_spec)) {
        if ($entry eq "allfields") {
            $self->{'allfields_index'} = 1;
            next;
        }
        if ($entry eq "metadata") {
            $self->{'all_metadata_specified'} = 1;
            next;
        }
        (my $bare_name = $entry) =~ s/^top//;
        $self->{'specified_fields'}->{$bare_name} = 1;
    }
}
80
# Store the setting that textedit consults when deciding whether a
# sub-section's sort fields should also pick up the top section's metadata
# (textedit checks for the values "always" and
# "unless_section_metadata_exists").
sub set_sections_sort_on_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_sort_on_document_metadata'} = $index_type;
}
87
# Store the list of sort fields.
#
# $sortfields is a reference to an array of field names. The names "text",
# "allfields" and "metadata" are only valid for indexes, not for sort fields,
# so they are dropped. $self->{'sortfields'} is always left as an array
# reference, even when nothing survives the filter.
sub set_sortfields {
    my $self = shift (@_);
    my ($sortfields) = @_;

    my @accepted = grep { $_ !~ /^(text|allfields|metadata)$/ } @{ $sortfields || [] };
    $self->{'sortfields'} = \@accepted;
}
100
# Report whether this build processor supports incremental building.
# Always returns 1: unlike MG and MGPP, Lucene supports it.
sub is_incremental_capable
{
    my ($self) = @_;

    return 1;
}
108
109
# Write one document to the Lucene input stream as XML and update the build
# statistics.
#
#   $doc_obj    the document object to output
#   $file       file name, written into the document tag's file attribute
#   $edit_mode  "add", "update" or "delete"; any value other than "add" or
#               "update" is accounted for as a delete
#
# Nothing is written unless the document's type is "indexed_doc", and a
# delete is skipped entirely when get_indexing_text() is false.
#
# For every section, the index fields listed in $self->{'index'} are emitted,
# then (if requested) all remaining metadata, the combined "allfields" text
# (tag ZZ) and the sort fields. Index text is only appended for add/update;
# for a delete its length is subtracted from num_processed_bytes instead.
# The finished XML is printed to $self->{'output_handle'}.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    # 0/1 to indicate whether this doc is part of the specified subcollection
    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    my $adding = (($edit_mode eq "add") || ($edit_mode eq "update"));

    # this is another document
    if ($adding) {
        $self->{'num_docs'} += 1;
    }
    else {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    # sections only get their own tag when section level indexing is on
    my $levels = $self->{'levels'};
    my $lsec_level = $levels->{'section'};
    my $sec_tag_name = "";
    if ($lsec_level) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    my $gs2_docOID = $doc_obj->get_OID();
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $text = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\"  gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";

    my $doc_section = 0; # just for this document

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name  gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n";
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        if ($sec_tag_name ne "") {
            $text .= "\n<$sec_tag_name  gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        if ($adding) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # Sub-sections may also take metadata from the top section. Work out
        # once per section which fallback setting applies (undef = none).
        # NOTE(review): assumes get_top_section() is a plain accessor, so
        # calling it once per section instead of once per metadata name
        # makes no difference - confirm against doc.pm.
        my $is_subsection = ($section ne $doc_obj->get_top_section());
        my $index_fallback = ($is_subsection && $self->{'indexing_text'})
            ? $self->{'sections_index_document_metadata'} : undef;
        my $sort_fallback = $is_subsection
            ? $self->{'sections_sort_on_document_metadata'} : undef;

        # collect up all the text for allfields index in here (if there is one)
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma separated list
            my $shortname = $self->_textedit_shortname($real_field);

            my @metadata_list = (); # put any metadata values in here
            my $section_text = ""; # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    next if ($section_text ne "");

                    $section_text = $doc_obj->get_text($section);
                    if ($self->{'indexing_text'}) {
                        # we always strip html
                        $section_text = $self->preprocess_text($section_text, 1, "");
                    }
                    else {
                        # leave html stuff in, but escape the tags
                        &ghtml::htmlsafe($section_text);
                    }
                    next;
                }

                # its a metadata element
                $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
                push (@metadata_list, $self->_textedit_metadata($doc_obj, $section, $submeta, $index_fallback));
            } # for each field in this one index

            # nothing to add for this index field in this section
            next if ($section_text eq "" && !scalar(@metadata_list));

            # the section text (if any) comes first, then each escaped metadata value
            my @values = ();
            push (@values, $section_text) if ($section_text ne "");
            foreach my $item (@metadata_list) {
                &ghtml::htmlsafe($item);
                push (@values, $item);
            }

            # one element per value, so multi-valued fields stay separate
            my $new_text = "";
            foreach my $value (@values) {
                if ($self->{'allfields_index'}) {
                    $allfields_text .= "$value ";
                }

                if ($self->{'indexing_text'}) {
                    # add the tag
                    $new_text .= "<$shortname index=\"1\">$value</$shortname>";
                    $self->{'allindexfields'}->{$real_field} = 1;
                }
                else {
                    $new_text .= "$value ";
                }
            }

            # filter the text
            $new_text = $self->filter_text ($field, $new_text);
            $text .= $self->_textedit_account($edit_mode, $new_text);
        } # foreach field

        if ($self->{'all_metadata_specified'}) {

            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($self->{'specified_fields'}->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname = $self->_textedit_shortname($mfield);
                $self->{'allindexfields'}->{$mfield} = 1;
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($self->{'allfields_index'}) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'extraindexfields'}->{$mfield}) {
                    $self->{'extraindexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);
            $text .= $self->_textedit_account($edit_mode, $new_text);
        }

        if ($self->{'allfields_index'}) {

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);
            $text .= $self->_textedit_account($edit_mode, $new_text);
        }

        # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
        if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
            # add sort fields if there are any

            foreach my $sfield (@{$self->{'sortfields'}}) {
                # ignore special field rank
                next if ($sfield eq "rank" || $sfield eq "none");

                my $sf_shortname;
                if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
                    $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
                }
                else {
                    $sf_shortname = $self->create_sortfield_shortname($sfield);
                    $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
                    $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
                }

                my @metadata_list = (); # put any metadata values in here
                foreach my $submeta (split /,/, $sfield) {
                    $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
                    push (@metadata_list, $self->_textedit_metadata($doc_obj, $section, $submeta, $sort_fallback));
                }

                # one untokenized element per non-blank value; the field only
                # counts as "actually used" if at least one element is written
                my $emitted = 0;
                foreach my $item (@metadata_list) {
                    &ghtml::htmlsafe($item);
                    next unless ($item =~ /\S/);
                    $text .= "<$sf_shortname index=\"1\" tokenize=\"0\">$item</$sf_shortname>"; # add it to the main text block
                    $emitted = 1;
                }
                if ($emitted) {
                    $self->{'actualsortfields'}->{$sfield} = 1;
                }
            }
        }
        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # for each section

    print $lucenehandle "$text\n$documentendtag";
}

# Return the index shortname for $field from $self->{'fieldnamemap'},
# creating it with create_shortname() and registering it if it is new.
sub _textedit_shortname {
    my ($self, $field) = @_;

    if (defined $self->{'fieldnamemap'}->{$field}) {
        return $self->{'fieldnamemap'}->{$field};
    }

    my $shortname = $self->create_shortname($field);
    $self->{'fieldnamemap'}->{$field} = $shortname;
    $self->{'fieldnamemap'}->{$shortname} = 1;
    return $shortname;
}

# Return the values of metadata $metaname for $section. When $fallback_mode
# is "always", or is "unless_section_metadata_exists" and the section has no
# values of its own, the top section's values are appended.
sub _textedit_metadata {
    my ($self, $doc_obj, $section, $metaname, $fallback_mode) = @_;

    my @values = @{$doc_obj->get_metadata ($section, $metaname)};
    if (defined $fallback_mode) {
        if ($fallback_mode eq "always" || (scalar(@values) == 0 && $fallback_mode eq "unless_section_metadata_exists")) {
            push (@values, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $metaname)});
        }
    }
    return @values;
}

# Adjust num_processed_bytes by the length of $new_text and return the text
# to append to the output: $new_text for add/update, "" for a delete.
sub _textedit_account {
    my ($self, $edit_mode, $new_text) = @_;

    if (($edit_mode eq "add") || ($edit_mode eq "update")) {
        $self->{'num_processed_bytes'} += length ($new_text);
        return $new_text;
    }

    # delete
    $self->{'num_processed_bytes'} -= length ($new_text);
    return "";
}
439
# Output a document for indexing: textedit() in "add" mode.
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
446
# Output a document for re-indexing: textedit() in "update" mode.
sub textreindex
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
454
# Output a document for removal from the index: textedit() in "delete" mode.
sub textdelete
{
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "delete");
}
462
463
464
465
466
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In "incinfodb" mode the document's sections are annotated with the
#  *  metadata the info database needs (doctype, hastxt, archivedir, thistype,
#  *  childtype, contains, docnum) and the document is then handed to
#  *  IncrementalBuildUtils::addDocument. In every other mode the call is
#  *  passed straight through to mgppbuildproc::process.
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the GAPlug
#  *  @param  $file    The name of the file parsed as a string
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we
    # want to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb") {
        return $self->mgppbuildproc::process(@_);
    }

    print STDERR "*** Processing a document added using INCINFODB ***\n" if ($self->{'verbosity'} > 3);

    # directory part of $file, with forward slashes and no leading or
    # trailing slash; empty if $file has no directory part
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($self->{'verbosity'} > 3);

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0) {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if ($self->{'verbosity'} > 3);
    while (defined $section) {
        print STDERR "+ processing section: '$section'\n" if ($self->{'verbosity'} > 3);

        # Attach all the other metadata to this document
        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $hastxt = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $hastxt);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);

            # each entry is a double quote followed by ".<n>", where <n> is
            # the child's trailing number if it has one, else the child id
            my @contains = map { ($_ =~ /^.*?\.(\d+)$/) ? "\".$1" : "\".$_" } @$children;
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        #output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($self->{'verbosity'} > 3);
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($self->{'verbosity'} > 3);
    return &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
581
582
583# Following methods seem to be no different to those defined in basebuildproc.pm
584# From inspection, it looks like these ones can be removed
585
586
# Accessor: the running document count kept in $self->{'num_docs'}.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
592
# Accessor: the running section count kept in $self->{'num_sections'}.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
598
# Accessor for $self->{'num_bytes'}: the actual number of bytes in the
# collection, which is normally the same as what is processed during text
# compression.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
606
607
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
#   "...farming</p>\n<p>EDWARD.." into "farmingedward"
#     (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
#   $text        the text to clean up
#   $strip_html  must be true; when false nothing is done and the sub
#                returns nothing (undef in scalar context)
#   $para        "" (or undef) to replace every tag with a space; otherwise
#                tags are passed to process_tags() together with $para
#
# Returns the cleaned text: tags removed, named entities removed, and every
# '&' that does not start a numeric character reference replaced by a space.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if (!defined $para || $para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text passed to Lucene for indexing
    #   may not be valid XML (eg. if HTML-only entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities (which are valid XML).
    # A lookahead is used rather than consuming the following character, so
    # that an '&' at the very end of the text and the second of two
    # adjacent '&'s are removed as well.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
643
# Remove a document's associated files by calling
# basebuildproc::delete_assoc_files with the same arguments. When
# $edit_mode is "delete", the document's directory under
# <build_dir>/text/<archivedir> (the lucene text version) is removed too,
# if it exists.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    $self->basebuildproc::delete_assoc_files(@_);

    if ($edit_mode eq "delete") {
        my $lucene_text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text", $archivedir);
        &FileUtils::removeFilesRecursive($lucene_text_dir) if (-d $lucene_text_dir);
    }
}
659
# Build the shortname for a sort field: "by" followed by the index shortname
# of $realname. An index shortname already recorded in
# $self->{'fieldnamemap'} is reused; otherwise one is made with
# create_shortname().
sub create_sortfield_shortname {
    my ($self, $realname) = @_;

    my $existing = $self->{'fieldnamemap'}->{$realname};
    my $index_shortname = defined $existing
        ? $existing
        : $self->create_shortname($realname);

    return "by" . $index_shortname;
}
674 
675
6761;
677
678
Note: See TracBrowser for help on using the browser.