root/main/trunk/greenstone2/perllib/lucenebuildproc.pm @ 24404

Revision 24404, 18.6 KB (checked in by ak19, 8 years ago)

Changes to perl code to do with removing the ex. prefix: ex. is only removed if it is the sole prefix (i.e. ex.dc.* prefixes are not removed).

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuildproc.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuildproc;
27
28# This document processor outputs a document
29# for lucene to process
30
31# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
32
33use mgppbuildproc;
34use ghtml;
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
38
39use IncrementalBuildUtils;
40
# Set up inheritance at compile time: this build processor specialises
# mgppbuildproc, so any method not defined here is looked up there.
BEGIN {
    @lucenebuildproc::ISA = qw(mgppbuildproc);
}
44
45
# Constructor.  Builds the object with the mgppbuildproc constructor (all
# arguments are forwarded unchanged), initialises the counter used for
# incrementally added documents, and re-blesses the result into $class.
#
# The parent constructor is invoked with the explicit Class->new form.  The
# indirect-object form ("new Class (...)") is resolved by a parser heuristic
# and is fragile inside a package that defines its own sub called 'new'.
sub new {
    my $class = shift @_;
    my $self = mgppbuildproc->new(@_);

    # 0 means "no incremental document processed yet"; process() replaces
    # it with a real starting value on first use
    $self->{'numincdocs'} = 0;

    return bless $self, $class;
}
54
55
# Capability query: always answers true (1), because a Lucene index --
# unlike MG and MGPP ones -- can be built incrementally.
sub is_incremental_capable {
    my ($self) = @_;

    return 1;
}
63
64
# Private helper for textedit(): book-keeps one chunk of index text.
# For "add"/"update" the chunk's length is added to num_processed_bytes and
# the chunk is returned so the caller can append it to the output; for any
# other mode (i.e. delete) the length is subtracted and "" is returned.
sub _account_index_text {
    my ($self, $edit_mode, $new_text) = @_;

    if (($edit_mode eq "add") || ($edit_mode eq "update")) {
        $self->{'num_processed_bytes'} += length ($new_text);
        return $new_text;
    }

    # delete
    $self->{'num_processed_bytes'} -= length ($new_text);
    return "";
}

# Write one document, as XML, to the handle stored in
# $self->{'output_handle'}.
#
#   $doc_obj    the document object to serialise
#   $file       file name recorded in the document tag's file="" attribute
#   $edit_mode  "add", "update" or "delete"; every mode other than add/update
#               is accounted for as a deletion
#
# Nothing is written when the document's type is not "indexed_doc", or when
# text is not being indexed and the mode is "delete".
#
# For every section a section tag is written (when a section level is
# configured) carrying the section OID and the edit mode.  In add/update mode
# the tag wraps one element per index field, an element per metadata item if
# a "metadata" index was requested, and a combined <ZZ> element if an
# "allfields" index was requested.  In delete mode only the tags are written.
# Sections that are not to be indexed get an empty placeholder tag with
# gs2:mode="ignore" so that section numbering stays aligned with the database.
#
# Side effects: updates num_docs, num_sections, num_bytes,
# num_processed_bytes, indexfieldmap and indexfields on $self.
sub textedit {
    my $self = shift (@_);
    my ($doc_obj, $file, $edit_mode) = @_;

    my $lucenehandle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # skip this document if in "compress-text" mode and asked to delete it
    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));

    my $indexed_doc = $self->is_subcollection_doc($doc_obj);

    my $adding = (($edit_mode eq "add") || ($edit_mode eq "update"));

    # this is another document
    if ($adding) {
        $self->{'num_docs'} += 1;
    }
    else {
        $self->{'num_docs'} -= 1;
    }

    # get the parameters for the output
    # split on : just in case there is subcoll and lang stuff
    my ($fields) = split (/:/, $self->{'index'});
    my @index_fields = split (/;/, $fields);

    # The index specification is the same for every section, so work out
    # once whether the special "allfields" and "metadata" indexes are wanted.
    my $allfields_index = 0;
    my $all_metadata_specified = 0;
    foreach my $field (@index_fields) {
        if ($field eq "allfields") {
            $allfields_index = 1;
        }
        elsif ($field eq "metadata") {
            $all_metadata_specified = 1;
        }
    }

    my $doc_tag_name = $mgppbuildproc::level_map{'document'};

    # a section tag is only written when a section level is configured
    my $levels = $self->{'levels'};
    my $sec_tag_name = "";
    if ($levels->{'section'}) {
        $sec_tag_name = $mgppbuildproc::level_map{'section'};
    }

    # The numeric gs2:id attribute is deprecated and no longer written;
    # documents and sections are identified by gs2:docOID only.
    my $gs2_docOID = $doc_obj->get_OID();

    my $text = "<$doc_tag_name xmlns:gs2=\"http://www.greenstone.org/gs2\" file=\"$file\"  gs2:docOID=\"$gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
    my $documentendtag = "\n</$doc_tag_name>\n";

    my $doc_section = 0; # just for this document

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'}++;

        my $sec_gs2_docOID = $gs2_docOID;
        $sec_gs2_docOID .= ".$section" if ($section ne "");

        # if we are doing subcollections, then some docs shouldn't be indexed.
        # but we need to put the section tag placeholders in there so the
        # sections match up with database
        my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section";
        if (($indexed_doc == 0) || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) {
            if ($sec_tag_name ne "") {
                $text .= "\n<$sec_tag_name  gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"ignore\">\n";
                $text .= "\n</$sec_tag_name>\n";
            }
            $section = $doc_obj->get_next_section($section);
            next;
        }

        if ($sec_tag_name ne "") {
            $text .= "\n<$sec_tag_name  gs2:docOID=\"$sec_gs2_docOID\" gs2:mode=\"$edit_mode\">\n";
        }

        if ($adding) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
        }

        # which fields have already been indexed for this section?
        # (same as fields, but in a map)
        my $specified_fields = {};
        # collect up all the text for the allfields index in here
        my $allfields_text = "";

        foreach my $field (@index_fields) {

            # only deal with this field if it doesn't start with top or
            # this is the first section
            my $real_field = $field;
            next if (($real_field =~ s/^top//) && ($doc_section != 1));

            # process these two later
            next if ($real_field eq "allfields" || $real_field eq "metadata");

            # individual metadata and or text specified - could be a comma
            # separated list
            $specified_fields->{$real_field} = 1;

            my $shortname = "";
            my $new_field = 0; # have we found a new field name?
            if (defined $self->{'indexfieldmap'}->{$real_field}) {
                $shortname = $self->{'indexfieldmap'}->{$real_field};
            }
            else {
                $shortname = $self->create_shortname($real_field);
                $new_field = 1;
            }

            my @metadata_list = (); # put any metadata values in here
            my $section_text = "";  # put the text in here
            foreach my $submeta (split /,/, $real_field) {
                if ($submeta eq "text") {
                    # no point in indexing text more than once
                    if ($section_text eq "") {
                        $section_text = $doc_obj->get_text($section);
                        if ($self->{'indexing_text'}) {
                            # we always strip html
                            $section_text = $self->preprocess_text($section_text, 1, "");
                        }
                        else {
                            # leave html stuff in, but escape the tags
                            &ghtml::htmlsafe($section_text);
                        }
                    }
                }
                else {
                    # strip off ex. iff it's the only metadata set prefix
                    # (will leave ex.dc.* intact)
                    $submeta =~ s/^ex\.([^.]+)$/$1/;

                    # its a metadata element
                    my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
                    if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
                        # optionally let sub-sections inherit the top
                        # section's values for this element
                        if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
                            push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
                        }
                    }
                    push (@metadata_list, @section_metadata);
                }
            } # for each field in this one index

            # nothing found for this index in this section
            next unless ($section_text ne "" || scalar(@metadata_list));

            # now we add the text and/or metadata into new_text
            my $new_text = "";
            if ($section_text ne "") {
                $new_text .= "$section_text ";
            }
            foreach my $item (@metadata_list) {
                &ghtml::htmlsafe($item);
                $new_text .= "$item ";
            }

            if ($allfields_index) {
                $allfields_text .= $new_text;
            }

            if ($self->{'indexing_text'}) {
                # add the tag
                $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
            }
            # filter the text
            $new_text = $self->filter_text ($field, $new_text);

            $text .= _account_index_text($self, $edit_mode, $new_text);

            if ($self->{'indexing_text'} && $new_field) {
                # we need to add to the list in indexfields
                $self->{'indexfieldmap'}->{$real_field} = $shortname;
                $self->{'indexfieldmap'}->{$shortname} = 1;
            }
        } # foreach field

        # has the user added a 'metadata' index?
        if ($all_metadata_specified) {
            my $new_text = "";
            my $metadata = $doc_obj->get_all_metadata ($section);
            foreach my $pair (@$metadata) {
                my ($mfield, $mvalue) = (@$pair);
                # no value
                next unless defined $mvalue && $mvalue ne "";
                # we have already indexed this
                next if defined ($specified_fields->{$mfield});
                # check fields here, maybe others dont want - change to use dontindex!!
                next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
                next if ($mfield =~ /^gsdl/);

                &ghtml::htmlsafe($mvalue);

                my $shortname;
                if (defined $self->{'indexfieldmap'}->{$mfield}) {
                    $shortname = $self->{'indexfieldmap'}->{$mfield};
                }
                else {
                    $shortname = $self->create_shortname($mfield);
                    $self->{'indexfieldmap'}->{$mfield} = $shortname;
                    $self->{'indexfieldmap'}->{$shortname} = 1;
                }
                $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
                if ($allfields_index) {
                    $allfields_text .= "$mvalue ";
                }

                if (!defined $self->{'indexfields'}->{$mfield}) {
                    $self->{'indexfields'}->{$mfield} = 1;
                }
            }
            # filter the text
            $new_text = $self->filter_text ("metadata", $new_text);

            $text .= _account_index_text($self, $edit_mode, $new_text);
        }

        # do we have an allfields index??
        if ($allfields_index) {
            # add the index name mapping
            $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
            $self->{'indexfieldmap'}->{"ZZ"} = 1;

            my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
            # filter the text
            $new_text = $self->filter_text ("allfields", $new_text);

            $text .= _account_index_text($self, $edit_mode, $new_text);
        }

        $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");

        $section = $doc_obj->get_next_section($section);
    } # while defined section

    print {$lucenehandle} "$text\n$documentendtag";
}
366
# Output a document that is being added to the index: delegates to
# textedit() with the edit mode "add".
sub text {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "add");
}
373
# Output a document that is being re-indexed: delegates to textedit()
# with the edit mode "update".
sub textreindex {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "update");
}
381
# Output a document that is being removed from the index: delegates to
# textedit() with the edit mode "delete".
sub textdelete {
    my ($self, $doc_obj, $file) = @_;

    return $self->textedit($doc_obj, $file, "delete");
}
389
390
391
392
393
# /** We make this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  In every mode other than "incinfodb" the call is handed straight to the
#  *  mgppbuildproc implementation with the same arguments.
#  *
#  *  In "incinfodb" mode each section of the document is decorated with the
#  *  metadata doctype, hastxt, docnum and -- where applicable -- archivedir,
#  *  thistype, childtype and contains; the document is then passed to
#  *  IncrementalBuildUtils::addDocument().  When db_level is "document" only
#  *  the top section is decorated.
#  *
#  *  Declared without a prototype: the sub takes arguments, and an empty
#  *  prototype would make any plain function-style call a compile error.
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the GAPlug
#  *  @param  $file    The name of the file parsed as a string
#  *  @return whatever the delegated call (mgppbuildproc::process or
#  *          IncrementalBuildUtils::addDocument) returns
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we
    # want to pass through to the superclass of build
    if ($self->get_mode() ne "incinfodb") {
        return $self->mgppbuildproc::process(@_);
    }

    my $verbose = ($self->{'verbosity'} > 3);

    print STDERR "*** Processing a document added using INCINFODB ***\n" if ($verbose);

    # directory part of $file, normalised to forward slashes with no
    # leading or trailing slash
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files
    print STDERR "There are " . scalar(@{$doc_obj->get_assoc_files()}) . " associated documents...\n" if ($verbose);

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0) {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $section = $doc_obj->get_top_section ();
    print STDERR "+ top section: '$section'\n" if ($verbose);

    while (defined $section) {
        print STDERR "+ processing section: '$section'\n" if ($verbose);

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $hastxt = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $hastxt);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children; each entry is a double quote followed
        # by ".<n>", where <n> is the child's trailing number (or the whole
        # child name when it does not end in ".<digits>")
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children) {
                my $suffix = ($child =~ /^.*?\.(\d+)$/) ? $1 : $child;
                push (@contains, "\".$suffix");
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n" if ($verbose);
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);
        # if no sections wanted, only add the docs
        last if ($self->{'db_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n" if ($verbose);

    # NOTE(review): the & call form is kept deliberately -- it bypasses any
    # prototype the callee may declare (its definition is not visible here).
    return &IncrementalBuildUtils::addDocument($self->{'collection'}, $self->{'infodbtype'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/
508
509
510# Following methods seem to be no different to those defined in basebuildproc.pm
511# From inspection, it looks like these ones can be removed
512
513
# Accessor: the document counter kept in $self->{'num_docs'}.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
519
# Accessor: the section counter kept in $self->{'num_sections'}.
sub get_num_sections {
    my ($self) = @_;

    return $self->{'num_sections'};
}
525
# Accessor for $self->{'num_bytes'}: the actual number of bytes in the
# collection.  This is normally the same as what's processed during text
# compression.
sub get_num_bytes {
    my ($self) = @_;

    return $self->{'num_bytes'};
}
533
534
# This is similar to mgppbuildproc's preprocess_text but adds extra spaces
# Otherwise the removal of tags below might lead to Lucene turning
#   "...farming</p>\n<p>EDWARD.." into "farmingedward"
#     (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
#   $text        the text to clean up
#   $strip_html  must be true; when false nothing is done and an empty
#                return (undef in scalar context) results
#   $para        "" to replace every tag with a space; otherwise tags are
#                handed to $self->process_tags() together with this value
#
# Returns the text with tags removed, named entities (e.g. &nbsp;) deleted
# and every '&' that does not start a numeric character reference replaced
# by a space, so that the result can be embedded in XML.
sub preprocess_text
{
    my $self = shift (@_);
    my ($text, $strip_html, $para) = @_;
    # at this stage, we do not do paragraph tags unless have strip_html -
    # it will result in a huge mess of non-xml
    return unless $strip_html;

    my $new_text = $text;

    # if we have <pre> tags, we can have < > inside them, need to delete
    # the <> before stripping tags
    $new_text =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para eq "") {
        # just remove all tags
        $new_text =~ s/<[^>]*>/ /gs;
    }
    else {
        # strip all tags except <p> tags which get turned into $para
        $new_text =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
    }

    # It's important that we remove name entities because otherwise the text
    # passed to Lucene for indexing may not be valid XML (eg. if HTML-only
    # entities like &nbsp; are used)
    $new_text =~ s/&\w{1,10};//g;

    # Remove stray '&' characters, except in &#nnnn; or &#xhhhh; entities
    # (which are valid XML).  A lookahead is used rather than consuming the
    # following character, so that an '&' at the very end of the text and
    # each '&' in a run such as "&&" are all caught.
    $new_text =~ s/&(?!\#)/ /g;

    return $new_text;
}
570
# Remove the files associated with a document.
#
#   $archivedir  the document's archive directory, relative to the build
#   $edit_mode   the edit mode; only "delete" triggers the extra clean-up
#
# The basebuildproc implementation is always called first with the same
# arguments.  When the document is being deleted, the directory holding its
# Lucene text version (<build_dir>/text/<archivedir>) is removed as well.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    $self->basebuildproc::delete_assoc_files(@_);

    return unless (defined $edit_mode && $edit_mode eq "delete");

    # Without an archive directory the path below would presumably collapse
    # to <build_dir>/text itself (NOTE(review): confirm against
    # util::filename_cat), and removing that would discard the text version
    # of every document.  Refuse to go on.
    return unless (defined $archivedir && $archivedir ne "");

    # if we are deleting the doc, then also delete the lucene text version
    my $assoc_dir = &util::filename_cat($self->{'build_dir'}, "text", $archivedir);
    if (-d $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
}
5861;
587
588
Note: See TracBrowser for help on using the browser.