Changeset 12426


Ignore:
Timestamp:
2006-08-09T15:54:41+12:00 (18 years ago)
Author:
mdewsnip
Message:

Deleted the code for removing entities, since it seemed to be counterproductive (and was applied twice in many situations). When compressing the text, htmlsafe is called on the section text, so the XML will be valid in this case. When indexing the text, the HTML tags are stripped out ('strip_html' is always set for Lucene), so there is no problem in this case either.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/lucenebuildproc.pm

    r12424 r12426  
    56 56    # Unlike MG and MGPP, Lucene supports incremental building
    57 57    return 1;
    58 }
    59 
    60 
    61 sub preprocess_text {
    62     my $self = shift (@_);
    63     my ($text, $strip_html, $para) = @_;
    64 
    65     # call the mgpp method first
    66     my ($new_text) = $self->SUPER::preprocess_text($text, $strip_html, $para);
    67 
    68     # remove entities
    69     $new_text =~ s/&\w{1,10};//g;
    70     # remove &
    71     $new_text =~ s/&//g;
    72    
    73     return $new_text;
    74 58 }
    75 59
     
    196 180            }
    197 181        }
    198            
    199 182        }
    200 183        else {
     
    215 198            my $section_text = $doc_obj->get_text($section);
    216 199            if ($self->{'indexing_text'}) {
    217                             # tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
     200                            # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
    218 201                $new_text .= "$parastarttag<$shortname index=\"1\">\n";
    219 202                if ($parastarttag ne "") {
    220                 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
     203                $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
    221 204                }
    222 205                else {
    223 206                # we don't want to individually tag each paragraph if not doing para indexing
    224                 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
     207                $section_text = $self->preprocess_text($section_text, 1, "");
    225 208                }
    226 209                $new_text .= "$section_text</$shortname>$paraendtag\n";
     
    245 228            $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
    246 229        }
    247 
    248         # remove entities
    249         $new_text =~ s/&\w{1,10};//g;
    250         # remove &
    251         $new_text =~ s/&//g;
    252 230        }
    253 231
Note: See TracChangeset for help on using the changeset viewer.