Changeset 12426
- Timestamp:
- 2006-08-09T15:54:41+12:00 (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuildproc.pm
r12424 r12426
  56   56     # Unlike MG and MGPP, Lucene supports incremental building
  57   57     return 1;
  58        - }
  59        -
  60        -
  61        - sub preprocess_text {
  62        - my $self = shift (@_);
  63        - my ($text, $strip_html, $para) = @_;
  64        -
  65        - # call the mgpp method first
  66        - my ($new_text) = $self->SUPER::preprocess_text($text, $strip_html, $para);
  67        -
  68        - # remove entities
  69        - $new_text =~ s/&\w{1,10};//g;
  70        - # remove &
  71        - $new_text =~ s/&//g;
  72        -
  73        - return $new_text;
  74   58     }
  75   59
   …    …
 196  180     }
 197  181     }
 198        -
 199  182     }
 200  183     else {
   …    …
 215  198     my $section_text = $doc_obj->get_text($section);
 216  199     if ($self->{'indexing_text'}) {
 217        - # tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
      200   + # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
 218  201     $new_text .= "$parastarttag<$shortname index=\"1\">\n";
 219  202     if ($parastarttag ne "") {
 220        - $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
      203   + $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
 221  204     }
 222  205     else {
 223  206     # we don't want to individually tag each paragraph if not doing para indexing
 224        - $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
      207   + $section_text = $self->preprocess_text($section_text, 1, "");
 225  208     }
 226  209     $new_text .= "$section_text</$shortname>$paraendtag\n";
   …    …
 245  228     $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
 246  229     }
 247        -
 248        - # remove entities
 249        - $new_text =~ s/&\w{1,10};//g;
 250        - # remove &
 251        - $new_text =~ s/&//g;
 252  230     }
 253  231
Note:
See TracChangeset
for help on using the changeset viewer.