Changeset 14068
- Timestamp:
- 2007-05-14T11:34:42+12:00 (17 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuildproc.pm
# Lucene-specific variant of mgppbuildproc's preprocess_text.
# The difference is that stripped tags are replaced by a space rather than
# by nothing; otherwise the removal of tags below might lead to Lucene turning
#   "...farming</p>\n<p>EDWARD.." into "farmingedward"
# (example from demo collection b20cre)
# Many thanks to John Thompson, DL Consulting Ltd. (www.dlconsulting.com)
#
# Arguments:
#   $text       - the text to clean up
#   $strip_html - when false, nothing is processed and the sub returns
#                 nothing (undef in scalar context, empty list in list context)
#   $para       - paragraph marker; when it is the empty string every tag is
#                 replaced by a single space, otherwise each tag is handed to
#                 $self->process_tags together with $para
# Returns the processed copy of $text (the caller's string is not modified).
sub preprocess_text
{
    my ($self, $text, $strip_html, $para) = @_;

    # at this stage, paragraph tags are only handled when strip_html is set -
    # doing otherwise would result in a huge mess of non-xml
    if (!$strip_html) {
        return;
    }

    my $stripped = $text;

    # <pre> sections may contain literal < and > characters; these are dealt
    # with by remove_gtlt first, before the generic tag stripping below
    $stripped =~ s/<pre>(.*?)<\/pre>/$self->remove_gtlt($1,$para)/gse;

    if ($para ne "") {
        # a paragraph marker was supplied: let process_tags decide what each
        # tag becomes (<p> tags get turned into $para)
        $stripped =~ s/<([^>]*)>/$self->process_tags($1, $para)/gse;
        return $stripped;
    }

    # no paragraph marker: replace every tag with a space so that the words
    # on either side of it stay separate
    $stripped =~ s/<[^>]*>/ /gs;
    return $stripped;
}
Note:
See TracChangeset
for help on using the changeset viewer.