Changeset 9186
- Timestamp:
- 2005-02-25T14:27:16+13:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuildproc.pm
r9178 r9186 32 32 33 33 use mgppbuildproc; 34 34 use ghtml; 35 35 36 36 sub BEGIN { … … 44 44 45 45 return bless $self, $class; 46 } 47 48 sub preprocess_text { 49 my $self = shift (@_); 50 my ($text, $strip_html, $para) = @_; 51 52 my ($outtext) = ""; 53 if ($strip_html) { 54 while ($text =~ /<([^>]*)>/ && $text ne "") { 55 56 my $tag = $1; 57 $outtext .= $`." "; #add everything before the matched tag 58 $text = $'; #'everything after the matched tag 59 if ($para && $tag =~ /^\s*p\s/i) { 60 $outtext .= $para; 61 } 62 elsif ($tag =~ /^pre$/) { # a pre tag 63 $text =~ /<\/pre>/; # find the closing pre tag 64 my $tmp_text = $`; #everything before the closing pre tag 65 $text = $'; #'everything after the </pre> 66 $tmp_text =~ s/[<>]//g; # remove all < and > 67 $outtext.= $tmp_text . " "; 68 } 69 } 70 71 $outtext .= $text; # add any remaining text 72 } #if strip_html 73 else { 74 $outtext = $text; 75 } 76 #if ($para) { 77 #$text =~ s/(<p\b)/$para$1/gi; 78 #return $text; 79 #} 80 81 # remove entities 82 $outtext =~ s/&\w{1,10};//g; 83 84 return $outtext; 46 85 } 47 86 … … 102 141 my $docid=""; 103 142 if ($ldoc_level) { 104 my $doc_sec_num = $self->{'num_sections'}+1; 105 $docid = "gs2:id=\"$doc_sec_num\""; 143 if ($lsec_level) { 144 145 my $doc_sec_num = $self->{'num_sections'}+1; 146 $docid = "gs2:id=\"$doc_sec_num\""; 147 } else { 148 my $doc_sec_num = $self->{'num_docs'}; 149 $docid = "gs2:id=\"$doc_sec_num\""; 150 } 106 151 } 107 152 … … 175 220 if ($self->{'store_text'}) { 176 221 $tmp_text .= $doc_obj->get_text ($section); 177 $tmp_text =~ s/</&lt;/g; 178 $tmp_text =~ s/>/&gt;/g; 222 &ghtml::htmlsafe($tmp_text); 179 223 $new_text .= $tmp_text; 180 224 } … … 227 271 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) { 228 272 $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n"; 273 # remove entities 274 $new_text =~ s/&\w{1,10};//g; 229 275 } 230 276 } … … 234 280 # filter the text 235 281 $self->filter_text ($field, 
$new_text); 236 237 282 $self->{'num_processed_bytes'} += length ($new_text); 238 283 $text .= "$new_text"; … … 246 291 } #while defined section 247 292 print $handle "$text\n$documentendtag"; 248 249 293 } 250 294
Note:
See TracChangeset
for help on using the changeset viewer.