Changeset 3542
- Timestamp: 2002-11-21T10:33:13+13:00 (21 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/HBPlug.pm
r3540 r3542 44 44 use ghtml; 45 45 use BasPlug; 46 use unicode; 46 47 use util; 47 48 use doc; … … 111 112 close FILE; 112 113 } 113 } 114 114 # text is in utf8 115 } 116 117 # converts the text to utf8, as ghtml does that for é etc. 115 118 sub HB_gettext { 116 119 my $self = shift (@_); … … 139 142 $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags 140 143 141 # convert any alphanumeric character entities to their extended142 # ascii equivalent for indexing purposes143 &ghtml::convertcharentities ($line);144 145 144 $$text .= $line; 146 145 } 146 # 147 if ($self->{'input_encoding'} eq "iso_8859_1") { 148 # convert to utf-8 149 $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text)); 150 } 151 # convert any alphanumeric character entities to their utf-8 152 # equivalent for indexing purposes 153 &ghtml::convertcharentities ($$text); 154 147 155 $$text =~ s/\s+/ /g; # remove \n's 148 156 } … … 205 213 my ($doc_obj, $cursection, $field, $value) = @_; 206 214 207 if ($self->{'input_encoding'} eq "ascii") { 208 $doc_obj->add_utf8_metadata ($cursection, $field, $value); 209 } else { 210 $doc_obj->add_metadata ($cursection, $field, $value); 211 } 215 # All text should now be in utf-8 216 # if ($self->{'input_encoding'} eq "ascii") { 217 $doc_obj->add_utf8_metadata ($cursection, $field, $value); 218 # } else { 219 # $doc_obj->add_metadata ($cursection, $field, $value); 220 # } 212 221 } 213 222 … … 234 243 my $html = ""; 235 244 $self->HB_read_html_file ($htmlfile, \$html); 245 # html is in utf8 236 246 237 247 # create a new document … … 305 315 306 316 # add the text for this section 307 if ($self->{'input_encoding'} eq "ascii") { 308 $doc_obj->add_utf8_text ($cursection, $sectiontext); 309 } else { 310 $doc_obj->add_text ($cursection, $sectiontext); 311 } 317 # All read text should now be in utf-8 318 # if ($self->{'input_encoding'} eq "ascii") { 319 $doc_obj->add_utf8_text ($cursection, $sectiontext); 320 # } else { 321 # 
$doc_obj->add_text ($cursection, $sectiontext); 322 # } 312 323 } else { 313 324 print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
Note: See TracChangeset for help on using the changeset viewer.