Changeset 3542


Ignore:
Timestamp:
2002-11-21T10:33:13+13:00 (21 years ago)
Author:
jrm21
Message:

ghtml returns UTF-8, not ISO-8859-1, so any HTML entities were being converted
twice. We now convert the text to UTF-8 first.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/HBPlug.pm

    r3540 r3542  
    4444use ghtml;
    4545use BasPlug;
     46use unicode;
    4647use util;
    4748use doc;
     
    111112    close FILE;
    112113    }
    113 }
    114 
     114    # text is in utf8
     115}
     116
     117# converts the text to utf8, as ghtml does that for &eacute; etc.
    115118sub HB_gettext {
    116119    my $self = shift (@_);
     
    139142    $line =~ s/<\/?(body|html|font)\b[^>]*>//ig; # remove any unwanted tags
    140143
    141     # convert any alphanumeric character entities to their extended
    142     # ascii equivalent for indexing purposes
    143     &ghtml::convertcharentities ($line);
    144 
    145144    $$text .= $line;
    146145    }
     146    #
     147    if ($self->{'input_encoding'} eq "iso_8859_1") {
     148    # convert to utf-8
     149    $$text=&unicode::unicode2utf8(&unicode::convert2unicode("iso_8859_1", $text));
     150    }
     151    # convert any alphanumeric character entities to their utf-8
     152    # equivalent for indexing purposes
     153    &ghtml::convertcharentities ($$text);
     154
    147155    $$text =~ s/\s+/ /g; # remove \n's
    148156}
     
    205213    my ($doc_obj, $cursection, $field, $value) = @_;
    206214
    207     if ($self->{'input_encoding'} eq "ascii") {
    208     $doc_obj->add_utf8_metadata ($cursection, $field, $value);
    209     } else {
    210     $doc_obj->add_metadata ($cursection, $field, $value);
    211     }
     215# All text should now be in utf-8
     216#    if ($self->{'input_encoding'} eq "ascii") {
     217    $doc_obj->add_utf8_metadata ($cursection, $field, $value);
     218#    } else {
     219#   $doc_obj->add_metadata ($cursection, $field, $value);
     220#    }
    212221}
    213222
     
    234243    my $html = "";
    235244    $self->HB_read_html_file ($htmlfile, \$html);
     245    # html is in utf8
    236246
    237247    # create a new document
     
    305315
    306316        # add the text for this section
    307         if ($self->{'input_encoding'} eq "ascii") {
    308         $doc_obj->add_utf8_text ($cursection, $sectiontext);
    309         } else {
    310         $doc_obj->add_text ($cursection, $sectiontext);
    311         }
     317# All read text should now be in utf-8
     318#       if ($self->{'input_encoding'} eq "ascii") {
     319        $doc_obj->add_utf8_text ($cursection, $sectiontext);
     320#       } else {
     321#       $doc_obj->add_text ($cursection, $sectiontext);
     322#       }
    312323    } else {
    313324        print $outhandle "WARNING - leftover text\n" , $self->shorten($html),
Note: See TracChangeset for help on using the changeset viewer.