Changeset 7798 for trunk/gsdl


Ignore:
Timestamp:
2004-07-22T12:13:45+12:00 (20 years ago)
Author:
jrm21
Message:

Added a function, unicode::ensure_utf8(), that tests whether the string behind
the given string ref is encoded as UTF-8, and modifies it if necessary.

add_utf8_metadata() makes use of this function (e.g. previously, source
filenames could cause problems).

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/doc.pm

    r7569 r7798  
    788788}
    789789
    790 # add_utf8_metadata assumes the text has already been converted
    791 # to the UTF-8 encoding.
    792790sub add_utf8_metadata {
    793791    my $self = shift (@_);
     
    809807    }
    810808   
     809    # double check that the value is utf-8
     810    if (unicode::ensure_utf8(\$value)) {
     811    print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
     812    }
     813
    811814    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
    812815}
  • trunk/gsdl/perllib/unicode.pm

    r4229 r7798  
    66# University of Waikato, New Zealand.
    77#
    8 # Copyright (C) 1999 New Zealand Digital Library Project
     8# Copyright (C) 1999-2004 New Zealand Digital Library Project
    99#
    1010# This program is free software; you can redistribute it and/or modify
     
    468468}
    469469
     470
     471# this makes sure that the referenced input string is utf8 encoded, and
     472# will change/remove bytes that aren't.
     473# returns 0 if the text was already utf8, or 1 if text modified to become utf8
     474sub ensure_utf8 {
     475    my $stringref=shift;
     476
     477    my $value=$$stringref;
     478
     479    my $non_utf8_found = 0;
     480    $value =~ m/^/g; # to set \G
     481    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
     482    my $highbytes=$1;
     483    my $highbyteslength=length($highbytes);
     484    # make sure this block of high bytes is utf-8
     485    $highbytes =~ /^/g; # set pos()
     486    my $byte_replaced = 0;
     487    while ($highbytes =~
     488        m!\G (?: [\xc0-\xdf][\x80-\xbf]    | # 2 byte utf-8
     489            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
     490            [\xf0-\xf7][\x80-\xbf]{3}   # 4 byte
     491            [\xf8-\xfb][\x80-\xbf]{4}   # 5 byte
     492            [\xfc-\xfd][\x80-\xbf]{5}   # 6 byte
     493            )*([\x80-\xff])? !xg
     494        ) {
     495        # this highbyte is "out-of-place" for valid utf-8
     496        my $badbyte=$1;
     497        if (!defined $badbyte) {next} # hit end of string
     498        my $pos=pos($highbytes);
     499        # replace bad byte. assume iso-8859-1 -> utf-8
     500        # ascii2utf8 does "extended ascii"... ie iso-8859-1
     501        my $replacement=&unicode::ascii2utf8(\$badbyte);
     502        substr($highbytes, $pos-1, 1, $replacement);
     503        # update the position to continue searching (for \G)
     504        pos($highbytes) = $pos+length($replacement)-1;
     505        $byte_replaced = 1;
     506    }
     507    if ($byte_replaced) {
     508        # replace this block of high bytes in the $value
     509        $non_utf8_found = 1;
     510        my $replength=length($highbytes); # we've changed the length
     511        my $textpos=pos($value); # pos at end of last match
     512        # replace bad bytes with good bytes
     513        substr($value, $textpos-$highbyteslength,
     514                $highbyteslength, $highbytes);
     515        # update the position to continue searching (for \G)
     516        pos($value)=$textpos+($replength-$highbyteslength)+1;
     517    }
     518    }
     519
     520    $stringref = \$value;
     521    return $non_utf8_found;
     522}
     523
    4705241;
Note: See TracChangeset for help on using the changeset viewer.