Changeset 16553


Ignore:
Timestamp:
2008-07-25T16:37:50+12:00 (13 years ago)
Author:
ak19
Message:

Added method check_is_utf8 that will return 1/true if the given string is utf8 and 0/false if it isn't. Does not modify the string parameter. Based on ensure_utf8.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/unicode.pm

    r15894 r16553  
    545545
    546546
     547# Returns true (1) if the given string is utf8 and false (0) if it isn't.
     548# Does not modify the string parameter.
     549sub check_is_utf8 {
     550    my $value=shift;
     551
     552    if (!defined($value)) {
     553    return 0; # not utf8 because it is undefined
     554    }
     555
     556    $value =~ m/^/g; # to set \G
     557    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
     558    my $highbytes=$1;
     559    # make sure this block of high bytes is utf-8
     560    $highbytes =~ /^/g; # set pos()
     561    while ($highbytes =~
     562        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8
     563            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
     564            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
     565            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
     566            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
     567            )*([\x80-\xff])? !xg
     568        ) {
     569        my $badbyte=$1;
     570        if (defined $badbyte) { # not end of string
     571        return 0; # non-utf8 found
     572        }
     573    }
     574    }
     575   
     576    return 1;
     577}
     578
    547579sub substr
    548580{
Note: See TracChangeset for help on using the changeset viewer.