Changeset 16553

Show
Ignore:
Timestamp:
25.07.2008 16:37:50 (11 years ago)
Author:
ak19
Message:

Added method check_is_utf8 that will return 1/true if the given string is utf8 and 0/false if it isn't. Does not modify the string parameter. Based on ensure_utf8.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/unicode.pm

    r15894 r16553  
    545545 
    546546 
     547# Returns true (1) if the given string is utf8 and false (0) if it isn't. 
     548# Does not modify the string parameter. 
     549sub check_is_utf8 { 
     550    my $value=shift; 
     551 
     552    if (!defined($value)) { 
     553    return 0; # not utf8 because it is undefined 
     554    } 
     555 
     556    $value =~ m/^/g; # to set \G 
     557    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) { 
     558    my $highbytes=$1; 
     559    # make sure this block of high bytes is utf-8 
     560    $highbytes =~ /^/g; # set pos() 
     561    while ($highbytes =~ 
     562        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8 
     563            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte 
     564            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte 
     565            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte 
     566            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte 
     567            )*([\x80-\xff])? !xg 
     568        ) { 
     569        my $badbyte=$1; 
     570        if (defined $badbyte) { # not end of string 
     571        return 0; # non-utf8 found 
     572        }  
     573    } 
     574    } 
     575     
     576    return 1; 
     577} 
     578 
    547579sub substr 
    548580{