Changeset 25788

Show
Ignore:
Timestamp:
14.06.2012 11:03:14 (7 years ago)
Author:
kjdon
Message:

The segmentation code was assuming strings were in UTF-8, but we have changed to using Unicode-aware strings, so no conversion is needed.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cnseg.pm

    r16980 r25788  
    4949    my ($in) = @_; 
    5050    my ($c); 
    51  
     51    my ($cl); 
     52    my $len = length($in); 
     53    my $i = 0; 
     54    my $out = ""; 
     55    my $space = 1; # start doesn't need a space 
     56    while ($i < $len) { 
     57    $c = substr ($in, $i, 1); 
     58    $cl = ord($c); 
     59    if (($cl >= 0x2e80 && $cl <= 0xd7a3) || 
     60        ( $cl >= 0xf900 && $cl <= 0xfa6a)) { # main east asian codes 
     61        # currently c++ receptionist code can't handle these large numbers 
     62        # search terms need to be segmented the same way. Add these back 
     63        # in when fix up c++ 
     64        # ($cl >= 0x20000 && $cl <= 0x2a6d6) || # cjk unified ideographs ext B 
     65        # ($cl >= 0x2f800 && $cl <= 0x2fa1d)) { #cjk compatibility ideographs supplement 
     66        # CJK character 
     67        $out .= chr(0x200b) unless $space; 
     68        $out .= $c; 
     69        $out .= chr(0x200b); 
     70        $space = 1; 
     71    } else { 
     72        $out .=$c; 
     73        $space = 0; 
     74    } 
     75    $i++; 
     76    } 
     77    return $out; 
     78} 
     79     
     80sub segment_old { 
     81    my ($in) = @_; 
     82    my ($c); 
    5283    my $uniin = &unicode::utf82unicode($in); 
    5384    my $out = [];