Ignore:
Timestamp:
2012-06-14T11:03:14+12:00 (12 years ago)
Author:
kjdon
Message:

The segmentation code was assuming strings were in UTF-8, but we have changed to using Unicode-aware strings, so no conversion is needed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/cnseg.pm

    r16980 r25788  
    4949    my ($in) = @_;
    5050    my ($c);
    51 
     51    my ($cl);
     52    my $len = length($in);
     53    my $i = 0;
     54    my $out = "";
     55    my $space = 1; # start doesn't need a space
     56    while ($i < $len) {
     57    $c = substr ($in, $i, 1);
     58    $cl = ord($c);
     59    if (($cl >= 0x2e80 && $cl <= 0xd7a3) ||
     60        ( $cl >= 0xf900 && $cl <= 0xfa6a)) { # main east asian codes
     61        # currently c++ receptionist code can't handle these large numbers
     62        # search terms need to be segmented the same way. Add these back
     63        # in when fix up c++
     64        # ($cl >= 0x20000 && $cl <= 0x2a6d6) || # cjk unified ideographs ext B
     65        # ($cl >= 0x2f800 && $cl <= 0x2fa1d)) { #cjk compatibility ideographs supplement
     66        # CJK character
     67        $out .= chr(0x200b) unless $space;
     68        $out .= $c;
     69        $out .= chr(0x200b);
     70        $space = 1;
     71    } else {
     72        $out .=$c;
     73        $space = 0;
     74    }
     75    $i++;
     76    }
     77    return $out;
     78}
     79   
     80sub segment_old {
     81    my ($in) = @_;
     82    my ($c);
    5283    my $uniin = &unicode::utf82unicode($in);
    5384    my $out = [];
Note: See TracChangeset for help on using the changeset viewer.