Changeset 16980

Show
Ignore:
Timestamp:
25.08.2008 09:58:13 (11 years ago)
Author:
kjdon
Message:

CJK character segmentation: text_t chars are not big enough to hold code points > 0xffff, so the ranges above 0xffff have been commented out in both the C++ and the Perl code until we implement a better solution. These high ranges are only for extension sets anyway, so most common words will still be segmented.

Location:
gsdl/trunk
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/cnseg.pm

    r16641 r16980  
    5555    my $space = 1; # start doesn't need a space 
    5656    foreach $c (@$uniin) { 
    57     if (($c >= 0x2e80 && $c <= 0xfa6a) || # main east asian codes 
    58         ($c >= 0x20000 && $c <= 0x2a6d6) || # cjk unified ideographs ext B 
    59         ($c >= 0x2f800 && $c <= 0x2fa1d)) { #cjk compatibility ideographs supplement 
     57    if (($c >= 0x2e80 && $c <= 0xd7a3) || 
     58        ( $c >= 0xf900 && $c <= 0xfa6a)) { # main east asian codes 
     59        # currently c++ receptionist code can't handle these large numbers 
     60        # search terms need to be segmented the same way. Add these back 
     61        # in when fix up c++ 
     62       # ($c >= 0x20000 && $c <= 0x2a6d6) || # cjk unified ideographs ext B 
     63       # ($c >= 0x2f800 && $c <= 0x2fa1d)) { #cjk compatibility ideographs supplement 
    6064        # CJK character 
    6165        push (@$out, 0x200b) unless $space; 
  • gsdl/trunk/runtime-src/src/recpt/querytools.cpp

    r16645 r16980  
    305305      formattedstring.push_back(' '); 
    306306    } else if (segment) { 
    307       if ((*here >= 0x2e80 && *here <= 0xfa6a) || 
    308           (*here >= 0x20000 && *here <= 0x2a6d6) || 
    309       (*here >= 0x2f800 && *here <= 0x2fa1d)) { 
     307      if ((*here >= 0x2e80 && *here <= 0xd7a3) || 
     308      ( *here >= 0xf900 && *here <= 0xfa6a)) { 
     309    /* text_t not big enough to handle these. */ 
     310    /*    (*here >= 0x20000 && *here <= 0x2a6d6) || 
     311      (*here >= 0x2f800 && *here <= 0x2fa1d)) { */ 
    310312     
    311313    // CJK character