Changeset 25788 for main/trunk/greenstone2/perllib/cnseg.pm
- Timestamp: 2012-06-14T11:03:14+12:00 (12 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/cnseg.pm
r16980 r25788 49 49 my ($in) = @_; 50 50 my ($c); 51 51 my ($cl); 52 my $len = length($in); 53 my $i = 0; 54 my $out = ""; 55 my $space = 1; # start doesn't need a space 56 while ($i < $len) { 57 $c = substr ($in, $i, 1); 58 $cl = ord($c); 59 if (($cl >= 0x2e80 && $cl <= 0xd7a3) || 60 ( $cl >= 0xf900 && $cl <= 0xfa6a)) { # main east asian codes 61 # currently c++ receptionist code can't handle these large numbers 62 # search terms need to be segmented the same way. Add these back 63 # in when fix up c++ 64 # ($cl >= 0x20000 && $cl <= 0x2a6d6) || # cjk unified ideographs ext B 65 # ($cl >= 0x2f800 && $cl <= 0x2fa1d)) { #cjk compatibility ideographs supplement 66 # CJK character 67 $out .= chr(0x200b) unless $space; 68 $out .= $c; 69 $out .= chr(0x200b); 70 $space = 1; 71 } else { 72 $out .=$c; 73 $space = 0; 74 } 75 $i++; 76 } 77 return $out; 78 } 79 80 sub segment_old { 81 my ($in) = @_; 82 my ($c); 52 83 my $uniin = &unicode::utf82unicode($in); 53 84 my $out = [];
Note: See TracChangeset for help on using the changeset viewer.