Changeset 12319 for trunk/indexers/mgpp


Ignore:
Timestamp:
2006-07-28T12:04:37+12:00 (18 years ago)
Author:
kjdon
Message:

The inverted file dictionary was not ordered properly. ASCII values were ordered case-insensitively (e.g. Ant, ant, bee, Cat), but non-ASCII values weren't. This means that xxx* doesn't work properly. So I have used unitool to do a proper Unicode case-insensitive ordering/matching.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/indexers/mgpp/text/UCArray.cpp

    r8692 r12319  
    2222#include "UCArray.h"
    2323#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */
    24 
     24#include "unitool.h"
    2525
    2626void SetCStr (UCArray &text, const char *cStr) {
     
    5858  }
    5959  cstr[i]='\0';
     60  return cstr;
     61}
     62unsigned char * MyGetCStr(const UCArray& text) {
     63
     64  unsigned char *cstr = new unsigned char[text.size()+1];
     65  cstr[0] = text.size();
     66  UCArray::const_iterator here = text.begin();
     67  UCArray::const_iterator end = text.end();
     68
     69  int i = 1;
     70  while (here != end) {
     71    cstr[i] = text[i-1];
     72    ++here; ++i;
     73  }
     74  //  cstr[i]='\0';
    6075  return cstr;
    6176}
     
    198213 * based upon ascii character sequences.
    199214 */
    200 static unsigned char casecharmap[] = {
    201     '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
    202     '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
    203     '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
    204     '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
    205     '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
    206     '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
    207     '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
    208     '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
    209     '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
    210     '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
    211     '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
    212     '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
    213     '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
    214     '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
    215     '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
    216     '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
    217     '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
    218     '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
    219     '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
    220     '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
    221     '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
    222     '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
    223     '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
    224     '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
    225     '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
    226     '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
    227     '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
    228     '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
    229     '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
    230     '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
    231     '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
    232     '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
    233 };
    234 
     215// static unsigned char casecharmap[] = {
     216//  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
     217//  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
     218//  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
     219//  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
     220//  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
     221//  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
     222//  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
     223//  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
     224//  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
     225//  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
     226//  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
     227//  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
     228//  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
     229//  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
     230//  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
     231//  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
     232//  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
     233//  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
     234//  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
     235//  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
     236//  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
     237//  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
     238//  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
     239//  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
     240//  '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
     241//  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
     242//  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
     243//  '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
     244//  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
     245//  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
     246//  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
     247//  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
     248// };
    235249
    236250int DictCompare (const UCArray &a1, const UCArray &a2) {
    237   unsigned int l1 = a1.size();
    238   unsigned int l2 = a2.size();
    239   unsigned int l = (l1 < l2) ? l1 : l2;
     251  unsigned short a1_out[256]; /* temp space */
     252  unsigned short a2_out[256]; /* temp space */
     253
     254  unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
     255  unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
     256 
     257  /* decode the words to unicode */
     258  utf8_word_to_unicode (a1_str, a1_out, 255);
     259  utf8_word_to_unicode (a2_str, a2_out, 255);
     260
     261  int l1 = a1_out[0];
     262  int l2 = a2_out[0];
     263
     264  int len = (l1 < l2) ? l1 : l2;
    240265  int pos = 0;
    241   register int diff = 0;
    242 
    243   UCArray::const_iterator a1Here = a1.begin();
    244   UCArray::const_iterator a2Here = a2.begin();
    245  
    246   while (l--) {
    247     if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0)
     266  int diff = 0;
     267  for (int i=1; i<=len; ++i) {
     268    if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) -
     269     unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) {
    248270      return diff;
    249     if (pos == 0 && (diff = *a1Here - *a2Here) != 0)
     271    }
     272    if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) {
    250273      pos = diff;
    251 
    252     ++a1Here;
    253     ++a2Here;
    254   }
    255 
     274    }
     275  }
    256276  return ((l1 - l2) ? (l1 - l2) : (pos));
    257 }
     277 
     278}
     279
     280// int DictCompare (const UCArray &a1, const UCArray &a2) {
     281//   unsigned int l1 = a1.size();
     282//   unsigned int l2 = a2.size();
     283//   unsigned int l = (l1 < l2) ? l1 : l2;
     284//   int pos = 0;
     285//   register int diff = 0;
     286
     287//   UCArray::const_iterator a1Here = a1.begin();
     288//   UCArray::const_iterator a2Here = a2.begin();
     289 
     290//   while (l--) {
     291//     if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0)
     292//       return diff;
     293//     if (pos == 0 && (diff = *a1Here - *a2Here) != 0)
     294//       pos = diff;
     295
     296//     ++a1Here;
     297//     ++a2Here;
     298//   }
     299
     300//   return ((l1 - l2) ? (l1 - l2) : (pos));
     301// }
    258302
    259303// does the first string start with the second?
     
    287331    return false;
    288332  }
    289   unsigned int l =l2;
    290   UCArray::const_iterator a1Here = a1.begin();
    291   UCArray::const_iterator a2Here = a2.begin();
    292  
    293   while (l--) {
    294     if (casecharmap[*a1Here] != casecharmap[*a2Here])
    295       return false;
    296     ++a1Here;
    297     ++a2Here;
     333  unsigned short a1_out[256]; /* temp space */
     334  unsigned short a2_out[256]; /* temp space */
     335  unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
     336  unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
     337 
     338  /* decode the words to unicode */
     339  utf8_word_to_unicode (a1_str, a1_out, 255);
     340  utf8_word_to_unicode (a2_str, a2_out, 255);
     341
     342  unsigned int len = a2_out[0];; 
     343  for (int i=1; i<=len; ++i) {
     344    if (unicode_tosimplified(unicode_tolower(a1_out[i])) !=
     345    unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false;
     346   
    298347  }
    299348  return true; // we have successfully matched the whole way
    300349   
    301350}
     351
     352// does the first string start with the second, ignoring case?
     353// bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
     354//   unsigned int l1 = a1.size();
     355//   unsigned int l2 = a2.size();
     356//   if (l2 > l1) {
     357//     // if the prefix is longer than the string, it can't start with it
     358//     return false;
     359//   }
     360//   unsigned int l =l2;
     361//   UCArray::const_iterator a1Here = a1.begin();
     362//   UCArray::const_iterator a2Here = a2.begin();
     363 
     364//   while (l--) {
     365//     if (casecharmap[*a1Here] != casecharmap[*a2Here])
     366//       return false;
     367//     ++a1Here;
     368//     ++a2Here;
     369//   }
     370//   return true; // we have successfully matched the whole way
     371   
     372// }
    302373
    303374
Note: See TracChangeset for help on using the changeset viewer.