Context Navigation

← Previous Changeset
Next Changeset →

Changeset 12319

Timestamp:

2006-07-28T12:04:37+12:00 (18 years ago)

Author:

kjdon

Message:

The inverted file dictionary was not ordered properly. ASCII values were ordered case-insensitively (e.g. Ant, ant, bee, Cat), but non-ASCII values weren't. This means that xxx* doesn't work properly. So I have used unitool to do a proper Unicode case-insensitive ordering/matching.

Location:

trunk

Files:

: 2 edited

indexers/mgpp/text/UCArray.cpp (modified) (4 diffs)
mgpp/text/UCArray.cpp (modified) (4 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/indexers/mgpp/text/UCArray.cpp

-              r8692
+              r12319
 #include "UCArray.h"
 #include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */
+#include "unitool.h"
 void SetCStr (UCArray &text, const char *cStr) {
 …
+  }
   cstr[i]='\0';
+  return cstr;
+}
// MyGetCStr: copies the contents of `text` into a freshly allocated,
// length-prefixed byte buffer and returns it.
//   Layout:  byte 0 holds text.size(); bytes 1..text.size() hold the elements
//            of `text` in order. No terminating '\0' is written (the
//            terminator line is commented out below).
//   Returns: a pointer to a buffer obtained with new[]; this function never
//            frees it.
//   NOTE(review): the callers visible in this changeset (DictCompare,
//   StartsWithCasefold) do not appear to delete[] the result - confirm
//   whether this leaks.
+unsigned char * MyGetCStr(const UCArray& text) {
+  unsigned char *cstr = new unsigned char[text.size()+1];  // +1 for the length byte at index 0
+  cstr[0] = text.size();  // size is narrowed to unsigned char here; sizes that do not fit are not preserved
+  UCArray::const_iterator here = text.begin();
+  UCArray::const_iterator end = text.end();
+  int i = 1;  // write position in cstr; payload starts after the length byte
+  while (here != end) {
+    cstr[i] = text[i-1];  // element is read by index; `here` only drives loop termination
+    ++here; ++i;
+  }
+  //  cstr[i]='\0';
   return cstr;
+}
 …
  * based upon ascii character sequences.
  */
+static unsigned char casecharmap[] = {
+    '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
+    '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
+    '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
+    '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
+    '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
+    '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
+    '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
+    '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
+    '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+    '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+    '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+    '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
+    '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+    '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+    '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+    '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
+    '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
+    '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
+    '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
+    '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
+    '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
+    '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
+    '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
+    '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
+    '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+    '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+    '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+    '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
+    '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+    '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+    '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+    '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
+};
+// static unsigned char casecharmap[] = {
+//  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
+//  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
+//  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
+//  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
+//  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
+//  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
+//  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
+//  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
+//  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+//  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+//  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+//  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
+//  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+//  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+//  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+//  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
+//  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
+//  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
+//  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
+//  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
+//  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
+//  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
+//  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
+//  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
+//  '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+//  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+//  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+//  '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
+//  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+//  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+//  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+//  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
+// };
 int DictCompare (const UCArray &a1, const UCArray &a2) {
+  unsigned int l1 = a1.size();
+  unsigned int l2 = a2.size();
+  unsigned int l = (l1 < l2) ? l1 : l2;
+  unsigned short a1_out[256]; /* temp space */
+  unsigned short a2_out[256]; /* temp space */
+  unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
+  unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
+  /* decode the words to unicode */
+  utf8_word_to_unicode (a1_str, a1_out, 255);
+  utf8_word_to_unicode (a2_str, a2_out, 255);
+  int l1 = a1_out[0];
+  int l2 = a2_out[0];
+  int len = (l1 < l2) ? l1 : l2;
   int pos = 0;
+  register int diff = 0;
+  UCArray::const_iterator a1Here = a1.begin();
+  UCArray::const_iterator a2Here = a2.begin();
+  while (l--) {
+    if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0)
+  int diff = 0;
+  for (int i=1; i<=len; ++i) {
+    if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) -
+     unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) {
       return diff;
+    if (pos == 0 && (diff = *a1Here - *a2Here) != 0)
+    }
+    if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) {
       pos = diff;
+    ++a1Here;
+    ++a2Here;
+  }
+    }
+  }
   return ((l1 - l2) ? (l1 - l2) : (pos));
+}
+}
+// int DictCompare (const UCArray &a1, const UCArray &a2) {
+//   unsigned int l1 = a1.size();
+//   unsigned int l2 = a2.size();
+//   unsigned int l = (l1 < l2) ? l1 : l2;
+//   int pos = 0;
+//   register int diff = 0;
+//   UCArray::const_iterator a1Here = a1.begin();
+//   UCArray::const_iterator a2Here = a2.begin();
+//   while (l--) {
+//     if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0)
+//       return diff;
+//     if (pos == 0 && (diff = *a1Here - *a2Here) != 0)
+//       pos = diff;
+//     ++a1Here;
+//     ++a2Here;
+//   }
+//   return ((l1 - l2) ? (l1 - l2) : (pos));
+// }
 // does the first string start with the second?
 …
     return false;
+  }
+  unsigned int l =l2;
+  UCArray::const_iterator a1Here = a1.begin();
+  UCArray::const_iterator a2Here = a2.begin();
+  while (l--) {
+    if (casecharmap[*a1Here] != casecharmap[*a2Here])
+      return false;
+    ++a1Here;
+    ++a2Here;
+  unsigned short a1_out[256]; /* temp space */
+  unsigned short a2_out[256]; /* temp space */
+  unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
+  unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
+  /* decode the words to unicode */
+  utf8_word_to_unicode (a1_str, a1_out, 255);
+  utf8_word_to_unicode (a2_str, a2_out, 255);
+  unsigned int len = a2_out[0];;
+  for (int i=1; i<=len; ++i) {
+    if (unicode_tosimplified(unicode_tolower(a1_out[i])) !=
+    unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false;
+  }
   return true; // we have successfully matched the whole way
+}
+// does the first string start with the second, ignoring case?
+// bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
+//   unsigned int l1 = a1.size();
+//   unsigned int l2 = a2.size();
+//   if (l2 > l1) {
+//     // if the prefix is longer than the string, it can't start with it
+//     return false;
+//   }
+//   unsigned int l =l2;
+//   UCArray::const_iterator a1Here = a1.begin();
+//   UCArray::const_iterator a2Here = a2.begin();
+//   while (l--) {
+//     if (casecharmap[*a1Here] != casecharmap[*a2Here])
+//       return false;
+//     ++a1Here;
+//     ++a2Here;
+//   }
+//   return true; // we have successfully matched the whole way
+// }

trunk/mgpp/text/UCArray.cpp

-              r8692
+              r12319
 #include "UCArray.h"
 #include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */
+#include "unitool.h"
 void SetCStr (UCArray &text, const char *cStr) {
 …
+  }
   cstr[i]='\0';
+  return cstr;
+}
// MyGetCStr: copies the contents of `text` into a freshly allocated,
// length-prefixed byte buffer and returns it.
//   Layout:  byte 0 holds text.size(); bytes 1..text.size() hold the elements
//            of `text` in order. No terminating '\0' is written (the
//            terminator line is commented out below).
//   Returns: a pointer to a buffer obtained with new[]; this function never
//            frees it.
//   NOTE(review): the callers visible in this changeset (DictCompare,
//   StartsWithCasefold) do not appear to delete[] the result - confirm
//   whether this leaks.
+unsigned char * MyGetCStr(const UCArray& text) {
+  unsigned char *cstr = new unsigned char[text.size()+1];  // +1 for the length byte at index 0
+  cstr[0] = text.size();  // size is narrowed to unsigned char here; sizes that do not fit are not preserved
+  UCArray::const_iterator here = text.begin();
+  UCArray::const_iterator end = text.end();
+  int i = 1;  // write position in cstr; payload starts after the length byte
+  while (here != end) {
+    cstr[i] = text[i-1];  // element is read by index; `here` only drives loop termination
+    ++here; ++i;
+  }
+  //  cstr[i]='\0';
   return cstr;
+}
 …
  * based upon ascii character sequences.
  */
+static unsigned char casecharmap[] = {
+    '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
+    '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
+    '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
+    '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
+    '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
+    '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
+    '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
+    '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
+    '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+    '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+    '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+    '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
+    '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+    '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+    '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+    '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
+    '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
+    '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
+    '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
+    '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
+    '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
+    '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
+    '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
+    '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
+    '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+    '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+    '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+    '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
+    '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+    '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+    '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+    '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
+};
+// static unsigned char casecharmap[] = {
+//  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
+//  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
+//  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
+//  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
+//  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
+//  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
+//  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
+//  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
+//  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+//  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+//  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+//  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
+//  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+//  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+//  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+//  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
+//  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
+//  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
+//  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
+//  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
+//  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
+//  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
+//  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
+//  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
+//  '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+//  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+//  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+//  '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
+//  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+//  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+//  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+//  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
+// };
 int DictCompare (const UCArray &a1, const UCArray &a2) {
+  unsigned int l1 = a1.size();
+  unsigned int l2 = a2.size();
+  unsigned int l = (l1 < l2) ? l1 : l2;
+  unsigned short a1_out[256]; /* temp space */
+  unsigned short a2_out[256]; /* temp space */
+  unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
+  unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
+  /* decode the words to unicode */
+  utf8_word_to_unicode (a1_str, a1_out, 255);
+  utf8_word_to_unicode (a2_str, a2_out, 255);
+  int l1 = a1_out[0];
+  int l2 = a2_out[0];
+  int len = (l1 < l2) ? l1 : l2;
   int pos = 0;
+  register int diff = 0;
+  UCArray::const_iterator a1Here = a1.begin();
+  UCArray::const_iterator a2Here = a2.begin();
+  while (l--) {
+    if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0)
+  int diff = 0;
+  for (int i=1; i<=len; ++i) {
+    if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) -
+     unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) {
       return diff;
+    if (pos == 0 && (diff = *a1Here - *a2Here) != 0)
+    }
+    if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) {
       pos = diff;
+    ++a1Here;
+    ++a2Here;
+  }
+    }
+  }
   return ((l1 - l2) ? (l1 - l2) : (pos));
+}
+}
+// int DictCompare (const UCArray &a1, const UCArray &a2) {
+//   unsigned int l1 = a1.size();
+//   unsigned int l2 = a2.size();
+//   unsigned int l = (l1 < l2) ? l1 : l2;
+//   int pos = 0;
+//   register int diff = 0;
+//   UCArray::const_iterator a1Here = a1.begin();
+//   UCArray::const_iterator a2Here = a2.begin();
+//   while (l--) {
+//     if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0)
+//       return diff;
+//     if (pos == 0 && (diff = *a1Here - *a2Here) != 0)
+//       pos = diff;
+//     ++a1Here;
+//     ++a2Here;
+//   }
+//   return ((l1 - l2) ? (l1 - l2) : (pos));
+// }
 // does the first string start with the second?
 …
     return false;
+  }
+  unsigned int l =l2;
+  UCArray::const_iterator a1Here = a1.begin();
+  UCArray::const_iterator a2Here = a2.begin();
+  while (l--) {
+    if (casecharmap[*a1Here] != casecharmap[*a2Here])
+      return false;
+    ++a1Here;
+    ++a2Here;
+  unsigned short a1_out[256]; /* temp space */
+  unsigned short a2_out[256]; /* temp space */
+  unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
+  unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
+  /* decode the words to unicode */
+  utf8_word_to_unicode (a1_str, a1_out, 255);
+  utf8_word_to_unicode (a2_str, a2_out, 255);
+  unsigned int len = a2_out[0];;
+  for (int i=1; i<=len; ++i) {
+    if (unicode_tosimplified(unicode_tolower(a1_out[i])) !=
+    unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false;
+  }
   return true; // we have successfully matched the whole way
+}
+// does the first string start with the second, ignoring case?
+// bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
+//   unsigned int l1 = a1.size();
+//   unsigned int l2 = a2.size();
+//   if (l2 > l1) {
+//     // if the prefix is longer than the string, it can't start with it
+//     return false;
+//   }
+//   unsigned int l =l2;
+//   UCArray::const_iterator a1Here = a1.begin();
+//   UCArray::const_iterator a2Here = a2.begin();
+//   while (l--) {
+//     if (casecharmap[*a1Here] != casecharmap[*a2Here])
+//       return false;
+//     ++a1Here;
+//     ++a2Here;
+//   }
+//   return true; // we have successfully matched the whole way
+// }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 12319

Legend:

trunk/indexers/mgpp/text/UCArray.cpp

trunk/mgpp/text/UCArray.cpp

Download in other formats: