Changeset 12319
- Timestamp:
- 2006-07-28T12:04:37+12:00 (18 years ago)
- Location:
- trunk
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/indexers/mgpp/text/UCArray.cpp
r8692 r12319 22 22 #include "UCArray.h" 23 23 #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */ 24 24 #include "unitool.h" 25 25 26 26 void SetCStr (UCArray &text, const char *cStr) { … … 58 58 } 59 59 cstr[i]='\0'; 60 return cstr; 61 } 62 unsigned char * MyGetCStr(const UCArray& text) { 63 64 unsigned char *cstr = new unsigned char[text.size()+1]; 65 cstr[0] = text.size(); 66 UCArray::const_iterator here = text.begin(); 67 UCArray::const_iterator end = text.end(); 68 69 int i = 1; 70 while (here != end) { 71 cstr[i] = text[i-1]; 72 ++here; ++i; 73 } 74 // cstr[i]='\0'; 60 75 return cstr; 61 76 } … … 198 213 * based upon ascii character sequences. 199 214 */ 200 static unsigned char casecharmap[] = { 201 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', 202 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', 203 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', 204 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', 205 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', 206 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', 207 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', 208 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', 209 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 210 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 211 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 212 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', 213 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 214 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 215 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 216 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', 217 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', 218 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', 219 '\220', '\221', '\222', '\223', '\224', '\225', 
'\226', '\227', 220 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', 221 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', 222 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', 223 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', 224 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', 225 '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 226 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 227 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 228 '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337', 229 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 230 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 231 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 232 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', 233 }; 234 215 // static unsigned char casecharmap[] = { 216 // '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', 217 // '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', 218 // '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', 219 // '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', 220 // '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', 221 // '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', 222 // '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', 223 // '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', 224 // '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 225 // '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 226 // '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 227 // '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', 228 // '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 229 // '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 230 // '\160', '\161', '\162', '\163', '\164', 
'\165', '\166', '\167', 231 // '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', 232 // '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', 233 // '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', 234 // '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', 235 // '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', 236 // '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', 237 // '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', 238 // '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', 239 // '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', 240 // '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 241 // '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 242 // '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 243 // '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337', 244 // '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 245 // '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 246 // '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 247 // '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', 248 // }; 235 249 236 250 int DictCompare (const UCArray &a1, const UCArray &a2) { 237 unsigned int l1 = a1.size(); 238 unsigned int l2 = a2.size(); 239 unsigned int l = (l1 < l2) ? l1 : l2; 251 unsigned short a1_out[256]; /* temp space */ 252 unsigned short a2_out[256]; /* temp space */ 253 254 unsigned char * a1_str = (unsigned char *)MyGetCStr(a1); 255 unsigned char * a2_str = (unsigned char *)MyGetCStr(a2); 256 257 /* decode the words to unicode */ 258 utf8_word_to_unicode (a1_str, a1_out, 255); 259 utf8_word_to_unicode (a2_str, a2_out, 255); 260 261 int l1 = a1_out[0]; 262 int l2 = a2_out[0]; 263 264 int len = (l1 < l2) ? 
l1 : l2; 240 265 int pos = 0; 241 register int diff = 0; 242 243 UCArray::const_iterator a1Here = a1.begin(); 244 UCArray::const_iterator a2Here = a2.begin(); 245 246 while (l--) { 247 if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0) 266 int diff = 0; 267 for (int i=1; i<=len; ++i) { 268 if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) - 269 unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) { 248 270 return diff; 249 if (pos == 0 && (diff = *a1Here - *a2Here) != 0) 271 } 272 if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) { 250 273 pos = diff; 251 252 ++a1Here; 253 ++a2Here; 254 } 255 274 } 275 } 256 276 return ((l1 - l2) ? (l1 - l2) : (pos)); 257 } 277 278 } 279 280 // int DictCompare (const UCArray &a1, const UCArray &a2) { 281 // unsigned int l1 = a1.size(); 282 // unsigned int l2 = a2.size(); 283 // unsigned int l = (l1 < l2) ? l1 : l2; 284 // int pos = 0; 285 // register int diff = 0; 286 287 // UCArray::const_iterator a1Here = a1.begin(); 288 // UCArray::const_iterator a2Here = a2.begin(); 289 290 // while (l--) { 291 // if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0) 292 // return diff; 293 // if (pos == 0 && (diff = *a1Here - *a2Here) != 0) 294 // pos = diff; 295 296 // ++a1Here; 297 // ++a2Here; 298 // } 299 300 // return ((l1 - l2) ? (l1 - l2) : (pos)); 301 // } 258 302 259 303 // does the first string start with the second? 
… … 287 331 return false; 288 332 } 289 unsigned int l =l2; 290 UCArray::const_iterator a1Here = a1.begin(); 291 UCArray::const_iterator a2Here = a2.begin(); 292 293 while (l--) { 294 if (casecharmap[*a1Here] != casecharmap[*a2Here]) 295 return false; 296 ++a1Here; 297 ++a2Here; 333 unsigned short a1_out[256]; /* temp space */ 334 unsigned short a2_out[256]; /* temp space */ 335 unsigned char * a1_str = (unsigned char *)MyGetCStr(a1); 336 unsigned char * a2_str = (unsigned char *)MyGetCStr(a2); 337 338 /* decode the words to unicode */ 339 utf8_word_to_unicode (a1_str, a1_out, 255); 340 utf8_word_to_unicode (a2_str, a2_out, 255); 341 342 unsigned int len = a2_out[0];; 343 for (int i=1; i<=len; ++i) { 344 if (unicode_tosimplified(unicode_tolower(a1_out[i])) != 345 unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false; 346 298 347 } 299 348 return true; // we have successfully matched the whole way 300 349 301 350 } 351 352 // does the first string start with the second, ignoring case? 353 // bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) { 354 // unsigned int l1 = a1.size(); 355 // unsigned int l2 = a2.size(); 356 // if (l2 > l1) { 357 // // if the prefix is longer than the string, it can't start with it 358 // return false; 359 // } 360 // unsigned int l =l2; 361 // UCArray::const_iterator a1Here = a1.begin(); 362 // UCArray::const_iterator a2Here = a2.begin(); 363 364 // while (l--) { 365 // if (casecharmap[*a1Here] != casecharmap[*a2Here]) 366 // return false; 367 // ++a1Here; 368 // ++a2Here; 369 // } 370 // return true; // we have successfully matched the whole way 371 372 // } 302 373 303 374 -
trunk/mgpp/text/UCArray.cpp
r8692 r12319 22 22 #include "UCArray.h" 23 23 #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */ 24 24 #include "unitool.h" 25 25 26 26 void SetCStr (UCArray &text, const char *cStr) { … … 58 58 } 59 59 cstr[i]='\0'; 60 return cstr; 61 } 62 unsigned char * MyGetCStr(const UCArray& text) { 63 64 unsigned char *cstr = new unsigned char[text.size()+1]; 65 cstr[0] = text.size(); 66 UCArray::const_iterator here = text.begin(); 67 UCArray::const_iterator end = text.end(); 68 69 int i = 1; 70 while (here != end) { 71 cstr[i] = text[i-1]; 72 ++here; ++i; 73 } 74 // cstr[i]='\0'; 60 75 return cstr; 61 76 } … … 198 213 * based upon ascii character sequences. 199 214 */ 200 static unsigned char casecharmap[] = { 201 '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', 202 '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', 203 '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', 204 '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', 205 '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', 206 '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', 207 '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', 208 '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', 209 '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 210 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 211 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 212 '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', 213 '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 214 '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 215 '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 216 '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', 217 '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', 218 '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', 219 '\220', '\221', '\222', '\223', '\224', '\225', 
'\226', '\227', 220 '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', 221 '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', 222 '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', 223 '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', 224 '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', 225 '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 226 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 227 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 228 '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337', 229 '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 230 '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 231 '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 232 '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', 233 }; 234 215 // static unsigned char casecharmap[] = { 216 // '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', 217 // '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', 218 // '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', 219 // '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', 220 // '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', 221 // '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', 222 // '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', 223 // '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', 224 // '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 225 // '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 226 // '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', 227 // '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', 228 // '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', 229 // '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', 230 // '\160', '\161', '\162', '\163', '\164', 
'\165', '\166', '\167', 231 // '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', 232 // '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', 233 // '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', 234 // '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', 235 // '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', 236 // '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', 237 // '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', 238 // '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', 239 // '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', 240 // '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 241 // '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 242 // '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 243 // '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337', 244 // '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', 245 // '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', 246 // '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', 247 // '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', 248 // }; 235 249 236 250 int DictCompare (const UCArray &a1, const UCArray &a2) { 237 unsigned int l1 = a1.size(); 238 unsigned int l2 = a2.size(); 239 unsigned int l = (l1 < l2) ? l1 : l2; 251 unsigned short a1_out[256]; /* temp space */ 252 unsigned short a2_out[256]; /* temp space */ 253 254 unsigned char * a1_str = (unsigned char *)MyGetCStr(a1); 255 unsigned char * a2_str = (unsigned char *)MyGetCStr(a2); 256 257 /* decode the words to unicode */ 258 utf8_word_to_unicode (a1_str, a1_out, 255); 259 utf8_word_to_unicode (a2_str, a2_out, 255); 260 261 int l1 = a1_out[0]; 262 int l2 = a2_out[0]; 263 264 int len = (l1 < l2) ? 
l1 : l2; 240 265 int pos = 0; 241 register int diff = 0; 242 243 UCArray::const_iterator a1Here = a1.begin(); 244 UCArray::const_iterator a2Here = a2.begin(); 245 246 while (l--) { 247 if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0) 266 int diff = 0; 267 for (int i=1; i<=len; ++i) { 268 if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) - 269 unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) { 248 270 return diff; 249 if (pos == 0 && (diff = *a1Here - *a2Here) != 0) 271 } 272 if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) { 250 273 pos = diff; 251 252 ++a1Here; 253 ++a2Here; 254 } 255 274 } 275 } 256 276 return ((l1 - l2) ? (l1 - l2) : (pos)); 257 } 277 278 } 279 280 // int DictCompare (const UCArray &a1, const UCArray &a2) { 281 // unsigned int l1 = a1.size(); 282 // unsigned int l2 = a2.size(); 283 // unsigned int l = (l1 < l2) ? l1 : l2; 284 // int pos = 0; 285 // register int diff = 0; 286 287 // UCArray::const_iterator a1Here = a1.begin(); 288 // UCArray::const_iterator a2Here = a2.begin(); 289 290 // while (l--) { 291 // if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0) 292 // return diff; 293 // if (pos == 0 && (diff = *a1Here - *a2Here) != 0) 294 // pos = diff; 295 296 // ++a1Here; 297 // ++a2Here; 298 // } 299 300 // return ((l1 - l2) ? (l1 - l2) : (pos)); 301 // } 258 302 259 303 // does the first string start with the second? 
… … 287 331 return false; 288 332 } 289 unsigned int l =l2; 290 UCArray::const_iterator a1Here = a1.begin(); 291 UCArray::const_iterator a2Here = a2.begin(); 292 293 while (l--) { 294 if (casecharmap[*a1Here] != casecharmap[*a2Here]) 295 return false; 296 ++a1Here; 297 ++a2Here; 333 unsigned short a1_out[256]; /* temp space */ 334 unsigned short a2_out[256]; /* temp space */ 335 unsigned char * a1_str = (unsigned char *)MyGetCStr(a1); 336 unsigned char * a2_str = (unsigned char *)MyGetCStr(a2); 337 338 /* decode the words to unicode */ 339 utf8_word_to_unicode (a1_str, a1_out, 255); 340 utf8_word_to_unicode (a2_str, a2_out, 255); 341 342 unsigned int len = a2_out[0];; 343 for (int i=1; i<=len; ++i) { 344 if (unicode_tosimplified(unicode_tolower(a1_out[i])) != 345 unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false; 346 298 347 } 299 348 return true; // we have successfully matched the whole way 300 349 301 350 } 351 352 // does the first string start with the second, ignoring case? 353 // bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) { 354 // unsigned int l1 = a1.size(); 355 // unsigned int l2 = a2.size(); 356 // if (l2 > l1) { 357 // // if the prefix is longer than the string, it can't start with it 358 // return false; 359 // } 360 // unsigned int l =l2; 361 // UCArray::const_iterator a1Here = a1.begin(); 362 // UCArray::const_iterator a2Here = a2.begin(); 363 364 // while (l--) { 365 // if (casecharmap[*a1Here] != casecharmap[*a2Here]) 366 // return false; 367 // ++a1Here; 368 // ++a2Here; 369 // } 370 // return true; // we have successfully matched the whole way 371 372 // } 302 373 303 374
Note: See TracChangeset for help on using the changeset viewer.