source: trunk/indexers/mgpp/text/UCArray.cpp@ 12321

Last change on this file since 12321 was 12319, checked in by kjdon, 18 years ago

The inverted file dictionary was not ordered properly. ASCII values were ordered case-insensitively (e.g. Ant, ant, bee, Cat), but non-ASCII values were not. This means that xxx* doesn't work properly. I have therefore used unitool to do proper Unicode case-insensitive ordering and matching.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
Line 
1/**************************************************************************
2 *
3 * UCArray.cpp -- vector based string class
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "UCArray.h"
23#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
24#include "unitool.h"
25
26void SetCStr (UCArray &text, const char *cStr) {
27 text.erase(text.begin(), text.end());
28
29 while (*cStr != '\0') {
30 text.push_back (*cStr);
31 ++cStr;
32 }
33}
34
35void SetCStr (UCArray &text, const char *cStr, size_t nSizeHint) {
36 text.erase(text.begin(), text.end());
37
38 // reserve the needed space in advance
39 if (text.capacity() < nSizeHint + 1) {
40 text.reserve(nSizeHint + 1);
41 }
42 while (*cStr != '\0') {
43 text.push_back (*cStr);
44 ++cStr;
45 }
46}
47
48char * GetCStr(const UCArray& text) {
49
50 char *cstr = new char[text.size()+1];
51 UCArray::const_iterator here = text.begin();
52 UCArray::const_iterator end = text.end();
53
54 int i = 0;
55 while (here != end) {
56 cstr[i] = text[i];
57 ++here; ++i;
58 }
59 cstr[i]='\0';
60 return cstr;
61}
62unsigned char * MyGetCStr(const UCArray& text) {
63
64 unsigned char *cstr = new unsigned char[text.size()+1];
65 cstr[0] = text.size();
66 UCArray::const_iterator here = text.begin();
67 UCArray::const_iterator end = text.end();
68
69 int i = 1;
70 while (here != end) {
71 cstr[i] = text[i-1];
72 ++here; ++i;
73 }
74 // cstr[i]='\0';
75 return cstr;
76}
77
78bool UCArrayCStrEquals(const UCArray &text, const unsigned char *cStr)
79{
80 if ((cStr == NULL || *cStr == '\0') && text.empty()) return true;
81 UCArray::const_iterator thisUC = text.begin();
82 UCArray::const_iterator endUC = text.end();
83 while (thisUC != endUC && *cStr != '\0') {
84 if (*thisUC != *cStr) return false;
85 ++cStr; ++thisUC;
86 }
87 if (thisUC == endUC && *cStr == '\0') return true;
88 return false;
89}
90
91ostream &operator<<(ostream &s, const UCArray &a) {
92 UCArray::const_iterator here = a.begin();
93 UCArray::const_iterator end = a.end();
94 while (here != end) {
95 s << *here;
96 ++here;
97 }
98
99 return s;
100}
101
102
// Read a variable-length unsigned long from f into n.
//
// Encoding (the counterpart of WriteVarLenUL): 7 payload bits per byte,
// least-significant group first; a byte >= 0x80 means another byte follows.
//
// Returns false, leaving n untouched, if the stream ends or a read error
// occurs before the value is complete.
//
// Changes from the earlier version:
//  * each 7-bit group is widened to unsigned long before shifting, so values
//    with bit 31 set are no longer shifted as a (signed) int;
//  * fgetc's result is kept in an int and compared against EOF, so read
//    errors are detected as well as end-of-file;
//  * decoding continues for as many bits as unsigned long holds, matching
//    what WriteVarLenUL emits, instead of stopping after 32 bits.
bool ReadVarLenUL (FILE *f, unsigned long &n) {
  // number of value bits in unsigned long (8-bit bytes assumed)
  const unsigned int maxBits = sizeof (unsigned long) * 8;

  unsigned long value = 0;
  unsigned int bitPos = 0;
  int c = 0;

  do {
    c = fgetc (f);
    if (c == EOF) return false;
    value |= static_cast<unsigned long> (c & 0x7f) << bitPos;
    bitPos += 7;
  } while (c >= 0x80 && bitPos < maxBits);

  n = value;

  return true;
}
119
// Write n to f as a variable-length integer: 7 payload bits per byte,
// least-significant group first, with the high bit (0x80) set on every byte
// except the last. Zero is written as a single 0x00 byte.
// Returns false as soon as the stream reports an error.
bool WriteVarLenUL (FILE *f, unsigned long n) {
  unsigned long remaining = n;

  for (;;) {
    unsigned char group = static_cast<unsigned char> (remaining & 0x7f);
    remaining >>= 7;

    // more groups to come: flag this byte as a continuation
    if (remaining > 0) group |= 0x80;

    fputc (group, f);
    if (ferror (f) != 0) return false;

    if (remaining == 0) return true;
  }
}
132
133
134bool ReadUL (FILE *f, unsigned long &n) {
135 if (fread (&n, sizeof (unsigned long), 1, f) <= 0) return false;
136 NTOHUL (n);
137 return true;
138}
139
140
141bool WriteUL (FILE *f, unsigned long n) {
142 HTONUL (n);
143 return (fwrite (&n, sizeof (unsigned long), 1, f) > 0);
144}
145
146bool ReadF (FILE *f, float &n) {
147 if (fread (&n, sizeof (float), 1, f) <= 0) return false;
148 NTOHF(n);
149 return true;
150}
151
152bool WriteF (FILE *f, float n) {
153 HTONF(n);
154 return (fwrite (&n, sizeof (float), 1, f) > 0);
155}
156
157bool ReadD (FILE *f, double &n) {
158 if (fread (&n, sizeof (double), 1, f) <= 0) return false;
159 NTOHD(n);
160 return true;
161}
162
163bool WriteD (FILE *f, double n) {
164 HTOND(n);
165 return (fwrite (&n, sizeof (double), 1, f) > 0);
166}
167
168bool ReadUCArray (FILE *f, UCArray &a) {
169 // clear the array in preparation
170 a.erase (a.begin(), a.end());
171
172 // read in the array size
173 unsigned long arraySize = 0;
174 if (!ReadVarLenUL (f, arraySize)) return false;
175
176 // reserve the needed space in advance
177 if (a.capacity() < arraySize + 1) {
178 a.reserve(arraySize + 1);
179 }
180
181 // read in the array
182 unsigned char b = 0;
183 while (arraySize > 0) {
184 b = fgetc (f);
185 if (feof(f)) return false;
186 a.push_back (b);
187
188 --arraySize;
189 }
190
191 return true;
192}
193
194bool WriteUCArray (FILE *f, const UCArray &a) {
195 // write out the array size
196 if (!WriteVarLenUL (f, a.size())) return false;
197
198 UCArray::const_iterator here = a.begin();
199 UCArray::const_iterator end = a.end();
200 while (here != end) {
201 fputc (*here, f);
202 if (ferror (f) != 0) return false;
203
204 ++here;
205 }
206
207 return true;
208}
209
210/*
211 * This array is designed for mapping upper and lower case letter
212 * together for a case independent comparison. The mappings are
213 * based upon ascii character sequences.
214 */
215// static unsigned char casecharmap[] = {
216// '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
217// '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
218// '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
219// '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
220// '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
221// '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
222// '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
223// '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
224// '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
225// '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
226// '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
227// '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
228// '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
229// '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
230// '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
231// '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
232// '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
233// '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
234// '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
235// '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
236// '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
237// '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
238// '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
239// '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
240// '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
241// '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
242// '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
243// '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
244// '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
245// '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
246// '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
247// '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
248// };
249
250int DictCompare (const UCArray &a1, const UCArray &a2) {
251 unsigned short a1_out[256]; /* temp space */
252 unsigned short a2_out[256]; /* temp space */
253
254 unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
255 unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
256
257 /* decode the words to unicode */
258 utf8_word_to_unicode (a1_str, a1_out, 255);
259 utf8_word_to_unicode (a2_str, a2_out, 255);
260
261 int l1 = a1_out[0];
262 int l2 = a2_out[0];
263
264 int len = (l1 < l2) ? l1 : l2;
265 int pos = 0;
266 int diff = 0;
267 for (int i=1; i<=len; ++i) {
268 if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) -
269 unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) {
270 return diff;
271 }
272 if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) {
273 pos = diff;
274 }
275 }
276 return ((l1 - l2) ? (l1 - l2) : (pos));
277
278}
279
280// int DictCompare (const UCArray &a1, const UCArray &a2) {
281// unsigned int l1 = a1.size();
282// unsigned int l2 = a2.size();
283// unsigned int l = (l1 < l2) ? l1 : l2;
284// int pos = 0;
285// register int diff = 0;
286
287// UCArray::const_iterator a1Here = a1.begin();
288// UCArray::const_iterator a2Here = a2.begin();
289
290// while (l--) {
291// if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0)
292// return diff;
293// if (pos == 0 && (diff = *a1Here - *a2Here) != 0)
294// pos = diff;
295
296// ++a1Here;
297// ++a2Here;
298// }
299
300// return ((l1 - l2) ? (l1 - l2) : (pos));
301// }
302
303// does the first string start with the second?
304bool StartsWith (const UCArray &a1, const UCArray &a2) {
305 unsigned int l1 = a1.size();
306 unsigned int l2 = a2.size();
307 if (l2 > l1) {
308 // if the prefix is longer than the string, it can't start with it
309 return false;
310 }
311 unsigned int l =l2;
312 UCArray::const_iterator a1Here = a1.begin();
313 UCArray::const_iterator a2Here = a2.begin();
314
315 while (l--) {
316 if ((*a1Here != *a2Here))
317 return false;
318 ++a1Here;
319 ++a2Here;
320 }
321 return true; // we have successfully matched the whole way
322
323}
324
325// does the first string start with the second, ignoring case?
326bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
327 unsigned int l1 = a1.size();
328 unsigned int l2 = a2.size();
329 if (l2 > l1) {
330 // if the prefix is longer than the string, it can't start with it
331 return false;
332 }
333 unsigned short a1_out[256]; /* temp space */
334 unsigned short a2_out[256]; /* temp space */
335 unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
336 unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
337
338 /* decode the words to unicode */
339 utf8_word_to_unicode (a1_str, a1_out, 255);
340 utf8_word_to_unicode (a2_str, a2_out, 255);
341
342 unsigned int len = a2_out[0];;
343 for (int i=1; i<=len; ++i) {
344 if (unicode_tosimplified(unicode_tolower(a1_out[i])) !=
345 unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false;
346
347 }
348 return true; // we have successfully matched the whole way
349
350}
351
352// does the first string start with the second, ignoring case?
353// bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
354// unsigned int l1 = a1.size();
355// unsigned int l2 = a2.size();
356// if (l2 > l1) {
357// // if the prefix is longer than the string, it can't start with it
358// return false;
359// }
360// unsigned int l =l2;
361// UCArray::const_iterator a1Here = a1.begin();
362// UCArray::const_iterator a2Here = a2.begin();
363
364// while (l--) {
365// if (casecharmap[*a1Here] != casecharmap[*a2Here])
366// return false;
367// ++a1Here;
368// ++a2Here;
369// }
370// return true; // we have successfully matched the whole way
371
372// }
373
374
375unsigned long PrefixLen (const UCArray &a1, const UCArray &a2) {
376 unsigned long l = (a1.size() < a2.size()) ? a1.size() : a2.size();
377 unsigned long i = 0;
378
379 UCArray::const_iterator a1Here = a1.begin();
380 UCArray::const_iterator a2Here = a2.begin();
381
382 while (i < l && *a1Here == *a2Here) {
383 ++i; ++a1Here; ++a2Here;
384 }
385
386 return i;
387}
388
389bool WritePreSufStr (FILE *f, const UCArray *prev, const UCArray &a) {
390 unsigned char preLen;
391 unsigned char sufLen;
392
393 if (prev != NULL) preLen = PrefixLen (*prev, a);
394 else preLen = 0;
395 sufLen = a.size() - preLen;
396
397 // output the prefix length, suffix length, and the suffix
398 fputc (preLen, f);
399 if (ferror(f) != 0) return false;
400 fputc (sufLen, f);
401 if (ferror(f) != 0) return false;
402 char* tmp=GetCStr(a);
403 int ret=(fwrite (tmp+preLen, sizeof (char), sufLen, f) == sufLen);
404 delete []tmp;
405 return (ret != 0);
406}
407
408// a also used for prev
409bool ReadPreSufStr (FILE *f, UCArray &a) {
410 unsigned char preLen = 0;
411 unsigned char sufLen = 0;
412
413 preLen = fgetc(f);
414 sufLen = fgetc(f);
415
416 if (a.size() > preLen) a.erase (a.begin()+preLen, a.end());
417
418 // reserve the needed space in advance
419 if (a.capacity() < a.size() + sufLen + 1) {
420 a.reserve(a.size() + sufLen + 1);
421 }
422
423 while (sufLen > 0) {
424 unsigned char c = fgetc (f);
425 a.push_back (c);
426 --sufLen;
427 }
428
429 return true;
430}
431
Note: See TracBrowser for help on using the repository browser.