Context Navigation

source: trunk/indexers/mgpp/text/UCArray.cpp@ 12321

Last change on this file since 12321 was 12319, checked in by kjdon, 18 years ago
The inverted file dictionary was not ordered properly. ascii values were ordered case insensitive, e.g. Ant, ant, bee, Cat but non ascii values weren't. This means that xxx* doesn't work properly. So I have used unitool to do a proper unicode case insensitive ordering/matching
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 12.0 KB

Line
1	/**************************************************************************
2	*
3	* UCArray.cpp -- vector based string class
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "UCArray.h"
23	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
24	#include "unitool.h"
25
26	void SetCStr (UCArray &text, const char *cStr) {
27	text.erase(text.begin(), text.end());
28
29	while (*cStr != '\0') {
30	text.push_back (*cStr);
31	++cStr;
32	}
33	}
34
35	void SetCStr (UCArray &text, const char *cStr, size_t nSizeHint) {
36	text.erase(text.begin(), text.end());
37
38	// reserve the needed space in advance
39	if (text.capacity() < nSizeHint + 1) {
40	text.reserve(nSizeHint + 1);
41	}
42	while (*cStr != '\0') {
43	text.push_back (*cStr);
44	++cStr;
45	}
46	}
47
48	char * GetCStr(const UCArray& text) {
49
50	char *cstr = new char[text.size()+1];
51	UCArray::const_iterator here = text.begin();
52	UCArray::const_iterator end = text.end();
53
54	int i = 0;
55	while (here != end) {
56	cstr[i] = text[i];
57	++here; ++i;
58	}
59	cstr[i]='\0';
60	return cstr;
61	}
62	unsigned char * MyGetCStr(const UCArray& text) {
63
64	unsigned char *cstr = new unsigned char[text.size()+1];
65	cstr[0] = text.size();
66	UCArray::const_iterator here = text.begin();
67	UCArray::const_iterator end = text.end();
68
69	int i = 1;
70	while (here != end) {
71	cstr[i] = text[i-1];
72	++here; ++i;
73	}
74	// cstr[i]='\0';
75	return cstr;
76	}
77
78	bool UCArrayCStrEquals(const UCArray &text, const unsigned char *cStr)
79	{
80	if ((cStr == NULL \|\| *cStr == '\0') && text.empty()) return true;
81	UCArray::const_iterator thisUC = text.begin();
82	UCArray::const_iterator endUC = text.end();
83	while (thisUC != endUC && *cStr != '\0') {
84	if (thisUC != cStr) return false;
85	++cStr; ++thisUC;
86	}
87	if (thisUC == endUC && *cStr == '\0') return true;
88	return false;
89	}
90
91	ostream &operator<<(ostream &s, const UCArray &a) {
92	UCArray::const_iterator here = a.begin();
93	UCArray::const_iterator end = a.end();
94	while (here != end) {
95	s << *here;
96	++here;
97	}
98
99	return s;
100	}
101
102
// Read a variable-length unsigned integer from f into n.
// Each byte contributes its low 7 bits, least significant group first;
// a set high bit (0x80) means another byte follows. At most five bytes
// are consumed (the loop stops once 32 or more bits have been read).
// Returns false, leaving n untouched, if the stream ends or a read
// error occurs before the number is complete.
bool ReadVarLenUL (FILE *f, unsigned long &n) {
  unsigned long value = 0;
  unsigned int bitPos = 0;
  int c = 0;

  do {
    // keep the int result so EOF (end of file or error) is detectable
    c = fgetc (f);
    if (c == EOF) return false;

    // widen before shifting: an int shift by 28 would overflow and
    // sign-extend into the upper bits of a 64-bit unsigned long
    value |= static_cast<unsigned long> (c & 0x7f) << bitPos;
    bitPos += 7;
  } while (c >= 0x80 && bitPos < 32);

  n = value;

  return true;
}
119
// Write n to f as a variable-length unsigned integer.
// The value is emitted 7 bits at a time, least significant group
// first; every byte except the last has its high bit (0x80) set.
// Zero is written as a single 0x00 byte.
// Returns false as soon as the stream reports an error.
bool WriteVarLenUL (FILE *f, unsigned long n) {
  unsigned long remaining = n;

  do {
    unsigned char b = static_cast<unsigned char> (remaining & 0x7f);
    remaining >>= 7;

    // flag that more groups follow
    if (remaining > 0) b |= 0x80;

    fputc (b, f);
    if (ferror (f) != 0) return false;
  } while (remaining > 0);

  return true;
}
132
133
134	bool ReadUL (FILE *f, unsigned long &n) {
135	if (fread (&n, sizeof (unsigned long), 1, f) <= 0) return false;
136	NTOHUL (n);
137	return true;
138	}
139
140
141	bool WriteUL (FILE *f, unsigned long n) {
142	HTONUL (n);
143	return (fwrite (&n, sizeof (unsigned long), 1, f) > 0);
144	}
145
146	bool ReadF (FILE *f, float &n) {
147	if (fread (&n, sizeof (float), 1, f) <= 0) return false;
148	NTOHF(n);
149	return true;
150	}
151
152	bool WriteF (FILE *f, float n) {
153	HTONF(n);
154	return (fwrite (&n, sizeof (float), 1, f) > 0);
155	}
156
157	bool ReadD (FILE *f, double &n) {
158	if (fread (&n, sizeof (double), 1, f) <= 0) return false;
159	NTOHD(n);
160	return true;
161	}
162
163	bool WriteD (FILE *f, double n) {
164	HTOND(n);
165	return (fwrite (&n, sizeof (double), 1, f) > 0);
166	}
167
168	bool ReadUCArray (FILE *f, UCArray &a) {
169	// clear the array in preparation
170	a.erase (a.begin(), a.end());
171
172	// read in the array size
173	unsigned long arraySize = 0;
174	if (!ReadVarLenUL (f, arraySize)) return false;
175
176	// reserve the needed space in advance
177	if (a.capacity() < arraySize + 1) {
178	a.reserve(arraySize + 1);
179	}
180
181	// read in the array
182	unsigned char b = 0;
183	while (arraySize > 0) {
184	b = fgetc (f);
185	if (feof(f)) return false;
186	a.push_back (b);
187
188	--arraySize;
189	}
190
191	return true;
192	}
193
194	bool WriteUCArray (FILE *f, const UCArray &a) {
195	// write out the array size
196	if (!WriteVarLenUL (f, a.size())) return false;
197
198	UCArray::const_iterator here = a.begin();
199	UCArray::const_iterator end = a.end();
200	while (here != end) {
201	fputc (*here, f);
202	if (ferror (f) != 0) return false;
203
204	++here;
205	}
206
207	return true;
208	}
209
210	/*
211	* This array is designed for mapping upper and lower case letter
212	* together for a case independent comparison. The mappings are
213	* based upon ascii character sequences.
214	*/
215	// static unsigned char casecharmap[] = {
216	// '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
217	// '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
218	// '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
219	// '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
220	// '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
221	// '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
222	// '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
223	// '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
224	// '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
225	// '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
226	// '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
227	// '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
228	// '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
229	// '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
230	// '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
231	// '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
232	// '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
233	// '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
234	// '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
235	// '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
236	// '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
237	// '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
238	// '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
239	// '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
240	// '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
241	// '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
242	// '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
243	// '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
244	// '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
245	// '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
246	// '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
247	// '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
248	// };
249
250	int DictCompare (const UCArray &a1, const UCArray &a2) {
251	unsigned short a1_out[256]; /* temp space */
252	unsigned short a2_out[256]; /* temp space */
253
254	unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
255	unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
256
257	/* decode the words to unicode */
258	utf8_word_to_unicode (a1_str, a1_out, 255);
259	utf8_word_to_unicode (a2_str, a2_out, 255);
260
261	int l1 = a1_out[0];
262	int l2 = a2_out[0];
263
264	int len = (l1 < l2) ? l1 : l2;
265	int pos = 0;
266	int diff = 0;
267	for (int i=1; i<=len; ++i) {
268	if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) -
269	unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) {
270	return diff;
271	}
272	if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) {
273	pos = diff;
274	}
275	}
276	return ((l1 - l2) ? (l1 - l2) : (pos));
277
278	}
279
280	// int DictCompare (const UCArray &a1, const UCArray &a2) {
281	// unsigned int l1 = a1.size();
282	// unsigned int l2 = a2.size();
283	// unsigned int l = (l1 < l2) ? l1 : l2;
284	// int pos = 0;
285	// register int diff = 0;
286
287	// UCArray::const_iterator a1Here = a1.begin();
288	// UCArray::const_iterator a2Here = a2.begin();
289
290	// while (l--) {
291	// if ((diff = casecharmap[a1Here] - casecharmap[a2Here]) != 0)
292	// return diff;
293	// if (pos == 0 && (diff = a1Here - a2Here) != 0)
294	// pos = diff;
295
296	// ++a1Here;
297	// ++a2Here;
298	// }
299
300	// return ((l1 - l2) ? (l1 - l2) : (pos));
301	// }
302
303	// does the first string start with the second?
304	bool StartsWith (const UCArray &a1, const UCArray &a2) {
305	unsigned int l1 = a1.size();
306	unsigned int l2 = a2.size();
307	if (l2 > l1) {
308	// if the prefix is longer than the string, it can't start with it
309	return false;
310	}
311	unsigned int l =l2;
312	UCArray::const_iterator a1Here = a1.begin();
313	UCArray::const_iterator a2Here = a2.begin();
314
315	while (l--) {
316	if ((a1Here != a2Here))
317	return false;
318	++a1Here;
319	++a2Here;
320	}
321	return true; // we have successfully matched the whole way
322
323	}
324
325	// does the first string start with the second, ignoring case?
326	bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
327	unsigned int l1 = a1.size();
328	unsigned int l2 = a2.size();
329	if (l2 > l1) {
330	// if the prefix is longer than the string, it can't start with it
331	return false;
332	}
333	unsigned short a1_out[256]; /* temp space */
334	unsigned short a2_out[256]; /* temp space */
335	unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
336	unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
337
338	/* decode the words to unicode */
339	utf8_word_to_unicode (a1_str, a1_out, 255);
340	utf8_word_to_unicode (a2_str, a2_out, 255);
341
342	unsigned int len = a2_out[0];;
343	for (int i=1; i<=len; ++i) {
344	if (unicode_tosimplified(unicode_tolower(a1_out[i])) !=
345	unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false;
346
347	}
348	return true; // we have successfully matched the whole way
349
350	}
351
352	// does the first string start with the second, ignoring case?
353	// bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
354	// unsigned int l1 = a1.size();
355	// unsigned int l2 = a2.size();
356	// if (l2 > l1) {
357	// // if the prefix is longer than the string, it can't start with it
358	// return false;
359	// }
360	// unsigned int l =l2;
361	// UCArray::const_iterator a1Here = a1.begin();
362	// UCArray::const_iterator a2Here = a2.begin();
363
364	// while (l--) {
365	// if (casecharmap[a1Here] != casecharmap[a2Here])
366	// return false;
367	// ++a1Here;
368	// ++a2Here;
369	// }
370	// return true; // we have successfully matched the whole way
371
372	// }
373
374
375	unsigned long PrefixLen (const UCArray &a1, const UCArray &a2) {
376	unsigned long l = (a1.size() < a2.size()) ? a1.size() : a2.size();
377	unsigned long i = 0;
378
379	UCArray::const_iterator a1Here = a1.begin();
380	UCArray::const_iterator a2Here = a2.begin();
381
382	while (i < l && a1Here == a2Here) {
383	++i; ++a1Here; ++a2Here;
384	}
385
386	return i;
387	}
388
389	bool WritePreSufStr (FILE f, const UCArray prev, const UCArray &a) {
390	unsigned char preLen;
391	unsigned char sufLen;
392
393	if (prev != NULL) preLen = PrefixLen (*prev, a);
394	else preLen = 0;
395	sufLen = a.size() - preLen;
396
397	// output the prefix length, suffix length, and the suffix
398	fputc (preLen, f);
399	if (ferror(f) != 0) return false;
400	fputc (sufLen, f);
401	if (ferror(f) != 0) return false;
402	char* tmp=GetCStr(a);
403	int ret=(fwrite (tmp+preLen, sizeof (char), sufLen, f) == sufLen);
404	delete []tmp;
405	return (ret != 0);
406	}
407
408	// a also used for prev
409	bool ReadPreSufStr (FILE *f, UCArray &a) {
410	unsigned char preLen = 0;
411	unsigned char sufLen = 0;
412
413	preLen = fgetc(f);
414	sufLen = fgetc(f);
415
416	if (a.size() > preLen) a.erase (a.begin()+preLen, a.end());
417
418	// reserve the needed space in advance
419	if (a.capacity() < a.size() + sufLen + 1) {
420	a.reserve(a.size() + sufLen + 1);
421	}
422
423	while (sufLen > 0) {
424	unsigned char c = fgetc (f);
425	a.push_back (c);
426	--sufLen;
427	}
428
429	return true;
430	}
431

Note: See TracBrowser for help on using the repository browser.

Download in other formats: