Context Navigation

UCArray.cpp@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago
merged 64_bit_Greenstone branch into trunk, rev 25139
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 12.2 KB

Line
1	/**************************************************************************
2	*
3	* UCArray.cpp -- vector based string class
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "UCArray.h"
23	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
24	#include "unitool.h"
25
26	void SetCStr (UCArray &text, const char *cStr) {
27	text.erase(text.begin(), text.end());
28
29	while (*cStr != '\0') {
30	text.push_back (*cStr);
31	++cStr;
32	}
33	}
34
35	void SetCStr (UCArray &text, const char *cStr, size_t nSizeHint) {
36	text.erase(text.begin(), text.end());
37
38	// reserve the needed space in advance
39	if (text.capacity() < nSizeHint + 1) {
40	text.reserve(nSizeHint + 1);
41	}
42	while (*cStr != '\0') {
43	text.push_back (*cStr);
44	++cStr;
45	}
46	}
47
48	/** char * returned becomes the responsibility of the caller */
49	char * GetCStr(const UCArray& text) {
50
51	char *cstr = new char[text.size()+1];
52	UCArray::const_iterator here = text.begin();
53	UCArray::const_iterator end = text.end();
54
55	int i = 0;
56	while (here != end) {
57	cstr[i] = text[i];
58	++here; ++i;
59	}
60	cstr[i]='\0';
61	return cstr;
62	}
63
64	/** char * returned becomes the responsibility of the caller */
65	unsigned char * MyGetCStr(const UCArray& text) {
66
67	unsigned char *cstr = new unsigned char[text.size()+1];
68	cstr[0] = text.size();
69	UCArray::const_iterator here = text.begin();
70	UCArray::const_iterator end = text.end();
71
72	int i = 1;
73	while (here != end) {
74	cstr[i] = text[i-1];
75	++here; ++i;
76	}
77	// cstr[i]='\0';
78	return cstr;
79	}
80
81	bool UCArrayCStrEquals(const UCArray &text, const unsigned char *cStr)
82	{
83	if ((cStr == NULL \|\| *cStr == '\0') && text.empty()) return true;
84	UCArray::const_iterator thisUC = text.begin();
85	UCArray::const_iterator endUC = text.end();
86	while (thisUC != endUC && *cStr != '\0') {
87	if (thisUC != cStr) return false;
88	++cStr; ++thisUC;
89	}
90	if (thisUC == endUC && *cStr == '\0') return true;
91	return false;
92	}
93
94	ostream &operator<<(ostream &s, const UCArray &a) {
95	UCArray::const_iterator here = a.begin();
96	UCArray::const_iterator end = a.end();
97	while (here != end) {
98	s << *here;
99	++here;
100	}
101
102	return s;
103	}
104
105
106	bool ReadVarLenUL (FILE *f, mg_u_long &n) {
107	register mg_u_long temp = 0;
108	register unsigned int bitPos = 0;
109	unsigned char b = 0;
110
111	do {
112	b = fgetc (f);
113	if (feof(f)) return false;
114	temp \|= (b & 0x7f) << bitPos;
115	bitPos += 7;
116	} while (b >= 0x80 && bitPos < 32);
117
118	n = temp;
119
120	return true;
121	}
122
123	bool WriteVarLenUL (FILE *f, mg_u_long n) {
124	register mg_u_long temp = n;
125	register unsigned char b = 0;
126	do {
127	b = static_cast<unsigned char> (temp & 0x7f);
128	if (temp >= 0x80) b \|= 0x80;
129	fputc (b, f);
130	if (ferror (f) != 0) return false;
131	} while ((temp = temp >> 7) > 0);
132
133	return true;
134	}
135
136
137	bool ReadUL (FILE *f, mg_u_long &n) {
138	if (fread (&n, sizeof (mg_u_long), 1, f) <= 0) return false;
139	NTOHUL (n);
140	return true;
141	}
142
143
144	bool WriteUL (FILE *f, mg_u_long n) {
145	HTONUL (n);
146	return (fwrite (&n, sizeof (mg_u_long), 1, f) > 0);
147	}
148
149	bool ReadF (FILE *f, float &n) {
150	if (fread (&n, sizeof (float), 1, f) <= 0) return false;
151	NTOHF(n);
152	return true;
153	}
154
155	bool WriteF (FILE *f, float n) {
156	HTONF(n);
157	return (fwrite (&n, sizeof (float), 1, f) > 0);
158	}
159
160	bool ReadD (FILE *f, double &n) {
161	if (fread (&n, sizeof (double), 1, f) <= 0) return false;
162	NTOHD(n);
163	return true;
164	}
165
166	bool WriteD (FILE *f, double n) {
167	HTOND(n);
168	return (fwrite (&n, sizeof (double), 1, f) > 0);
169	}
170
171	bool ReadUCArray (FILE *f, UCArray &a) {
172	// clear the array in preparation
173	a.erase (a.begin(), a.end());
174
175	// read in the array size
176	mg_u_long arraySize = 0;
177	if (!ReadVarLenUL (f, arraySize)) return false;
178
179	// reserve the needed space in advance
180	if (a.capacity() < arraySize + 1) {
181	a.reserve(arraySize + 1);
182	}
183
184	// read in the array
185	unsigned char b = 0;
186	while (arraySize > 0) {
187	b = fgetc (f);
188	if (feof(f)) return false;
189	a.push_back (b);
190
191	--arraySize;
192	}
193
194	return true;
195	}
196
197	bool WriteUCArray (FILE *f, const UCArray &a) {
198	// write out the array size
199	if (!WriteVarLenUL (f, a.size())) return false;
200
201	UCArray::const_iterator here = a.begin();
202	UCArray::const_iterator end = a.end();
203	while (here != end) {
204	fputc (*here, f);
205	if (ferror (f) != 0) return false;
206
207	++here;
208	}
209
210	return true;
211	}
212
213	/*
214	* This array is designed for mapping upper and lower case letter
215	* together for a case independent comparison. The mappings are
216	* based upon ascii character sequences.
217	*/
218	// static unsigned char casecharmap[] = {
219	// '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
220	// '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
221	// '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
222	// '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
223	// '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
224	// '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
225	// '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
226	// '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
227	// '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
228	// '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
229	// '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
230	// '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
231	// '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
232	// '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
233	// '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
234	// '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
235	// '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
236	// '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
237	// '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
238	// '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
239	// '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
240	// '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
241	// '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
242	// '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
243	// '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
244	// '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
245	// '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
246	// '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337',
247	// '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
248	// '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
249	// '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
250	// '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
251	// };
252
253	int DictCompare (const UCArray &a1, const UCArray &a2) {
254	unsigned short a1_out[256]; /* temp space */
255	unsigned short a2_out[256]; /* temp space */
256
257	unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
258	unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
259
260	/* decode the words to unicode */
261	utf8_word_to_unicode (a1_str, a1_out, 255);
262	utf8_word_to_unicode (a2_str, a2_out, 255);
263
264	delete []a1_str;
265	delete []a2_str;
266
267	int l1 = a1_out[0];
268	int l2 = a2_out[0];
269
270	int len = (l1 < l2) ? l1 : l2;
271	int pos = 0;
272	int diff = 0;
273	for (int i=1; i<=len; ++i) {
274	if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) -
275	unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) {
276	return diff;
277	}
278	if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) {
279	pos = diff;
280	}
281	}
282	return ((l1 - l2) ? (l1 - l2) : (pos));
283
284	}
285
286	// int DictCompare (const UCArray &a1, const UCArray &a2) {
287	// unsigned int l1 = a1.size();
288	// unsigned int l2 = a2.size();
289	// unsigned int l = (l1 < l2) ? l1 : l2;
290	// int pos = 0;
291	// register int diff = 0;
292
293	// UCArray::const_iterator a1Here = a1.begin();
294	// UCArray::const_iterator a2Here = a2.begin();
295
296	// while (l--) {
297	// if ((diff = casecharmap[a1Here] - casecharmap[a2Here]) != 0)
298	// return diff;
299	// if (pos == 0 && (diff = a1Here - a2Here) != 0)
300	// pos = diff;
301
302	// ++a1Here;
303	// ++a2Here;
304	// }
305
306	// return ((l1 - l2) ? (l1 - l2) : (pos));
307	// }
308
309	// does the first string start with the second?
310	bool StartsWith (const UCArray &a1, const UCArray &a2) {
311	unsigned int l1 = a1.size();
312	unsigned int l2 = a2.size();
313	if (l2 > l1) {
314	// if the prefix is longer than the string, it can't start with it
315	return false;
316	}
317	unsigned int l =l2;
318	UCArray::const_iterator a1Here = a1.begin();
319	UCArray::const_iterator a2Here = a2.begin();
320
321	while (l--) {
322	if ((a1Here != a2Here))
323	return false;
324	++a1Here;
325	++a2Here;
326	}
327	return true; // we have successfully matched the whole way
328
329	}
330
331	// does the first string start with the second, ignoring case?
332	bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
333	unsigned int l1 = a1.size();
334	unsigned int l2 = a2.size();
335	if (l2 > l1) {
336	// if the prefix is longer than the string, it can't start with it
337	return false;
338	}
339	unsigned short a1_out[256]; /* temp space */
340	unsigned short a2_out[256]; /* temp space */
341	unsigned char * a1_str = (unsigned char *)MyGetCStr(a1);
342	unsigned char * a2_str = (unsigned char *)MyGetCStr(a2);
343
344	/* decode the words to unicode */
345	utf8_word_to_unicode (a1_str, a1_out, 255);
346	utf8_word_to_unicode (a2_str, a2_out, 255);
347
348	delete []a1_str;
349	delete []a2_str;
350
351	unsigned int len = a2_out[0];;
352	for (int i=1; i<=len; ++i) {
353	if (unicode_tosimplified(unicode_tolower(a1_out[i])) !=
354	unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false;
355
356	}
357	return true; // we have successfully matched the whole way
358
359	}
360
361	// does the first string start with the second, ignoring case?
362	// bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
363	// unsigned int l1 = a1.size();
364	// unsigned int l2 = a2.size();
365	// if (l2 > l1) {
366	// // if the prefix is longer than the string, it can't start with it
367	// return false;
368	// }
369	// unsigned int l =l2;
370	// UCArray::const_iterator a1Here = a1.begin();
371	// UCArray::const_iterator a2Here = a2.begin();
372
373	// while (l--) {
374	// if (casecharmap[a1Here] != casecharmap[a2Here])
375	// return false;
376	// ++a1Here;
377	// ++a2Here;
378	// }
379	// return true; // we have successfully matched the whole way
380
381	// }
382
383
384	mg_u_long PrefixLen (const UCArray &a1, const UCArray &a2) {
385	mg_u_long l = (a1.size() < a2.size()) ? a1.size() : a2.size();
386	mg_u_long i = 0;
387
388	UCArray::const_iterator a1Here = a1.begin();
389	UCArray::const_iterator a2Here = a2.begin();
390
391	while (i < l && a1Here == a2Here) {
392	++i; ++a1Here; ++a2Here;
393	}
394
395	return i;
396	}
397
398	bool WritePreSufStr (FILE f, const UCArray prev, const UCArray &a) {
399	unsigned char preLen;
400	unsigned char sufLen;
401
402	if (prev != NULL) preLen = PrefixLen (*prev, a);
403	else preLen = 0;
404	sufLen = a.size() - preLen;
405
406	// output the prefix length, suffix length, and the suffix
407	fputc (preLen, f);
408	if (ferror(f) != 0) return false;
409	fputc (sufLen, f);
410	if (ferror(f) != 0) return false;
411	char* tmp=GetCStr(a);
412	int ret=(fwrite (tmp+preLen, sizeof (char), sufLen, f) == sufLen);
413	delete []tmp;
414	return (ret != 0);
415	}
416
417	// a also used for prev
418	bool ReadPreSufStr (FILE *f, UCArray &a) {
419	unsigned char preLen = 0;
420	unsigned char sufLen = 0;
421
422	preLen = fgetc(f);
423	sufLen = fgetc(f);
424
425	if (a.size() > preLen) a.erase (a.begin()+preLen, a.end());
426
427	// reserve the needed space in advance
428	if (a.capacity() < a.size() + sufLen + 1) {
429	a.reserve(a.size() + sufLen + 1);
430	}
431
432	while (sufLen > 0) {
433	unsigned char c = fgetc (f);
434	a.push_back (c);
435	--sufLen;
436	}
437
438	return true;
439	}
440

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mgpp/text/UCArray.cpp@ 25147

Download in other formats: