source: main/tags/2.13/gsdl/lib/gsdlunicode.cpp@ 24552

Last change on this file since 24552 was 534, checked in by sjboddie, 25 years ago

added gpl notice

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.6 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: gsdlunicode.cpp 534 1999-09-07 04:57:43Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.10 1999/09/07 04:57:43 sjboddie
31 added gpl notice
32
33 Revision 1.9 1999/07/21 07:23:17 rjmcnab
34 Added setmapfile function to map conversion utilities so the map file
35 does not need to be loaded when map conversion object is created.
36
37 Revision 1.8 1999/07/01 04:03:45 rjmcnab
38 Optimised utf8inconvertclass::convert slightly.
39
40 Revision 1.7 1999/06/30 04:59:03 rjmcnab
41 Added a to_utf8 function that takes iterators as input.
42
43 Revision 1.6 1999/06/26 01:05:04 rjmcnab
44 No real changes.
45
46 Revision 1.5 1999/01/12 01:50:59 rjmcnab
47
48 Standard header.
49
50 Revision 1.4 1999/01/08 02:33:15 rjmcnab
51
52 Added standard header to source files.
53
54 */
55
56
57#include "gsdlunicode.h"
58
59
60// unitool is currently in mg, if mg is not being used it should
61// be moved into GSDLHOME/lib
62#include "unitool.h"
63
64#include "fileutil.h"
65
66#include <stdio.h>
67
68
69
70// converts a unicode encode text_t string to a utf-8
71// encoded text_t string
72text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
73 text_t out;
74
75 unsigned char thischar[MAXUTF8CHARLEN];
76 int i, charlen;
77
78 while (here != end) {
79 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
80 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
81 here++;
82 }
83
84 return out;
85}
86
87// converts a utf-8 encoded text_t string to a unicode
88// encoded text_t string
89text_t to_uni (const text_t &in) {
90 text_t out;
91 unsigned char *in_cstr = (unsigned char *)in.getcstr();
92 unsigned char *here = in_cstr;
93 unsigned char *end = in_cstr;
94
95 unsigned short unichar;
96 int charlen = 0;
97
98 // get the last valid character in the string
99 while (*end != '\0') end++;
100 end--;
101
102 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
103 out.push_back(unichar);
104 here += charlen;
105 }
106
107 delete in_cstr;
108
109 return out;
110}
111
112
113
114utf8inconvertclass::utf8inconvertclass () {
115 utf8buflen = 0;
116}
117
118void utf8inconvertclass::reset () {
119 start = NULL;
120 len = 0;
121 utf8buflen=0;
122}
123
124void utf8inconvertclass::convert (text_t &output, status_t &status) {
125 output.clear();
126 output.reserve (len/3);
127
128 if (start == NULL || len == 0) {
129 if (utf8buflen == 0) status = finished;
130 else status = stopped;
131 return;
132 }
133
134 // don't want any funny sign conversions happening
135 unsigned char *here = (unsigned char *)start;
136 unsigned char *end = here+len-1;
137 unsigned short c;
138 size_t realcharlen;
139
140 size_t charlen = getutf8charlen ();
141 while (len > 0) {
142 if (charlen == 0) {
143 // start parsing a new character
144 utf8buflen = 0;
145
146 // fast common case
147 while (len > 3) {
148 realcharlen = parse_utf8_char (here, end, &c);
149 output.push_back (c);
150 here += realcharlen;
151 len -= realcharlen;
152 }
153
154 utf8buf[utf8buflen++] = *here;
155 ++here;
156 --len;
157 charlen = getutf8charlen ();
158
159 } else if (utf8buflen < charlen) {
160 // assumes charlen is always less than MAXUTF8CHARLEN
161 utf8buf[utf8buflen++] = *here;
162 ++here;
163 --len;
164 }
165
166 if (utf8buflen == charlen) {
167 // got a complete character
168 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
169 output.push_back (c);
170
171 // move any unparsed characters. If an error occurred some of
172 // the characters might be unused.
173 int i;
174 int diff = utf8buflen - realcharlen;
175 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
176 utf8buflen = diff;
177 charlen = getutf8charlen ();
178 }
179 }
180
181 start = (char *)here; // save current position
182
183 if (utf8buflen == 0) status = finished;
184 else status = stopped;
185}
186
187
188// returns the length that the current contents of the
189// utf8buf should be
190size_t utf8inconvertclass::getutf8charlen () {
191 if (utf8buflen == 0) return 0;
192
193 // one byte character
194 if (utf8buf[0] < 0x80) return 1;
195
196 // error, is not the start of a utf-8 character
197 if (utf8buf[0] < 0xc0) return 1;
198
199 // two bute character
200 if (utf8buf[0] < 0xe0) return 2;
201
202 // three byte character
203 if (utf8buf[0] < 0xf0) return 3;
204
205 // error, character too long for unicode
206 return 1;
207}
208
209
210void utf8outconvertclass::reset () {
211 input = NULL;
212 outs = NULL;
213 utf8buflen = 0;
214 utf8bufhere = 0;
215}
216
217// note that convert does not null-terminate the
218// output array of characters
219void utf8outconvertclass::convert (char *output, size_t maxlen,
220 size_t &len, status_t &status) {
221 if (input == NULL || output == NULL) {
222 if (utf8buflen == 0) status = finished;
223 else status = unfinished;
224 return;
225 }
226
227 // don't want any funny sign conversions happening
228 unsigned char *uoutput = (unsigned char *)output;
229 text_t::iterator textend = input->end();
230 len = 0;
231 while (len < maxlen) {
232 // empty the contents of the internal buffer
233 if (utf8buflen > 0) {
234 while (len < maxlen && utf8bufhere < utf8buflen) {
235 *uoutput = utf8buf[utf8bufhere];
236 uoutput++;
237 len++;
238 utf8bufhere++;
239 }
240
241 if (utf8bufhere == utf8buflen) {
242 utf8bufhere = 0;
243 utf8buflen = 0;
244 }
245 }
246
247 // fill up the buffer with the next character
248 if (utf8buflen == 0) {
249 if (texthere == textend) break; // finished!
250 if (!rzws || (*texthere != 0x200b))
251 utf8buflen = output_utf8_char (*texthere, utf8buf,
252 &utf8buf[MAXUTF8CHARLEN-1]);
253 texthere++;
254 utf8bufhere = 0;
255 }
256 }
257
258 if (texthere == textend && utf8buflen == 0) status = finished;
259 else status = unfinished;
260}
261
262
263
264
265
266
267mapdata_t::mapdata_t () {
268 int i;
269
270 // reset all the map ptrs to be NULL
271 for (i=0; i<256; i++) {
272 ptrs[i] = (unsigned short *)NULL;
273 }
274
275 // say nothing has been loaded
276 loaded = false;
277}
278
279
280mapconvert::mapconvert () {
281 absentc = 0;
282}
283
284// setmapfile will cause loadmapfile to be called when conversion is
285// needed
286bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
287 unsigned short theabsentc) {
288 // check to see if the mapfile has been already loaded
289 if (mapdata.loaded && gsdlhome == thegsdlhome &&
290 encoding == theencoding && absentc == theabsentc)
291 return true;
292
293 unloadmapfile ();
294 gsdlhome = thegsdlhome;
295 encoding = theencoding;
296 absentc = theabsentc;
297
298 return true;
299}
300
301
302
303// loadmapfile should be called before any conversion is done
304bool mapconvert::loadmapfile (const text_t &thegsdlhome,
305 const text_t &theencoding,
306 unsigned short theabsentc) {
307 FILE *mapfilein = (FILE *)NULL;
308
309 // check to see if the mapfile has been already loaded
310 if (mapdata.loaded && gsdlhome == thegsdlhome &&
311 encoding == theencoding && absentc == theabsentc)
312 return true;
313
314 unloadmapfile ();
315 gsdlhome = thegsdlhome;
316 encoding = theencoding;
317 absentc = theabsentc;
318
319 // open the map file
320 text_t filename = filename_cat (gsdlhome, "unicode");
321 filename = filename_cat (filename, encoding);
322 filename += ".ump";
323 char *cfilename = filename.getcstr();
324 if (cfilename == (char *)NULL) return false;
325 mapfilein = fopen(cfilename, "rb");
326 delete cfilename;
327
328 if (mapfilein == (FILE *)NULL) return false;
329
330 unsigned char c, n1, n2;
331 unsigned short *arrptr;
332 int i;
333 c = fgetc (mapfilein);
334 while (!feof (mapfilein)) {
335 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
336 // allocate a new array
337 arrptr = new unsigned short[256];
338 mapdata.ptrs[c] = arrptr;
339 } else arrptr = mapdata.ptrs[c];
340
341 // clear the array
342 for (i=0; i<256; i++) arrptr[i] = 0;
343
344 // read in this block
345 n1 = fgetc (mapfilein);
346 n2 = fgetc (mapfilein);
347 i=0;
348 while (!feof (mapfilein)) {
349 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
350
351 i++;
352 if (i >= 256) break;
353 n1 = fgetc (mapfilein);
354 n2 = fgetc (mapfilein);
355 }
356
357 c = fgetc (mapfilein);
358 }
359
360 mapdata.loaded = true;
361
362 return true;
363}
364
365void mapconvert::unloadmapfile () {
366 if (!mapdata.loaded) return;
367
368 int i;
369 for (i=0; i<256; i++) {
370 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
371 delete [] mapdata.ptrs[i];
372 mapdata.ptrs[i] = (unsigned short *)NULL;
373 }
374 }
375
376 mapdata.loaded = false;
377}
378
379
380unsigned short mapconvert::convert (unsigned short c) {
381 if (!mapdata.loaded) {
382 if (!gsdlhome.empty() && !encoding.empty() &&
383 loadmapfile (gsdlhome, encoding, absentc)) {
384 // do nothing, successfully loaded database
385 } else return absentc;
386 }
387
388 if (c == 0) return 0; // 0 always maps to 0...
389
390 unsigned short n1 = c >> 8;
391 unsigned short n2 = c & 0xff;
392
393 unsigned short *arrptr = mapdata.ptrs[n1];
394 if (arrptr == (unsigned short *)NULL) return absentc;
395
396 if (arrptr[n2] == 0) return absentc;
397 return arrptr[n2];
398}
399
400text_t mapconvert::convert (const text_t &instr) {
401 if (!mapdata.loaded) return absentc;
402
403 text_t outstr;
404 text_t::const_iterator here = instr.begin();
405 text_t::const_iterator end = instr.end();
406
407 while (here != end) {
408 outstr.push_back(this->convert(*here));
409 here++;
410 }
411
412 return outstr;
413}
414
415
416
417
418mapinconvertclass::mapinconvertclass () {
419 mapbuflen = 0;
420}
421
422void mapinconvertclass::reset () {
423 start = NULL;
424 len = 0;
425 mapbuflen=0;
426}
427
428void mapinconvertclass::convert (text_t &output, status_t &status) {
429 output.clear();
430
431 if (start == NULL || len == 0) {
432 if (mapbuflen == 0) status = finished;
433 else status = stopped;
434 return;
435 }
436
437 // don't want any funny sign conversions happening
438 unsigned char *here = (unsigned char *)start;
439
440 size_t charlen = getmapcharlen ();
441 while (len > 0) {
442 if (charlen == 0) {
443 // start parsing a new character
444 mapbuflen = 0;
445 mapbuf[mapbuflen++] = *here;
446 ++here;
447 --len;
448 charlen = getmapcharlen ();
449
450 } else if (mapbuflen < charlen) {
451 // assumes charlen is always less than MAXMAPCHARLEN
452 mapbuf[mapbuflen++] = *here;
453 ++here;
454 --len;
455 }
456
457 if (mapbuflen == charlen) {
458 // got a complete character
459 if (charlen == 1) {
460 // ascii character
461 output.push_back (mapbuf[0]);
462
463 } else {
464 // two byte character
465 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
466 (unsigned short)mapbuf[1]));
467 }
468
469 mapbuflen = 0;
470 charlen = 0;
471 }
472 }
473
474 start = (char *)here; // save current position
475
476 if (mapbuflen == 0) status = finished;
477 else status = stopped;
478}
479
480
481
482mapoutconvertclass::mapoutconvertclass () {
483 mapbuflen=0;
484 mapbufhere=0;
485}
486
487void mapoutconvertclass::reset () {
488 input = NULL;
489 outs = NULL;
490 mapbuflen = 0;
491 mapbufhere = 0;
492}
493
494// note that convert does not null-terminate the
495// output array of characters
496void mapoutconvertclass::convert (char *output, size_t maxlen,
497 size_t &len, status_t &status) {
498 unsigned short outc;
499
500 if (input == NULL || output == NULL) {
501 if (mapbuflen == 0) status = finished;
502 else status = unfinished;
503 return;
504 }
505
506 // don't want any funny sign conversions happening
507 unsigned char *uoutput = (unsigned char *)output;
508 text_t::iterator textend = input->end();
509 len = 0;
510 while (len < maxlen) {
511 // empty the contents of the internal buffer
512 if (mapbuflen > 0) {
513 while (len < maxlen && mapbufhere < mapbuflen) {
514 *uoutput = mapbuf[mapbufhere];
515 uoutput++;
516 len++;
517 mapbufhere++;
518 }
519
520 if (mapbufhere == mapbuflen) {
521 mapbufhere = 0;
522 mapbuflen = 0;
523 }
524 }
525
526 // fill up the buffer with the next character
527 if (mapbuflen == 0) {
528 if (texthere == textend) break; // finished!
529 if (!rzws || (*texthere != 0x200b)) {
530 if (*texthere < 0x80) {
531 mapbuf[0] = (unsigned char)*texthere;
532 mapbuflen = 1;
533 } else {
534 outc = converter.convert (*texthere);
535 mapbuf[0] = (unsigned char)(outc >> 8);
536 mapbuf[1] = (unsigned char)(outc & 0xff);
537 mapbuflen = 2;
538 }
539 }
540
541 texthere++;
542 mapbufhere = 0;
543 }
544 }
545
546 if (texthere == textend && mapbuflen == 0) status = finished;
547 else status = unfinished;
548}
Note: See TracBrowser for help on using the repository browser.