source: main/tags/2.51-jcdl/gsdl/lib/gsdlunicode.cpp@ 25200

Last change on this file since 25200 was 3667, checked in by jrm21, 21 years ago

added a utf16 output converter class.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
29// unitool is currently in mg, if mg is not being used it should
30// be moved into GSDLHOME/lib
31#include "unitool.h"
32
33#include "fileutil.h"
34
35#include <stdio.h>
36
37#if defined(GSDL_USE_OBJECTSPACE)
38# include <ospace\std\iostream>
39# include <ospace\std\fstream>
40#elif defined(GSDL_USE_IOS_H)
41# include <iostream.h>
42# include <fstream.h>
43#else
44# include <iostream>
45# include <fstream>
46#endif
47
48
49// converts a unicode encode text_t string to a utf-8
50// encoded text_t string
51text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
52 text_t out;
53
54 unsigned char thischar[MAXUTF8CHARLEN];
55 int i, charlen;
56
57 while (here != end) {
58 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
59 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
60 here++;
61 }
62
63 return out;
64}
65
66// converts a utf-8 encoded text_t string to a unicode
67// encoded text_t string
68text_t to_uni (const text_t &in) {
69 text_t out;
70 unsigned char *in_cstr = (unsigned char *)in.getcstr();
71 unsigned char *here = in_cstr;
72 unsigned char *end = in_cstr;
73
74 unsigned short unichar;
75 int charlen = 0;
76
77 // get the last valid character in the string
78 while (*end != '\0') end++;
79 end--;
80
81 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
82 out.push_back(unichar);
83 here += charlen;
84 }
85
86 delete in_cstr;
87
88 return out;
89}
90
91
92// this works for all unicode values < 65536...
93void utf16outconvertclass::convert (char *out, size_t maxlen, size_t &len, status_t &status) {
94 // we should already have text_t* input set...
95 if (input == NULL || out == NULL)
96 {
97 status = finished;
98 return;
99 }
100 unsigned char *output = (unsigned char *)out;
101 text_t::iterator textend = input->end();
102 len = 0;
103 if (maxlen % 2) maxlen--; // we need an even number of output bytes...
104 while ((len < maxlen) && (texthere != textend)) {
105 unsigned short int uni_char=(unsigned short int) *texthere;
106 // big endian utf-16...
107 if (uni_char < 256) {
108 out[len]=0;
109 out[len+1]=uni_char;
110 } else {
111 out[len]=uni_char >> 8;
112 out[len+1]=uni_char & 255;
113 }
114 len+=2;
115 ++texthere;
116 }
117 if (texthere==textend)
118 status=finished;
119 else
120 status=unfinished;
121}
122
123
124utf8inconvertclass::utf8inconvertclass () {
125 utf8buflen = 0;
126}
127
128void utf8inconvertclass::reset () {
129 start = NULL;
130 len = 0;
131 utf8buflen=0;
132}
133
134void utf8inconvertclass::convert (text_t &output, status_t &status) {
135 output.clear();
136 output.reserve (len/3);
137
138 if (start == NULL || len == 0) {
139 if (utf8buflen == 0) status = finished;
140 else status = stopped;
141 return;
142 }
143
144 // don't want any funny sign conversions happening
145 unsigned char *here = (unsigned char *)start;
146 unsigned char *end = here+len-1;
147 unsigned short c;
148 size_t realcharlen;
149
150 size_t charlen = getutf8charlen ();
151 while (len > 0) {
152 if (charlen == 0) {
153 // start parsing a new character
154 utf8buflen = 0;
155
156 // fast common case
157 while (len > 3) {
158 realcharlen = parse_utf8_char (here, end, &c);
159 output.push_back (c);
160 here += realcharlen;
161 len -= realcharlen;
162 }
163
164 utf8buf[utf8buflen++] = *here;
165 ++here;
166 --len;
167 charlen = getutf8charlen ();
168
169 } else if (utf8buflen < charlen) {
170 // assumes charlen is always less than MAXUTF8CHARLEN
171 utf8buf[utf8buflen++] = *here;
172 ++here;
173 --len;
174 }
175
176 if (utf8buflen == charlen) {
177 // got a complete character
178 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
179 output.push_back (c);
180
181 // move any unparsed characters. If an error occurred some of
182 // the characters might be unused.
183 int i;
184 int diff = utf8buflen - realcharlen;
185 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
186 utf8buflen = diff;
187 charlen = getutf8charlen ();
188 }
189 }
190
191 start = (char *)here; // save current position
192
193 if (utf8buflen == 0) status = finished;
194 else status = stopped;
195}
196
197
198// returns the length that the current contents of the
199// utf8buf should be
200size_t utf8inconvertclass::getutf8charlen () {
201 if (utf8buflen == 0) return 0;
202
203 // one byte character
204 if (utf8buf[0] < 0x80) return 1;
205
206 // error, is not the start of a utf-8 character
207 if (utf8buf[0] < 0xc0) return 1;
208
209 // two bute character
210 if (utf8buf[0] < 0xe0) return 2;
211
212 // three byte character
213 if (utf8buf[0] < 0xf0) return 3;
214
215 // error, character too long for unicode
216 return 1;
217}
218
219
220void utf8outconvertclass::reset () {
221 input = NULL;
222 outs = NULL;
223 utf8buflen = 0;
224 utf8bufhere = 0;
225}
226
227// note that convert does not null-terminate the
228// output array of characters
229void utf8outconvertclass::convert (char *output, size_t maxlen,
230 size_t &len, status_t &status) {
231 if (input == NULL || output == NULL) {
232 if (utf8buflen == 0) status = finished;
233 else status = unfinished;
234 return;
235 }
236
237 // don't want any funny sign conversions happening
238 unsigned char *uoutput = (unsigned char *)output;
239 text_t::iterator textend = input->end();
240 len = 0;
241 while (len < maxlen) {
242 // empty the contents of the internal buffer
243 if (utf8buflen > 0) {
244 while (len < maxlen && utf8bufhere < utf8buflen) {
245 *uoutput = utf8buf[utf8bufhere];
246 uoutput++;
247 len++;
248 utf8bufhere++;
249 }
250
251 if (utf8bufhere == utf8buflen) {
252 utf8bufhere = 0;
253 utf8buflen = 0;
254 }
255 }
256
257 // fill up the buffer with the next character
258 if (utf8buflen == 0) {
259 if (texthere == textend) break; // finished!
260 if (!rzws || (*texthere != 0x200b))
261 utf8buflen = output_utf8_char (*texthere, utf8buf,
262 &utf8buf[MAXUTF8CHARLEN-1]);
263 texthere++;
264 utf8bufhere = 0;
265 }
266 }
267
268 if (texthere == textend && utf8buflen == 0) status = finished;
269 else status = unfinished;
270}
271
272
273
274
275
276
277mapdata_t::mapdata_t () {
278 int i;
279
280 // reset all the map ptrs to be NULL
281 for (i=0; i<256; i++) {
282 ptrs[i] = (unsigned short *)NULL;
283 }
284
285 // say nothing has been loaded
286 loaded = false;
287}
288
289
290mapconvert::mapconvert () {
291 absentc = 0;
292}
293
294// setmapfile will cause loadmapfile to be called when conversion is
295// needed
296bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
297 // check to see if the mapfile has been already loaded
298 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
299
300 unloadmapfile ();
301 mapfile = themapfile;
302 absentc = theabsentc;
303
304 return true;
305}
306
307
308
309// loadmapfile should be called before any conversion is done
310bool mapconvert::loadmapfile (const text_t &themapfile,
311 unsigned short theabsentc) {
312 FILE *mapfilein = (FILE *)NULL;
313
314 // check to see if the mapfile has been already loaded
315 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
316
317 unloadmapfile ();
318 mapfile = themapfile;
319 absentc = theabsentc;
320
321 // open the map file
322 char *cfilename = mapfile.getcstr();
323 if (cfilename == (char *)NULL) return false;
324 mapfilein = fopen(cfilename, "rb");
325 delete cfilename;
326
327 if (mapfilein == (FILE *)NULL) return false;
328
329 unsigned char c, n1, n2;
330 unsigned short *arrptr;
331 int i;
332 c = fgetc (mapfilein);
333 while (!feof (mapfilein)) {
334 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
335 // allocate a new array
336 arrptr = new unsigned short[256];
337 mapdata.ptrs[c] = arrptr;
338 } else arrptr = mapdata.ptrs[c];
339
340 // clear the array
341 for (i=0; i<256; i++) arrptr[i] = 0;
342
343 // read in this block
344 n1 = fgetc (mapfilein);
345 n2 = fgetc (mapfilein);
346 i=0;
347 while (!feof (mapfilein)) {
348 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
349
350 i++;
351 if (i >= 256) break;
352 n1 = fgetc (mapfilein);
353 n2 = fgetc (mapfilein);
354 }
355
356 c = fgetc (mapfilein);
357 }
358
359 mapdata.loaded = true;
360
361 return true;
362}
363
364void mapconvert::unloadmapfile () {
365 if (!mapdata.loaded) return;
366
367 int i;
368 for (i=0; i<256; i++) {
369 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
370 delete [] mapdata.ptrs[i];
371 mapdata.ptrs[i] = (unsigned short *)NULL;
372 }
373 }
374
375 mapdata.loaded = false;
376}
377
378
379unsigned short mapconvert::convert (unsigned short c) {
380 if (!mapdata.loaded) {
381 if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
382 // do nothing, successfully loaded database
383 } else return absentc;
384 }
385
386 if (c == 0) return 0; // 0 always maps to 0...
387
388 unsigned short n1 = c >> 8;
389 unsigned short n2 = c & 0xff;
390
391 unsigned short *arrptr = mapdata.ptrs[n1];
392 if (arrptr == (unsigned short *)NULL) return absentc;
393
394 if (arrptr[n2] == 0) return absentc;
395 return arrptr[n2];
396}
397
398text_t mapconvert::convert (const text_t &instr) {
399 if (!mapdata.loaded) return absentc;
400
401 text_t outstr;
402 text_t::const_iterator here = instr.begin();
403 text_t::const_iterator end = instr.end();
404
405 while (here != end) {
406 outstr.push_back(this->convert(*here));
407 here++;
408 }
409
410 return outstr;
411}
412
413
414
415
416mapinconvertclass::mapinconvertclass () {
417 multibyte = 0;
418 mapbuflen = 0;
419}
420
421void mapinconvertclass::reset () {
422 start = NULL;
423 len = 0;
424 mapbuflen=0;
425}
426
427void mapinconvertclass::convert (text_t &output, status_t &status) {
428 output.clear();
429
430 if (start == NULL || len == 0) {
431 if (mapbuflen == 0) status = finished;
432 else status = stopped;
433 return;
434 }
435
436 // don't want any funny sign conversions happening
437 unsigned char *here = (unsigned char *)start;
438
439 size_t charlen = getmapcharlen ();
440 while (len > 0) {
441 if (charlen == 0) {
442 // start parsing a new character
443 mapbuflen = 0;
444 mapbuf[mapbuflen++] = *here;
445 ++here;
446 --len;
447 charlen = getmapcharlen ();
448
449 } else if (mapbuflen < charlen) {
450 // assumes charlen is always less than MAXMAPCHARLEN
451 mapbuf[mapbuflen++] = *here;
452 ++here;
453 --len;
454 }
455
456 if (mapbuflen == charlen) {
457 // got a complete character
458 if (charlen == 1) {
459 if (mapbuf[0] < 0x80) {
460 // ascii character
461 output.push_back (mapbuf[0]);
462 } else {
463 output.push_back (converter.convert((unsigned short)mapbuf[0]));
464 }
465
466 } else {
467 // two byte character
468 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
469 (unsigned short)mapbuf[1]));
470 }
471
472 mapbuflen = 0;
473 charlen = 0;
474 }
475 }
476
477 start = (char *)here; // save current position
478
479 if (mapbuflen == 0) status = finished;
480 else status = stopped;
481}
482
483
484
485mapoutconvertclass::mapoutconvertclass () {
486 multibyte = 0;
487 mapbuflen=0;
488 mapbufhere=0;
489}
490
491void mapoutconvertclass::reset () {
492 input = NULL;
493 outs = NULL;
494 mapbuflen = 0;
495 mapbufhere = 0;
496}
497
498// note that convert does not null-terminate the
499// output array of characters
500void mapoutconvertclass::convert (char *output, size_t maxlen,
501 size_t &len, status_t &status) {
502 unsigned short outc;
503
504 if (input == NULL || output == NULL) {
505 if (mapbuflen == 0) status = finished;
506 else status = unfinished;
507 return;
508 }
509
510 // don't want any funny sign conversions happening
511 unsigned char *uoutput = (unsigned char *)output;
512 text_t::iterator textend = input->end();
513 len = 0;
514 while (len < maxlen) {
515 // empty the contents of the internal buffer
516 if (mapbuflen > 0) {
517 while (len < maxlen && mapbufhere < mapbuflen) {
518 *uoutput = mapbuf[mapbufhere];
519 uoutput++;
520 len++;
521 mapbufhere++;
522 }
523
524 if (mapbufhere == mapbuflen) {
525 mapbufhere = 0;
526 mapbuflen = 0;
527 }
528 }
529
530 // fill up the buffer with the next character
531 if (mapbuflen == 0) {
532 if (texthere == textend) break; // finished!
533 if (!rzws || (*texthere != 0x200b)) {
534 if (*texthere < 0x80) {
535 mapbuf[0] = (unsigned char)*texthere;
536 mapbuflen = 1;
537 } else {
538 outc = converter.convert (*texthere);
539 if (multibyte) {
540 mapbuf[0] = (unsigned char)(outc >> 8);
541 mapbuf[1] = (unsigned char)(outc & 0xff);
542 mapbuflen = 2;
543 } else {
544 mapbuf[0] = outc;
545 mapbuflen = 1;
546 }
547 }
548 }
549
550 texthere++;
551 mapbufhere = 0;
552 }
553 }
554
555 if (texthere == textend && mapbuflen == 0) status = finished;
556 else status = unfinished;
557}
558
559
560bool simplemapconvert::loadmapfile (bool in) {
561 if (loaded) return true;
562 if (mapfile.empty()) return false;
563
564 char *cfilename = mapfile.getcstr();
565#ifdef GSDL_USE_IOS_H
566 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
567#else
568 ifstream mapfilein (cfilename, ios::in);
569#endif
570 delete cfilename;
571 if (!mapfilein) return false;
572
573 char cline[2048];
574 text_t line;
575
576 while (!mapfilein.eof()) {
577 mapfilein.getline (cline, 2048);
578 line.clear();
579 line.appendcstr (cline);
580 if (line.empty()) continue;
581 // remove comments
582 text_t::iterator end = line.end();
583 text_t::iterator here = findchar (line.begin(), end, '#');
584 if (here != end) {
585 line.erase (here, end);
586 if (line.empty()) continue;
587 }
588
589 text_tarray parts;
590 splitchar (line.begin(), line.end(), '\t', parts);
591
592 // do some simple sanity checks
593 if (parts.size() < 2) continue;
594 text_t::iterator begin1 = parts[0].begin();
595 text_t::iterator begin2 = parts[1].begin();
596 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
597 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
598 char *from = parts[0].getcstr();
599 char *to = parts[1].getcstr();
600 unsigned int f = 0, t = 0;
601 sscanf (from, "%i", &f);
602 sscanf (to, "%i", &t);
603 delete from;
604 delete to;
605
606 if (in) mapping[(unsigned short)f] = (unsigned short)t;
607 else mapping[(unsigned short)t] = (unsigned short)f;
608 }
609
610 loaded = true;
611 return true;
612}
613
614unsigned short simplemapconvert::convert (unsigned short c, bool in) {
615
616 if (!loaded)
617 if (!loadmapfile(in)) return absentc;
618
619 return mapping[c];
620}
621
622
623void simplemapinconvertclass::convert (text_t &output, status_t &status) {
624 output.clear();
625
626 if (start == NULL || len == 0) {
627 status = finished;
628 return;
629 }
630
631 // don't want any funny sign conversions happening
632 unsigned char *here = (unsigned char *)start;
633 while (len > 0) {
634
635 if (*here < 0x80)
636 output.push_back (*here); // append this character
637 else
638 output.push_back (converter.convert(*here, true));
639
640 ++here;
641 --len;
642 }
643
644 start = (char *)here; // save current position
645 status = finished;
646}
647
648
649void simplemapoutconvertclass::convert (char *output, size_t maxlen,
650 size_t &len, status_t &status) {
651
652 if (input == NULL || output == NULL) {
653 status = finished;
654 return;
655 }
656
657 // don't want any funny sign conversions happening
658 unsigned char *uoutput = (unsigned char *)output;
659 text_t::iterator textend = input->end();
660 len = 0;
661 while ((len < maxlen) && (texthere != textend)) {
662
663 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
664 else *uoutput = converter.convert (*texthere, false);
665
666 ++uoutput;
667 ++len;
668 ++texthere;
669 }
670
671 if (texthere == textend) status = finished;
672 else status = unfinished;
673}
Note: See TracBrowser for help on using the repository browser.