source: trunk/gsdl/lib/gsdlunicode.cpp@ 1870

Last change on this file since 1870 was 1870, checked in by sjboddie, 23 years ago

Tidied up language support stuff.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
29// unitool is currently in mg, if mg is not being used it should
30// be moved into GSDLHOME/lib
31#include "unitool.h"
32
33#include "fileutil.h"
34
35#include <stdio.h>
36
37#if defined(GSDL_USE_OBJECTSPACE)
38# include <ospace\std\iostream>
39# include <ospace\std\fstream>
40#elif defined(GSDL_USE_IOS_H)
41# include <iostream.h>
42# include <fstream.h>
43#else
44# include <iostream>
45# include <fstream>
46#endif
47
48
49// converts a unicode encode text_t string to a utf-8
50// encoded text_t string
51text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
52 text_t out;
53
54 unsigned char thischar[MAXUTF8CHARLEN];
55 int i, charlen;
56
57 while (here != end) {
58 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
59 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
60 here++;
61 }
62
63 return out;
64}
65
66// converts a utf-8 encoded text_t string to a unicode
67// encoded text_t string
68text_t to_uni (const text_t &in) {
69 text_t out;
70 unsigned char *in_cstr = (unsigned char *)in.getcstr();
71 unsigned char *here = in_cstr;
72 unsigned char *end = in_cstr;
73
74 unsigned short unichar;
75 int charlen = 0;
76
77 // get the last valid character in the string
78 while (*end != '\0') end++;
79 end--;
80
81 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
82 out.push_back(unichar);
83 here += charlen;
84 }
85
86 delete in_cstr;
87
88 return out;
89}
90
91
92
93utf8inconvertclass::utf8inconvertclass () {
94 utf8buflen = 0;
95}
96
97void utf8inconvertclass::reset () {
98 start = NULL;
99 len = 0;
100 utf8buflen=0;
101}
102
103void utf8inconvertclass::convert (text_t &output, status_t &status) {
104 output.clear();
105 output.reserve (len/3);
106
107 if (start == NULL || len == 0) {
108 if (utf8buflen == 0) status = finished;
109 else status = stopped;
110 return;
111 }
112
113 // don't want any funny sign conversions happening
114 unsigned char *here = (unsigned char *)start;
115 unsigned char *end = here+len-1;
116 unsigned short c;
117 size_t realcharlen;
118
119 size_t charlen = getutf8charlen ();
120 while (len > 0) {
121 if (charlen == 0) {
122 // start parsing a new character
123 utf8buflen = 0;
124
125 // fast common case
126 while (len > 3) {
127 realcharlen = parse_utf8_char (here, end, &c);
128 output.push_back (c);
129 here += realcharlen;
130 len -= realcharlen;
131 }
132
133 utf8buf[utf8buflen++] = *here;
134 ++here;
135 --len;
136 charlen = getutf8charlen ();
137
138 } else if (utf8buflen < charlen) {
139 // assumes charlen is always less than MAXUTF8CHARLEN
140 utf8buf[utf8buflen++] = *here;
141 ++here;
142 --len;
143 }
144
145 if (utf8buflen == charlen) {
146 // got a complete character
147 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
148 output.push_back (c);
149
150 // move any unparsed characters. If an error occurred some of
151 // the characters might be unused.
152 int i;
153 int diff = utf8buflen - realcharlen;
154 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
155 utf8buflen = diff;
156 charlen = getutf8charlen ();
157 }
158 }
159
160 start = (char *)here; // save current position
161
162 if (utf8buflen == 0) status = finished;
163 else status = stopped;
164}
165
166
167// returns the length that the current contents of the
168// utf8buf should be
169size_t utf8inconvertclass::getutf8charlen () {
170 if (utf8buflen == 0) return 0;
171
172 // one byte character
173 if (utf8buf[0] < 0x80) return 1;
174
175 // error, is not the start of a utf-8 character
176 if (utf8buf[0] < 0xc0) return 1;
177
178 // two bute character
179 if (utf8buf[0] < 0xe0) return 2;
180
181 // three byte character
182 if (utf8buf[0] < 0xf0) return 3;
183
184 // error, character too long for unicode
185 return 1;
186}
187
188
189void utf8outconvertclass::reset () {
190 input = NULL;
191 outs = NULL;
192 utf8buflen = 0;
193 utf8bufhere = 0;
194}
195
196// note that convert does not null-terminate the
197// output array of characters
198void utf8outconvertclass::convert (char *output, size_t maxlen,
199 size_t &len, status_t &status) {
200 if (input == NULL || output == NULL) {
201 if (utf8buflen == 0) status = finished;
202 else status = unfinished;
203 return;
204 }
205
206 // don't want any funny sign conversions happening
207 unsigned char *uoutput = (unsigned char *)output;
208 text_t::iterator textend = input->end();
209 len = 0;
210 while (len < maxlen) {
211 // empty the contents of the internal buffer
212 if (utf8buflen > 0) {
213 while (len < maxlen && utf8bufhere < utf8buflen) {
214 *uoutput = utf8buf[utf8bufhere];
215 uoutput++;
216 len++;
217 utf8bufhere++;
218 }
219
220 if (utf8bufhere == utf8buflen) {
221 utf8bufhere = 0;
222 utf8buflen = 0;
223 }
224 }
225
226 // fill up the buffer with the next character
227 if (utf8buflen == 0) {
228 if (texthere == textend) break; // finished!
229 if (!rzws || (*texthere != 0x200b))
230 utf8buflen = output_utf8_char (*texthere, utf8buf,
231 &utf8buf[MAXUTF8CHARLEN-1]);
232 texthere++;
233 utf8bufhere = 0;
234 }
235 }
236
237 if (texthere == textend && utf8buflen == 0) status = finished;
238 else status = unfinished;
239}
240
241
242
243
244
245
246mapdata_t::mapdata_t () {
247 int i;
248
249 // reset all the map ptrs to be NULL
250 for (i=0; i<256; i++) {
251 ptrs[i] = (unsigned short *)NULL;
252 }
253
254 // say nothing has been loaded
255 loaded = false;
256}
257
258
259mapconvert::mapconvert () {
260 absentc = 0;
261}
262
263// setmapfile will cause loadmapfile to be called when conversion is
264// needed
265bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
266 // check to see if the mapfile has been already loaded
267 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
268
269 unloadmapfile ();
270 mapfile = themapfile;
271 absentc = theabsentc;
272
273 return true;
274}
275
276
277
278// loadmapfile should be called before any conversion is done
279bool mapconvert::loadmapfile (const text_t &themapfile,
280 unsigned short theabsentc) {
281 FILE *mapfilein = (FILE *)NULL;
282
283 // check to see if the mapfile has been already loaded
284 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
285
286 unloadmapfile ();
287 mapfile = themapfile;
288 absentc = theabsentc;
289
290 // open the map file
291 char *cfilename = mapfile.getcstr();
292 if (cfilename == (char *)NULL) return false;
293 mapfilein = fopen(cfilename, "rb");
294 delete cfilename;
295
296 if (mapfilein == (FILE *)NULL) return false;
297
298 unsigned char c, n1, n2;
299 unsigned short *arrptr;
300 int i;
301 c = fgetc (mapfilein);
302 while (!feof (mapfilein)) {
303 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
304 // allocate a new array
305 arrptr = new unsigned short[256];
306 mapdata.ptrs[c] = arrptr;
307 } else arrptr = mapdata.ptrs[c];
308
309 // clear the array
310 for (i=0; i<256; i++) arrptr[i] = 0;
311
312 // read in this block
313 n1 = fgetc (mapfilein);
314 n2 = fgetc (mapfilein);
315 i=0;
316 while (!feof (mapfilein)) {
317 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
318
319 i++;
320 if (i >= 256) break;
321 n1 = fgetc (mapfilein);
322 n2 = fgetc (mapfilein);
323 }
324
325 c = fgetc (mapfilein);
326 }
327
328 mapdata.loaded = true;
329
330 return true;
331}
332
333void mapconvert::unloadmapfile () {
334 if (!mapdata.loaded) return;
335
336 int i;
337 for (i=0; i<256; i++) {
338 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
339 delete [] mapdata.ptrs[i];
340 mapdata.ptrs[i] = (unsigned short *)NULL;
341 }
342 }
343
344 mapdata.loaded = false;
345}
346
347
348unsigned short mapconvert::convert (unsigned short c) {
349 if (!mapdata.loaded) {
350 if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
351 // do nothing, successfully loaded database
352 } else return absentc;
353 }
354
355 if (c == 0) return 0; // 0 always maps to 0...
356
357 unsigned short n1 = c >> 8;
358 unsigned short n2 = c & 0xff;
359
360 unsigned short *arrptr = mapdata.ptrs[n1];
361 if (arrptr == (unsigned short *)NULL) return absentc;
362
363 if (arrptr[n2] == 0) return absentc;
364 return arrptr[n2];
365}
366
367text_t mapconvert::convert (const text_t &instr) {
368 if (!mapdata.loaded) return absentc;
369
370 text_t outstr;
371 text_t::const_iterator here = instr.begin();
372 text_t::const_iterator end = instr.end();
373
374 while (here != end) {
375 outstr.push_back(this->convert(*here));
376 here++;
377 }
378
379 return outstr;
380}
381
382
383
384
385mapinconvertclass::mapinconvertclass () {
386 mapbuflen = 0;
387}
388
389void mapinconvertclass::reset () {
390 start = NULL;
391 len = 0;
392 mapbuflen=0;
393}
394
395void mapinconvertclass::convert (text_t &output, status_t &status) {
396 output.clear();
397
398 if (start == NULL || len == 0) {
399 if (mapbuflen == 0) status = finished;
400 else status = stopped;
401 return;
402 }
403
404 // don't want any funny sign conversions happening
405 unsigned char *here = (unsigned char *)start;
406
407 size_t charlen = getmapcharlen ();
408 while (len > 0) {
409 if (charlen == 0) {
410 // start parsing a new character
411 mapbuflen = 0;
412 mapbuf[mapbuflen++] = *here;
413 ++here;
414 --len;
415 charlen = getmapcharlen ();
416
417 } else if (mapbuflen < charlen) {
418 // assumes charlen is always less than MAXMAPCHARLEN
419 mapbuf[mapbuflen++] = *here;
420 ++here;
421 --len;
422 }
423
424 if (mapbuflen == charlen) {
425 // got a complete character
426 if (charlen == 1) {
427 // ascii character
428 output.push_back (mapbuf[0]);
429
430 } else {
431 // two byte character
432 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
433 (unsigned short)mapbuf[1]));
434 }
435
436 mapbuflen = 0;
437 charlen = 0;
438 }
439 }
440
441 start = (char *)here; // save current position
442
443 if (mapbuflen == 0) status = finished;
444 else status = stopped;
445}
446
447
448
449mapoutconvertclass::mapoutconvertclass () {
450 mapbuflen=0;
451 mapbufhere=0;
452}
453
454void mapoutconvertclass::reset () {
455 input = NULL;
456 outs = NULL;
457 mapbuflen = 0;
458 mapbufhere = 0;
459}
460
461// note that convert does not null-terminate the
462// output array of characters
463void mapoutconvertclass::convert (char *output, size_t maxlen,
464 size_t &len, status_t &status) {
465 unsigned short outc;
466
467 if (input == NULL || output == NULL) {
468 if (mapbuflen == 0) status = finished;
469 else status = unfinished;
470 return;
471 }
472
473 // don't want any funny sign conversions happening
474 unsigned char *uoutput = (unsigned char *)output;
475 text_t::iterator textend = input->end();
476 len = 0;
477 while (len < maxlen) {
478 // empty the contents of the internal buffer
479 if (mapbuflen > 0) {
480 while (len < maxlen && mapbufhere < mapbuflen) {
481 *uoutput = mapbuf[mapbufhere];
482 uoutput++;
483 len++;
484 mapbufhere++;
485 }
486
487 if (mapbufhere == mapbuflen) {
488 mapbufhere = 0;
489 mapbuflen = 0;
490 }
491 }
492
493 // fill up the buffer with the next character
494 if (mapbuflen == 0) {
495 if (texthere == textend) break; // finished!
496 if (!rzws || (*texthere != 0x200b)) {
497 if (*texthere < 0x80) {
498 mapbuf[0] = (unsigned char)*texthere;
499 mapbuflen = 1;
500 } else {
501 outc = converter.convert (*texthere);
502 mapbuf[0] = (unsigned char)(outc >> 8);
503 mapbuf[1] = (unsigned char)(outc & 0xff);
504 mapbuflen = 2;
505 }
506 }
507
508 texthere++;
509 mapbufhere = 0;
510 }
511 }
512
513 if (texthere == textend && mapbuflen == 0) status = finished;
514 else status = unfinished;
515}
516
517
518bool simplemapconvert::loadmapfile (bool in) {
519 if (loaded) return true;
520 if (mapfile.empty()) return false;
521
522 char *cfilename = mapfile.getcstr();
523#ifdef GSDL_USE_IOS_H
524 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
525#else
526 ifstream mapfilein (cfilename, ios::in);
527#endif
528 delete cfilename;
529 if (!mapfilein) return false;
530
531 char cline[2048];
532 text_t line;
533
534 while (!mapfilein.eof()) {
535 mapfilein.getline (cline, 2048);
536 line.clear();
537 line.appendcstr (cline);
538 if (line.empty()) continue;
539 // remove comments
540 text_t::iterator end = line.end();
541 text_t::iterator here = findchar (line.begin(), end, '#');
542 if (here != end) {
543 line.erase (here, end);
544 if (line.empty()) continue;
545 }
546
547 text_tarray parts;
548 splitchar (line.begin(), line.end(), '\t', parts);
549
550 // do some simple sanity checks
551 if (parts.size() < 2) continue;
552 text_t::iterator begin1 = parts[0].begin();
553 text_t::iterator begin2 = parts[1].begin();
554 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
555 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
556 char *from = parts[0].getcstr();
557 char *to = parts[1].getcstr();
558 unsigned int f = 0, t = 0;
559 sscanf (from, "%i", &f);
560 sscanf (to, "%i", &t);
561 delete from;
562 delete to;
563
564 if (in) mapping[(unsigned short)f] = (unsigned short)t;
565 else mapping[(unsigned short)t] = (unsigned short)f;
566 }
567
568 loaded = true;
569 return true;
570}
571
572unsigned short simplemapconvert::convert (unsigned short c, bool in) {
573
574 if (!loaded)
575 if (!loadmapfile(in)) return absentc;
576
577 return mapping[c];
578}
579
580
581void simplemapinconvertclass::convert (text_t &output, status_t &status) {
582 output.clear();
583
584 if (start == NULL || len == 0) {
585 status = finished;
586 return;
587 }
588
589 // don't want any funny sign conversions happening
590 unsigned char *here = (unsigned char *)start;
591 while (len > 0) {
592
593 if (*here < 0x80)
594 output.push_back (*here); // append this character
595 else
596 output.push_back (converter.convert(*here, true));
597
598 ++here;
599 --len;
600 }
601
602 start = (char *)here; // save current position
603 status = finished;
604}
605
606
607void simplemapoutconvertclass::convert (char *output, size_t maxlen,
608 size_t &len, status_t &status) {
609
610 if (input == NULL || output == NULL) {
611 status = finished;
612 return;
613 }
614
615 // don't want any funny sign conversions happening
616 unsigned char *uoutput = (unsigned char *)output;
617 text_t::iterator textend = input->end();
618 len = 0;
619 while ((len < maxlen) && (texthere != textend)) {
620
621 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
622 else *uoutput = converter.convert (*texthere, false);
623
624 ++uoutput;
625 ++len;
626 ++texthere;
627 }
628
629 if (texthere == textend) status = finished;
630 else status = unfinished;
631}
Note: See TracBrowser for help on using the repository browser.