source: trunk/gsdl/lib/gsdlunicode.cpp@ 1927

Last change on this file since 1927 was 1927, checked in by sjboddie, 23 years ago

Fixed a bug in the C++ encoding support - 8 bit encodings like windows-1251
were being treated as 16 bit encodings in some places

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
29// unitool is currently in mg, if mg is not being used it should
30// be moved into GSDLHOME/lib
31#include "unitool.h"
32
33#include "fileutil.h"
34
35#include <stdio.h>
36
37#if defined(GSDL_USE_OBJECTSPACE)
38# include <ospace\std\iostream>
39# include <ospace\std\fstream>
40#elif defined(GSDL_USE_IOS_H)
41# include <iostream.h>
42# include <fstream.h>
43#else
44# include <iostream>
45# include <fstream>
46#endif
47
48
49// converts a unicode encode text_t string to a utf-8
50// encoded text_t string
51text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
52 text_t out;
53
54 unsigned char thischar[MAXUTF8CHARLEN];
55 int i, charlen;
56
57 while (here != end) {
58 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
59 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
60 here++;
61 }
62
63 return out;
64}
65
66// converts a utf-8 encoded text_t string to a unicode
67// encoded text_t string
68text_t to_uni (const text_t &in) {
69 text_t out;
70 unsigned char *in_cstr = (unsigned char *)in.getcstr();
71 unsigned char *here = in_cstr;
72 unsigned char *end = in_cstr;
73
74 unsigned short unichar;
75 int charlen = 0;
76
77 // get the last valid character in the string
78 while (*end != '\0') end++;
79 end--;
80
81 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
82 out.push_back(unichar);
83 here += charlen;
84 }
85
86 delete in_cstr;
87
88 return out;
89}
90
91
92
93utf8inconvertclass::utf8inconvertclass () {
94 utf8buflen = 0;
95}
96
97void utf8inconvertclass::reset () {
98 start = NULL;
99 len = 0;
100 utf8buflen=0;
101}
102
103void utf8inconvertclass::convert (text_t &output, status_t &status) {
104 output.clear();
105 output.reserve (len/3);
106
107 if (start == NULL || len == 0) {
108 if (utf8buflen == 0) status = finished;
109 else status = stopped;
110 return;
111 }
112
113 // don't want any funny sign conversions happening
114 unsigned char *here = (unsigned char *)start;
115 unsigned char *end = here+len-1;
116 unsigned short c;
117 size_t realcharlen;
118
119 size_t charlen = getutf8charlen ();
120 while (len > 0) {
121 if (charlen == 0) {
122 // start parsing a new character
123 utf8buflen = 0;
124
125 // fast common case
126 while (len > 3) {
127 realcharlen = parse_utf8_char (here, end, &c);
128 output.push_back (c);
129 here += realcharlen;
130 len -= realcharlen;
131 }
132
133 utf8buf[utf8buflen++] = *here;
134 ++here;
135 --len;
136 charlen = getutf8charlen ();
137
138 } else if (utf8buflen < charlen) {
139 // assumes charlen is always less than MAXUTF8CHARLEN
140 utf8buf[utf8buflen++] = *here;
141 ++here;
142 --len;
143 }
144
145 if (utf8buflen == charlen) {
146 // got a complete character
147 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
148 output.push_back (c);
149
150 // move any unparsed characters. If an error occurred some of
151 // the characters might be unused.
152 int i;
153 int diff = utf8buflen - realcharlen;
154 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
155 utf8buflen = diff;
156 charlen = getutf8charlen ();
157 }
158 }
159
160 start = (char *)here; // save current position
161
162 if (utf8buflen == 0) status = finished;
163 else status = stopped;
164}
165
166
167// returns the length that the current contents of the
168// utf8buf should be
169size_t utf8inconvertclass::getutf8charlen () {
170 if (utf8buflen == 0) return 0;
171
172 // one byte character
173 if (utf8buf[0] < 0x80) return 1;
174
175 // error, is not the start of a utf-8 character
176 if (utf8buf[0] < 0xc0) return 1;
177
178 // two bute character
179 if (utf8buf[0] < 0xe0) return 2;
180
181 // three byte character
182 if (utf8buf[0] < 0xf0) return 3;
183
184 // error, character too long for unicode
185 return 1;
186}
187
188
189void utf8outconvertclass::reset () {
190 input = NULL;
191 outs = NULL;
192 utf8buflen = 0;
193 utf8bufhere = 0;
194}
195
196// note that convert does not null-terminate the
197// output array of characters
198void utf8outconvertclass::convert (char *output, size_t maxlen,
199 size_t &len, status_t &status) {
200 if (input == NULL || output == NULL) {
201 if (utf8buflen == 0) status = finished;
202 else status = unfinished;
203 return;
204 }
205
206 // don't want any funny sign conversions happening
207 unsigned char *uoutput = (unsigned char *)output;
208 text_t::iterator textend = input->end();
209 len = 0;
210 while (len < maxlen) {
211 // empty the contents of the internal buffer
212 if (utf8buflen > 0) {
213 while (len < maxlen && utf8bufhere < utf8buflen) {
214 *uoutput = utf8buf[utf8bufhere];
215 uoutput++;
216 len++;
217 utf8bufhere++;
218 }
219
220 if (utf8bufhere == utf8buflen) {
221 utf8bufhere = 0;
222 utf8buflen = 0;
223 }
224 }
225
226 // fill up the buffer with the next character
227 if (utf8buflen == 0) {
228 if (texthere == textend) break; // finished!
229 if (!rzws || (*texthere != 0x200b))
230 utf8buflen = output_utf8_char (*texthere, utf8buf,
231 &utf8buf[MAXUTF8CHARLEN-1]);
232 texthere++;
233 utf8bufhere = 0;
234 }
235 }
236
237 if (texthere == textend && utf8buflen == 0) status = finished;
238 else status = unfinished;
239}
240
241
242
243
244
245
246mapdata_t::mapdata_t () {
247 int i;
248
249 // reset all the map ptrs to be NULL
250 for (i=0; i<256; i++) {
251 ptrs[i] = (unsigned short *)NULL;
252 }
253
254 // say nothing has been loaded
255 loaded = false;
256}
257
258
259mapconvert::mapconvert () {
260 absentc = 0;
261}
262
263// setmapfile will cause loadmapfile to be called when conversion is
264// needed
265bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
266 // check to see if the mapfile has been already loaded
267 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
268
269 unloadmapfile ();
270 mapfile = themapfile;
271 absentc = theabsentc;
272
273 return true;
274}
275
276
277
278// loadmapfile should be called before any conversion is done
279bool mapconvert::loadmapfile (const text_t &themapfile,
280 unsigned short theabsentc) {
281 FILE *mapfilein = (FILE *)NULL;
282
283 // check to see if the mapfile has been already loaded
284 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
285
286 unloadmapfile ();
287 mapfile = themapfile;
288 absentc = theabsentc;
289
290 // open the map file
291 char *cfilename = mapfile.getcstr();
292 if (cfilename == (char *)NULL) return false;
293 mapfilein = fopen(cfilename, "rb");
294 delete cfilename;
295
296 if (mapfilein == (FILE *)NULL) return false;
297
298 unsigned char c, n1, n2;
299 unsigned short *arrptr;
300 int i;
301 c = fgetc (mapfilein);
302 while (!feof (mapfilein)) {
303 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
304 // allocate a new array
305 arrptr = new unsigned short[256];
306 mapdata.ptrs[c] = arrptr;
307 } else arrptr = mapdata.ptrs[c];
308
309 // clear the array
310 for (i=0; i<256; i++) arrptr[i] = 0;
311
312 // read in this block
313 n1 = fgetc (mapfilein);
314 n2 = fgetc (mapfilein);
315 i=0;
316 while (!feof (mapfilein)) {
317 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
318
319 i++;
320 if (i >= 256) break;
321 n1 = fgetc (mapfilein);
322 n2 = fgetc (mapfilein);
323 }
324
325 c = fgetc (mapfilein);
326 }
327
328 mapdata.loaded = true;
329
330 return true;
331}
332
333void mapconvert::unloadmapfile () {
334 if (!mapdata.loaded) return;
335
336 int i;
337 for (i=0; i<256; i++) {
338 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
339 delete [] mapdata.ptrs[i];
340 mapdata.ptrs[i] = (unsigned short *)NULL;
341 }
342 }
343
344 mapdata.loaded = false;
345}
346
347
348unsigned short mapconvert::convert (unsigned short c) {
349 if (!mapdata.loaded) {
350 if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
351 // do nothing, successfully loaded database
352 } else return absentc;
353 }
354
355 if (c == 0) return 0; // 0 always maps to 0...
356
357 unsigned short n1 = c >> 8;
358 unsigned short n2 = c & 0xff;
359
360 unsigned short *arrptr = mapdata.ptrs[n1];
361 if (arrptr == (unsigned short *)NULL) return absentc;
362
363 if (arrptr[n2] == 0) return absentc;
364 return arrptr[n2];
365}
366
367text_t mapconvert::convert (const text_t &instr) {
368 if (!mapdata.loaded) return absentc;
369
370 text_t outstr;
371 text_t::const_iterator here = instr.begin();
372 text_t::const_iterator end = instr.end();
373
374 while (here != end) {
375 outstr.push_back(this->convert(*here));
376 here++;
377 }
378
379 return outstr;
380}
381
382
383
384
385mapinconvertclass::mapinconvertclass () {
386 multibyte = 0;
387 mapbuflen = 0;
388}
389
390void mapinconvertclass::reset () {
391 start = NULL;
392 len = 0;
393 mapbuflen=0;
394}
395
396void mapinconvertclass::convert (text_t &output, status_t &status) {
397 output.clear();
398
399 if (start == NULL || len == 0) {
400 if (mapbuflen == 0) status = finished;
401 else status = stopped;
402 return;
403 }
404
405 // don't want any funny sign conversions happening
406 unsigned char *here = (unsigned char *)start;
407
408 size_t charlen = getmapcharlen ();
409 while (len > 0) {
410 if (charlen == 0) {
411 // start parsing a new character
412 mapbuflen = 0;
413 mapbuf[mapbuflen++] = *here;
414 ++here;
415 --len;
416 charlen = getmapcharlen ();
417
418 } else if (mapbuflen < charlen) {
419 // assumes charlen is always less than MAXMAPCHARLEN
420 mapbuf[mapbuflen++] = *here;
421 ++here;
422 --len;
423 }
424
425 if (mapbuflen == charlen) {
426 // got a complete character
427 if (charlen == 1) {
428 if (mapbuf[0] < 0x80) {
429 // ascii character
430 output.push_back (mapbuf[0]);
431 } else {
432 output.push_back (converter.convert((unsigned short)mapbuf[0]));
433 }
434
435 } else {
436 // two byte character
437 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
438 (unsigned short)mapbuf[1]));
439 }
440
441 mapbuflen = 0;
442 charlen = 0;
443 }
444 }
445
446 start = (char *)here; // save current position
447
448 if (mapbuflen == 0) status = finished;
449 else status = stopped;
450}
451
452
453
454mapoutconvertclass::mapoutconvertclass () {
455 multibyte = 0;
456 mapbuflen=0;
457 mapbufhere=0;
458}
459
460void mapoutconvertclass::reset () {
461 input = NULL;
462 outs = NULL;
463 mapbuflen = 0;
464 mapbufhere = 0;
465}
466
467// note that convert does not null-terminate the
468// output array of characters
469void mapoutconvertclass::convert (char *output, size_t maxlen,
470 size_t &len, status_t &status) {
471 unsigned short outc;
472
473 if (input == NULL || output == NULL) {
474 if (mapbuflen == 0) status = finished;
475 else status = unfinished;
476 return;
477 }
478
479 // don't want any funny sign conversions happening
480 unsigned char *uoutput = (unsigned char *)output;
481 text_t::iterator textend = input->end();
482 len = 0;
483 while (len < maxlen) {
484 // empty the contents of the internal buffer
485 if (mapbuflen > 0) {
486 while (len < maxlen && mapbufhere < mapbuflen) {
487 *uoutput = mapbuf[mapbufhere];
488 uoutput++;
489 len++;
490 mapbufhere++;
491 }
492
493 if (mapbufhere == mapbuflen) {
494 mapbufhere = 0;
495 mapbuflen = 0;
496 }
497 }
498
499 // fill up the buffer with the next character
500 if (mapbuflen == 0) {
501 if (texthere == textend) break; // finished!
502 if (!rzws || (*texthere != 0x200b)) {
503 if (*texthere < 0x80) {
504 mapbuf[0] = (unsigned char)*texthere;
505 mapbuflen = 1;
506 } else {
507 outc = converter.convert (*texthere);
508 if (multibyte) {
509 mapbuf[0] = (unsigned char)(outc >> 8);
510 mapbuf[1] = (unsigned char)(outc & 0xff);
511 mapbuflen = 2;
512 } else {
513 mapbuf[0] = outc;
514 mapbuflen = 1;
515 }
516 }
517 }
518
519 texthere++;
520 mapbufhere = 0;
521 }
522 }
523
524 if (texthere == textend && mapbuflen == 0) status = finished;
525 else status = unfinished;
526}
527
528
529bool simplemapconvert::loadmapfile (bool in) {
530 if (loaded) return true;
531 if (mapfile.empty()) return false;
532
533 char *cfilename = mapfile.getcstr();
534#ifdef GSDL_USE_IOS_H
535 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
536#else
537 ifstream mapfilein (cfilename, ios::in);
538#endif
539 delete cfilename;
540 if (!mapfilein) return false;
541
542 char cline[2048];
543 text_t line;
544
545 while (!mapfilein.eof()) {
546 mapfilein.getline (cline, 2048);
547 line.clear();
548 line.appendcstr (cline);
549 if (line.empty()) continue;
550 // remove comments
551 text_t::iterator end = line.end();
552 text_t::iterator here = findchar (line.begin(), end, '#');
553 if (here != end) {
554 line.erase (here, end);
555 if (line.empty()) continue;
556 }
557
558 text_tarray parts;
559 splitchar (line.begin(), line.end(), '\t', parts);
560
561 // do some simple sanity checks
562 if (parts.size() < 2) continue;
563 text_t::iterator begin1 = parts[0].begin();
564 text_t::iterator begin2 = parts[1].begin();
565 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
566 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
567 char *from = parts[0].getcstr();
568 char *to = parts[1].getcstr();
569 unsigned int f = 0, t = 0;
570 sscanf (from, "%i", &f);
571 sscanf (to, "%i", &t);
572 delete from;
573 delete to;
574
575 if (in) mapping[(unsigned short)f] = (unsigned short)t;
576 else mapping[(unsigned short)t] = (unsigned short)f;
577 }
578
579 loaded = true;
580 return true;
581}
582
583unsigned short simplemapconvert::convert (unsigned short c, bool in) {
584
585 if (!loaded)
586 if (!loadmapfile(in)) return absentc;
587
588 return mapping[c];
589}
590
591
592void simplemapinconvertclass::convert (text_t &output, status_t &status) {
593 output.clear();
594
595 if (start == NULL || len == 0) {
596 status = finished;
597 return;
598 }
599
600 // don't want any funny sign conversions happening
601 unsigned char *here = (unsigned char *)start;
602 while (len > 0) {
603
604 if (*here < 0x80)
605 output.push_back (*here); // append this character
606 else
607 output.push_back (converter.convert(*here, true));
608
609 ++here;
610 --len;
611 }
612
613 start = (char *)here; // save current position
614 status = finished;
615}
616
617
618void simplemapoutconvertclass::convert (char *output, size_t maxlen,
619 size_t &len, status_t &status) {
620
621 if (input == NULL || output == NULL) {
622 status = finished;
623 return;
624 }
625
626 // don't want any funny sign conversions happening
627 unsigned char *uoutput = (unsigned char *)output;
628 text_t::iterator textend = input->end();
629 len = 0;
630 while ((len < maxlen) && (texthere != textend)) {
631
632 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
633 else *uoutput = converter.convert (*texthere, false);
634
635 ++uoutput;
636 ++len;
637 ++texthere;
638 }
639
640 if (texthere == textend) status = finished;
641 else status = unfinished;
642}
Note: See TracBrowser for help on using the repository browser.