source: trunk/gsdl/lib/gsdlunicode.cpp@ 1817

Last change on this file since 1817 was 1310, checked in by sjboddie, 24 years ago

Removed CVS logging information from source files

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.9 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
29// unitool is currently in mg, if mg is not being used it should
30// be moved into GSDLHOME/lib
31#include "unitool.h"
32
33#include "fileutil.h"
34
35#include <stdio.h>
36
37#if defined(GSDL_USE_OBJECTSPACE)
38# include <ospace\std\iostream>
39# include <ospace\std\fstream>
40#elif defined(GSDL_USE_IOS_H)
41# include <iostream.h>
42# include <fstream.h>
43#else
44# include <iostream>
45# include <fstream>
46#endif
47
48
49// converts a unicode encode text_t string to a utf-8
50// encoded text_t string
51text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
52 text_t out;
53
54 unsigned char thischar[MAXUTF8CHARLEN];
55 int i, charlen;
56
57 while (here != end) {
58 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
59 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
60 here++;
61 }
62
63 return out;
64}
65
66// converts a utf-8 encoded text_t string to a unicode
67// encoded text_t string
68text_t to_uni (const text_t &in) {
69 text_t out;
70 unsigned char *in_cstr = (unsigned char *)in.getcstr();
71 unsigned char *here = in_cstr;
72 unsigned char *end = in_cstr;
73
74 unsigned short unichar;
75 int charlen = 0;
76
77 // get the last valid character in the string
78 while (*end != '\0') end++;
79 end--;
80
81 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
82 out.push_back(unichar);
83 here += charlen;
84 }
85
86 delete in_cstr;
87
88 return out;
89}
90
91
92
93utf8inconvertclass::utf8inconvertclass () {
94 utf8buflen = 0;
95}
96
97void utf8inconvertclass::reset () {
98 start = NULL;
99 len = 0;
100 utf8buflen=0;
101}
102
103void utf8inconvertclass::convert (text_t &output, status_t &status) {
104 output.clear();
105 output.reserve (len/3);
106
107 if (start == NULL || len == 0) {
108 if (utf8buflen == 0) status = finished;
109 else status = stopped;
110 return;
111 }
112
113 // don't want any funny sign conversions happening
114 unsigned char *here = (unsigned char *)start;
115 unsigned char *end = here+len-1;
116 unsigned short c;
117 size_t realcharlen;
118
119 size_t charlen = getutf8charlen ();
120 while (len > 0) {
121 if (charlen == 0) {
122 // start parsing a new character
123 utf8buflen = 0;
124
125 // fast common case
126 while (len > 3) {
127 realcharlen = parse_utf8_char (here, end, &c);
128 output.push_back (c);
129 here += realcharlen;
130 len -= realcharlen;
131 }
132
133 utf8buf[utf8buflen++] = *here;
134 ++here;
135 --len;
136 charlen = getutf8charlen ();
137
138 } else if (utf8buflen < charlen) {
139 // assumes charlen is always less than MAXUTF8CHARLEN
140 utf8buf[utf8buflen++] = *here;
141 ++here;
142 --len;
143 }
144
145 if (utf8buflen == charlen) {
146 // got a complete character
147 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
148 output.push_back (c);
149
150 // move any unparsed characters. If an error occurred some of
151 // the characters might be unused.
152 int i;
153 int diff = utf8buflen - realcharlen;
154 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
155 utf8buflen = diff;
156 charlen = getutf8charlen ();
157 }
158 }
159
160 start = (char *)here; // save current position
161
162 if (utf8buflen == 0) status = finished;
163 else status = stopped;
164}
165
166
167// returns the length that the current contents of the
168// utf8buf should be
169size_t utf8inconvertclass::getutf8charlen () {
170 if (utf8buflen == 0) return 0;
171
172 // one byte character
173 if (utf8buf[0] < 0x80) return 1;
174
175 // error, is not the start of a utf-8 character
176 if (utf8buf[0] < 0xc0) return 1;
177
178 // two bute character
179 if (utf8buf[0] < 0xe0) return 2;
180
181 // three byte character
182 if (utf8buf[0] < 0xf0) return 3;
183
184 // error, character too long for unicode
185 return 1;
186}
187
188
189void utf8outconvertclass::reset () {
190 input = NULL;
191 outs = NULL;
192 utf8buflen = 0;
193 utf8bufhere = 0;
194}
195
196// note that convert does not null-terminate the
197// output array of characters
198void utf8outconvertclass::convert (char *output, size_t maxlen,
199 size_t &len, status_t &status) {
200 if (input == NULL || output == NULL) {
201 if (utf8buflen == 0) status = finished;
202 else status = unfinished;
203 return;
204 }
205
206 // don't want any funny sign conversions happening
207 unsigned char *uoutput = (unsigned char *)output;
208 text_t::iterator textend = input->end();
209 len = 0;
210 while (len < maxlen) {
211 // empty the contents of the internal buffer
212 if (utf8buflen > 0) {
213 while (len < maxlen && utf8bufhere < utf8buflen) {
214 *uoutput = utf8buf[utf8bufhere];
215 uoutput++;
216 len++;
217 utf8bufhere++;
218 }
219
220 if (utf8bufhere == utf8buflen) {
221 utf8bufhere = 0;
222 utf8buflen = 0;
223 }
224 }
225
226 // fill up the buffer with the next character
227 if (utf8buflen == 0) {
228 if (texthere == textend) break; // finished!
229 if (!rzws || (*texthere != 0x200b))
230 utf8buflen = output_utf8_char (*texthere, utf8buf,
231 &utf8buf[MAXUTF8CHARLEN-1]);
232 texthere++;
233 utf8bufhere = 0;
234 }
235 }
236
237 if (texthere == textend && utf8buflen == 0) status = finished;
238 else status = unfinished;
239}
240
241
242
243
244
245
246mapdata_t::mapdata_t () {
247 int i;
248
249 // reset all the map ptrs to be NULL
250 for (i=0; i<256; i++) {
251 ptrs[i] = (unsigned short *)NULL;
252 }
253
254 // say nothing has been loaded
255 loaded = false;
256}
257
258
259mapconvert::mapconvert () {
260 absentc = 0;
261}
262
263// setmapfile will cause loadmapfile to be called when conversion is
264// needed
265bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
266 unsigned short theabsentc) {
267 // check to see if the mapfile has been already loaded
268 if (mapdata.loaded && gsdlhome == thegsdlhome &&
269 encoding == theencoding && absentc == theabsentc)
270 return true;
271
272 unloadmapfile ();
273 gsdlhome = thegsdlhome;
274 encoding = theencoding;
275 absentc = theabsentc;
276
277 return true;
278}
279
280
281
282// loadmapfile should be called before any conversion is done
283bool mapconvert::loadmapfile (const text_t &thegsdlhome,
284 const text_t &theencoding,
285 unsigned short theabsentc) {
286 FILE *mapfilein = (FILE *)NULL;
287
288 // check to see if the mapfile has been already loaded
289 if (mapdata.loaded && gsdlhome == thegsdlhome &&
290 encoding == theencoding && absentc == theabsentc)
291 return true;
292
293 unloadmapfile ();
294 gsdlhome = thegsdlhome;
295 encoding = theencoding;
296 absentc = theabsentc;
297
298 // open the map file
299 text_t filename = filename_cat (gsdlhome, "unicode");
300 filename = filename_cat (filename, encoding);
301 filename += ".ump";
302 char *cfilename = filename.getcstr();
303 if (cfilename == (char *)NULL) return false;
304 mapfilein = fopen(cfilename, "rb");
305 delete cfilename;
306
307 if (mapfilein == (FILE *)NULL) return false;
308
309 unsigned char c, n1, n2;
310 unsigned short *arrptr;
311 int i;
312 c = fgetc (mapfilein);
313 while (!feof (mapfilein)) {
314 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
315 // allocate a new array
316 arrptr = new unsigned short[256];
317 mapdata.ptrs[c] = arrptr;
318 } else arrptr = mapdata.ptrs[c];
319
320 // clear the array
321 for (i=0; i<256; i++) arrptr[i] = 0;
322
323 // read in this block
324 n1 = fgetc (mapfilein);
325 n2 = fgetc (mapfilein);
326 i=0;
327 while (!feof (mapfilein)) {
328 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
329
330 i++;
331 if (i >= 256) break;
332 n1 = fgetc (mapfilein);
333 n2 = fgetc (mapfilein);
334 }
335
336 c = fgetc (mapfilein);
337 }
338
339 mapdata.loaded = true;
340
341 return true;
342}
343
344void mapconvert::unloadmapfile () {
345 if (!mapdata.loaded) return;
346
347 int i;
348 for (i=0; i<256; i++) {
349 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
350 delete [] mapdata.ptrs[i];
351 mapdata.ptrs[i] = (unsigned short *)NULL;
352 }
353 }
354
355 mapdata.loaded = false;
356}
357
358
359unsigned short mapconvert::convert (unsigned short c) {
360 if (!mapdata.loaded) {
361 if (!gsdlhome.empty() && !encoding.empty() &&
362 loadmapfile (gsdlhome, encoding, absentc)) {
363 // do nothing, successfully loaded database
364 } else return absentc;
365 }
366
367 if (c == 0) return 0; // 0 always maps to 0...
368
369 unsigned short n1 = c >> 8;
370 unsigned short n2 = c & 0xff;
371
372 unsigned short *arrptr = mapdata.ptrs[n1];
373 if (arrptr == (unsigned short *)NULL) return absentc;
374
375 if (arrptr[n2] == 0) return absentc;
376 return arrptr[n2];
377}
378
379text_t mapconvert::convert (const text_t &instr) {
380 if (!mapdata.loaded) return absentc;
381
382 text_t outstr;
383 text_t::const_iterator here = instr.begin();
384 text_t::const_iterator end = instr.end();
385
386 while (here != end) {
387 outstr.push_back(this->convert(*here));
388 here++;
389 }
390
391 return outstr;
392}
393
394
395
396
397mapinconvertclass::mapinconvertclass () {
398 mapbuflen = 0;
399}
400
401void mapinconvertclass::reset () {
402 start = NULL;
403 len = 0;
404 mapbuflen=0;
405}
406
407void mapinconvertclass::convert (text_t &output, status_t &status) {
408 output.clear();
409
410 if (start == NULL || len == 0) {
411 if (mapbuflen == 0) status = finished;
412 else status = stopped;
413 return;
414 }
415
416 // don't want any funny sign conversions happening
417 unsigned char *here = (unsigned char *)start;
418
419 size_t charlen = getmapcharlen ();
420 while (len > 0) {
421 if (charlen == 0) {
422 // start parsing a new character
423 mapbuflen = 0;
424 mapbuf[mapbuflen++] = *here;
425 ++here;
426 --len;
427 charlen = getmapcharlen ();
428
429 } else if (mapbuflen < charlen) {
430 // assumes charlen is always less than MAXMAPCHARLEN
431 mapbuf[mapbuflen++] = *here;
432 ++here;
433 --len;
434 }
435
436 if (mapbuflen == charlen) {
437 // got a complete character
438 if (charlen == 1) {
439 // ascii character
440 output.push_back (mapbuf[0]);
441
442 } else {
443 // two byte character
444 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
445 (unsigned short)mapbuf[1]));
446 }
447
448 mapbuflen = 0;
449 charlen = 0;
450 }
451 }
452
453 start = (char *)here; // save current position
454
455 if (mapbuflen == 0) status = finished;
456 else status = stopped;
457}
458
459
460
461mapoutconvertclass::mapoutconvertclass () {
462 mapbuflen=0;
463 mapbufhere=0;
464}
465
466void mapoutconvertclass::reset () {
467 input = NULL;
468 outs = NULL;
469 mapbuflen = 0;
470 mapbufhere = 0;
471}
472
473// note that convert does not null-terminate the
474// output array of characters
475void mapoutconvertclass::convert (char *output, size_t maxlen,
476 size_t &len, status_t &status) {
477 unsigned short outc;
478
479 if (input == NULL || output == NULL) {
480 if (mapbuflen == 0) status = finished;
481 else status = unfinished;
482 return;
483 }
484
485 // don't want any funny sign conversions happening
486 unsigned char *uoutput = (unsigned char *)output;
487 text_t::iterator textend = input->end();
488 len = 0;
489 while (len < maxlen) {
490 // empty the contents of the internal buffer
491 if (mapbuflen > 0) {
492 while (len < maxlen && mapbufhere < mapbuflen) {
493 *uoutput = mapbuf[mapbufhere];
494 uoutput++;
495 len++;
496 mapbufhere++;
497 }
498
499 if (mapbufhere == mapbuflen) {
500 mapbufhere = 0;
501 mapbuflen = 0;
502 }
503 }
504
505 // fill up the buffer with the next character
506 if (mapbuflen == 0) {
507 if (texthere == textend) break; // finished!
508 if (!rzws || (*texthere != 0x200b)) {
509 if (*texthere < 0x80) {
510 mapbuf[0] = (unsigned char)*texthere;
511 mapbuflen = 1;
512 } else {
513 outc = converter.convert (*texthere);
514 mapbuf[0] = (unsigned char)(outc >> 8);
515 mapbuf[1] = (unsigned char)(outc & 0xff);
516 mapbuflen = 2;
517 }
518 }
519
520 texthere++;
521 mapbufhere = 0;
522 }
523 }
524
525 if (texthere == textend && mapbuflen == 0) status = finished;
526 else status = unfinished;
527}
528
529
530bool simplemapconvert::loadmapfile (bool in) {
531 if (loaded) return true;
532 if (mapfile.empty()) return false;
533
534 char *cfilename = mapfile.getcstr();
535#ifdef GSDL_USE_IOS_H
536 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
537#else
538 ifstream mapfilein (cfilename, ios::in);
539#endif
540 delete cfilename;
541 if (!mapfilein) return false;
542
543 char cline[2048];
544 text_t line;
545
546 while (!mapfilein.eof()) {
547 mapfilein.getline (cline, 2048);
548 line.clear();
549 line.appendcstr (cline);
550 if (line.empty()) continue;
551 // remove comments
552 text_t::iterator end = line.end();
553 text_t::iterator here = findchar (line.begin(), end, '#');
554 if (here != end) {
555 line.erase (here, end);
556 if (line.empty()) continue;
557 }
558
559 text_tarray parts;
560 splitchar (line.begin(), line.end(), '\t', parts);
561
562 // do some simple sanity checks
563 if (parts.size() < 2) continue;
564 text_t::iterator begin1 = parts[0].begin();
565 text_t::iterator begin2 = parts[1].begin();
566 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
567 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
568 char *from = parts[0].getcstr();
569 char *to = parts[1].getcstr();
570 unsigned int f = 0, t = 0;
571 sscanf (from, "%i", &f);
572 sscanf (to, "%i", &t);
573 delete from;
574 delete to;
575
576 if (in) mapping[(unsigned short)f] = (unsigned short)t;
577 else mapping[(unsigned short)t] = (unsigned short)f;
578 }
579
580 loaded = true;
581 return true;
582}
583
584unsigned short simplemapconvert::convert (unsigned short c, bool in) {
585
586 if (!loaded)
587 if (!loadmapfile(in)) return absentc;
588
589 return mapping[c];
590}
591
592
593void simplemapinconvertclass::convert (text_t &output, status_t &status) {
594 output.clear();
595
596 if (start == NULL || len == 0) {
597 status = finished;
598 return;
599 }
600
601 // don't want any funny sign conversions happening
602 unsigned char *here = (unsigned char *)start;
603 while (len > 0) {
604
605 if (*here < 0x80)
606 output.push_back (*here); // append this character
607 else
608 output.push_back (converter.convert(*here, true));
609
610 ++here;
611 --len;
612 }
613
614 start = (char *)here; // save current position
615 status = finished;
616}
617
618
619void simplemapoutconvertclass::convert (char *output, size_t maxlen,
620 size_t &len, status_t &status) {
621
622 if (input == NULL || output == NULL) {
623 status = finished;
624 return;
625 }
626
627 // don't want any funny sign conversions happening
628 unsigned char *uoutput = (unsigned char *)output;
629 text_t::iterator textend = input->end();
630 len = 0;
631 while ((len < maxlen) && (texthere != textend)) {
632
633 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
634 else *uoutput = converter.convert (*texthere, false);
635
636 ++uoutput;
637 ++len;
638 ++texthere;
639 }
640
641 if (texthere == textend) status = finished;
642 else status = unfinished;
643}
Note: See TracBrowser for help on using the repository browser.