source: gsdl/trunk/lib/gsdlunicode.cpp@ 15679

Last change on this file since 15679 was 8727, checked in by kjdon, 19 years ago

added some changes made by Emanuel Dejanu (Simple Words)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.6 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
29// unitool is currently in mg, if mg is not being used it should
30// be moved into GSDLHOME/lib
31#include "unitool.h"
32
33#include "fileutil.h"
34
35#include <stdio.h>
36
37#if defined(GSDL_USE_OBJECTSPACE)
38# include <ospace\std\iostream>
39# include <ospace\std\fstream>
40#elif defined(GSDL_USE_IOS_H)
41# include <iostream.h>
42# include <fstream.h>
43#else
44# include <iostream>
45# include <fstream>
46#endif
47
48
49// converts a unicode encode text_t string to a utf-8
50// encoded text_t string
51text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
52 text_t out;
53
54 unsigned char thischar[MAXUTF8CHARLEN];
55 int i, charlen;
56
57 while (here != end) {
58 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
59 for (i=0; i<charlen; ++i) out.push_back(thischar[i]);
60 ++here;
61 }
62
63 return out;
64}
65
66// converts a utf-8 encoded text_t string to a unicode
67// encoded text_t string
68text_t to_uni (const text_t &in) {
69 text_t out;
70 unsigned char *in_cstr = (unsigned char *)in.getcstr();
71 unsigned char *here = in_cstr;
72 unsigned char *end = in_cstr;
73
74 unsigned short unichar;
75 int charlen = 0;
76
77 // get the last valid character in the string
78 while (*end != '\0') ++end;
79 --end;
80
81 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
82 out.push_back(unichar);
83 here += charlen;
84 }
85
86 delete []in_cstr;
87
88 return out;
89}
90
91
92// this works for all unicode values < 65536...
93void utf16outconvertclass::convert (char *out, size_t maxlen, size_t &len, status_t &status) {
94 // we should already have text_t* input set...
95 if (input == NULL || out == NULL)
96 {
97 status = finished;
98 return;
99 }
100 unsigned char *output = (unsigned char *)out;
101 text_t::iterator textend = input->end();
102 len = 0;
103 if (maxlen % 2) --maxlen; // we need an even number of output bytes...
104 while ((len < maxlen) && (texthere != textend)) {
105 unsigned short int uni_char=(unsigned short int) *texthere;
106 // big endian utf-16...
107 if (uni_char < 256) {
108 out[len]=0;
109 out[len+1]=uni_char;
110 } else {
111 out[len]=uni_char >> 8;
112 out[len+1]=uni_char & 255;
113 }
114 len+=2;
115 ++texthere;
116 }
117 if (texthere==textend)
118 status=finished;
119 else
120 status=unfinished;
121}
122
123
124utf8inconvertclass::utf8inconvertclass () {
125 utf8buflen = 0;
126}
127
128void utf8inconvertclass::reset () {
129 start = NULL;
130 len = 0;
131 utf8buflen=0;
132}
133
134void utf8inconvertclass::convert (text_t &output, status_t &status) {
135 output.clear();
136 output.reserve (len/3);
137
138 if (start == NULL || len == 0) {
139 if (utf8buflen == 0) status = finished;
140 else status = stopped;
141 return;
142 }
143
144 // don't want any funny sign conversions happening
145 unsigned char *here = (unsigned char *)start;
146 unsigned char *end = here+len-1;
147 unsigned short c;
148 size_t realcharlen;
149
150 size_t charlen = getutf8charlen ();
151 while (len > 0) {
152 if (charlen == 0) {
153 // start parsing a new character
154 utf8buflen = 0;
155
156 // fast common case
157 while (len > 3) {
158 realcharlen = parse_utf8_char (here, end, &c);
159 output.push_back (c);
160 here += realcharlen;
161 len -= realcharlen;
162 }
163
164 utf8buf[utf8buflen++] = *here;
165 ++here;
166 --len;
167 charlen = getutf8charlen ();
168
169 } else if (utf8buflen < charlen) {
170 // assumes charlen is always less than MAXUTF8CHARLEN
171 utf8buf[utf8buflen++] = *here;
172 ++here;
173 --len;
174 }
175
176 if (utf8buflen == charlen) {
177 // got a complete character
178 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
179 output.push_back (c);
180
181 // move any unparsed characters. If an error occurred some of
182 // the characters might be unused.
183 int i;
184 int diff = utf8buflen - realcharlen;
185 for (i=0; i < diff; ++i) utf8buf[i] = utf8buf[i+diff];
186 utf8buflen = diff;
187 charlen = getutf8charlen ();
188 }
189 }
190
191 start = (char *)here; // save current position
192
193 if (utf8buflen == 0) status = finished;
194 else status = stopped;
195}
196
197
198// returns the length that the current contents of the
199// utf8buf should be
200size_t utf8inconvertclass::getutf8charlen () {
201 if (utf8buflen == 0) return 0;
202
203 // one byte character
204 if (utf8buf[0] < 0x80) return 1;
205
206 // error, is not the start of a utf-8 character
207 if (utf8buf[0] < 0xc0) return 1;
208
209 // two bute character
210 if (utf8buf[0] < 0xe0) return 2;
211
212 // three byte character
213 if (utf8buf[0] < 0xf0) return 3;
214
215 // error, character too long for unicode
216 return 1;
217}
218
219
220void utf8outconvertclass::reset () {
221 input = NULL;
222 outs = NULL;
223 utf8buflen = 0;
224 utf8bufhere = 0;
225}
226
227// note that convert does not null-terminate the
228// output array of characters
229void utf8outconvertclass::convert (char *output, size_t maxlen,
230 size_t &len, status_t &status) {
231 if (input == NULL || output == NULL) {
232 if (utf8buflen == 0) status = finished;
233 else status = unfinished;
234 return;
235 }
236
237 // don't want any funny sign conversions happening
238 unsigned char *uoutput = (unsigned char *)output;
239 text_t::iterator textend = input->end();
240 len = 0;
241 while (len < maxlen) {
242 // empty the contents of the internal buffer
243 if (utf8buflen > 0) {
244 while (len < maxlen && utf8bufhere < utf8buflen) {
245 *uoutput = utf8buf[utf8bufhere];
246 ++uoutput;
247 ++len;
248 ++utf8bufhere;
249 }
250
251 if (utf8bufhere == utf8buflen) {
252 utf8bufhere = 0;
253 utf8buflen = 0;
254 }
255 }
256
257 // fill up the buffer with the next character
258 if (utf8buflen == 0) {
259 if (texthere == textend) break; // finished!
260 if (!rzws || (*texthere != 0x200b))
261 utf8buflen = output_utf8_char (*texthere, utf8buf,
262 &utf8buf[MAXUTF8CHARLEN-1]);
263 ++texthere;
264 utf8bufhere = 0;
265 }
266 }
267
268 if (texthere == textend && utf8buflen == 0) status = finished;
269 else status = unfinished;
270}
271
272
273
274
275
276
277mapdata_t::mapdata_t () {
278
279 // reset all the map ptrs to be NULL
280 for (int i=0; i<256; ++i) {
281 ptrs[i] = (unsigned short *)NULL;
282 }
283
284 // say nothing has been loaded
285 loaded = false;
286}
287
288
289mapconvert::mapconvert () {
290 absentc = 0;
291}
292
293// setmapfile will cause loadmapfile to be called when conversion is
294// needed
295bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
296 // check to see if the mapfile has been already loaded
297 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
298
299 unloadmapfile ();
300 mapfile = themapfile;
301 absentc = theabsentc;
302
303 return true;
304}
305
306
307
308// loadmapfile should be called before any conversion is done
309bool mapconvert::loadmapfile (const text_t &themapfile,
310 unsigned short theabsentc) {
311 FILE *mapfilein = (FILE *)NULL;
312
313 // check to see if the mapfile has been already loaded
314 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
315
316 unloadmapfile ();
317 mapfile = themapfile;
318 absentc = theabsentc;
319
320 // open the map file
321 char *cfilename = mapfile.getcstr();
322 if (cfilename == (char *)NULL) return false;
323 mapfilein = fopen(cfilename, "rb");
324 delete []cfilename; cfilename = NULL;
325
326 if (mapfilein == (FILE *)NULL) return false;
327
328 unsigned char c, n1, n2;
329 unsigned short *arrptr;
330 int i;
331 c = fgetc (mapfilein);
332 while (!feof (mapfilein)) {
333 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
334 // allocate a new array
335 arrptr = new unsigned short[256];
336 mapdata.ptrs[c] = arrptr;
337 } else arrptr = mapdata.ptrs[c];
338
339 // clear the array
340 for (i=0; i<256; ++i) arrptr[i] = 0;
341
342 // read in this block
343 n1 = fgetc (mapfilein);
344 n2 = fgetc (mapfilein);
345 i=0;
346 while (!feof (mapfilein)) {
347 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
348
349 ++i;
350 if (i >= 256) break;
351 n1 = fgetc (mapfilein);
352 n2 = fgetc (mapfilein);
353 }
354
355 c = fgetc (mapfilein);
356 }
357
358 mapdata.loaded = true;
359
360 return true;
361}
362
363void mapconvert::unloadmapfile () {
364 if (!mapdata.loaded) return;
365
366 for (int i=0; i<256; ++i) {
367 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
368 delete [] mapdata.ptrs[i];
369 mapdata.ptrs[i] = (unsigned short *)NULL;
370 }
371 }
372
373 mapdata.loaded = false;
374}
375
376
377unsigned short mapconvert::convert (unsigned short c) {
378 if (!mapdata.loaded) {
379 if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
380 // do nothing, successfully loaded database
381 } else return absentc;
382 }
383
384 if (c == 0) return 0; // 0 always maps to 0...
385
386 unsigned short n1 = c >> 8;
387 unsigned short n2 = c & 0xff;
388
389 unsigned short *arrptr = mapdata.ptrs[n1];
390 if (arrptr == (unsigned short *)NULL) return absentc;
391
392 if (arrptr[n2] == 0) return absentc;
393 return arrptr[n2];
394}
395
396text_t mapconvert::convert (const text_t &instr) {
397 if (!mapdata.loaded) return absentc;
398
399 text_t outstr;
400 text_t::const_iterator here = instr.begin();
401 text_t::const_iterator end = instr.end();
402
403 while (here != end) {
404 outstr.push_back(this->convert(*here));
405 ++here;
406 }
407
408 return outstr;
409}
410
411
412
413
414mapinconvertclass::mapinconvertclass () {
415 m_multibyte = 0;
416 mapbuflen = 0;
417}
418
419void mapinconvertclass::reset () {
420 start = NULL;
421 len = 0;
422 mapbuflen=0;
423}
424
425void mapinconvertclass::convert (text_t &output, status_t &status) {
426 output.clear();
427
428 if (start == NULL || len == 0) {
429 if (mapbuflen == 0) status = finished;
430 else status = stopped;
431 return;
432 }
433
434 // don't want any funny sign conversions happening
435 unsigned char *here = (unsigned char *)start;
436
437 size_t charlen = getmapcharlen ();
438 while (len > 0) {
439 if (charlen == 0) {
440 // start parsing a new character
441 mapbuflen = 0;
442 mapbuf[mapbuflen++] = *here;
443 ++here;
444 --len;
445 charlen = getmapcharlen ();
446
447 } else if (mapbuflen < charlen) {
448 // assumes charlen is always less than MAXMAPCHARLEN
449 mapbuf[mapbuflen++] = *here;
450 ++here;
451 --len;
452 }
453
454 if (mapbuflen == charlen) {
455 // got a complete character
456 if (charlen == 1) {
457 if (mapbuf[0] < 0x80) {
458 // ascii character
459 output.push_back (mapbuf[0]);
460 } else {
461 output.push_back (converter.convert((unsigned short)mapbuf[0]));
462 }
463
464 } else {
465 // two byte character
466 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
467 (unsigned short)mapbuf[1]));
468 }
469
470 mapbuflen = 0;
471 charlen = 0;
472 }
473 }
474
475 start = (char *)here; // save current position
476
477 if (mapbuflen == 0) status = finished;
478 else status = stopped;
479}
480
481
482
483mapoutconvertclass::mapoutconvertclass () {
484 m_multibyte = 0;
485 mapbuflen=0;
486 mapbufhere=0;
487}
488
489void mapoutconvertclass::reset () {
490 input = NULL;
491 outs = NULL;
492 mapbuflen = 0;
493 mapbufhere = 0;
494}
495
496// note that convert does not null-terminate the
497// output array of characters
498void mapoutconvertclass::convert (char *output, size_t maxlen,
499 size_t &len, status_t &status) {
500 unsigned short outc;
501
502 if (input == NULL || output == NULL) {
503 if (mapbuflen == 0) status = finished;
504 else status = unfinished;
505 return;
506 }
507
508 // don't want any funny sign conversions happening
509 unsigned char *uoutput = (unsigned char *)output;
510 text_t::iterator textend = input->end();
511 len = 0;
512 while (len < maxlen) {
513 // empty the contents of the internal buffer
514 if (mapbuflen > 0) {
515 while (len < maxlen && mapbufhere < mapbuflen) {
516 *uoutput = mapbuf[mapbufhere];
517 ++uoutput;
518 ++len;
519 ++mapbufhere;
520 }
521
522 if (mapbufhere == mapbuflen) {
523 mapbufhere = 0;
524 mapbuflen = 0;
525 }
526 }
527
528 // fill up the buffer with the next character
529 if (mapbuflen == 0) {
530 if (texthere == textend) break; // finished!
531 if (!rzws || (*texthere != 0x200b)) {
532 if (*texthere < 0x80) {
533 mapbuf[0] = (unsigned char)*texthere;
534 mapbuflen = 1;
535 } else {
536 outc = converter.convert (*texthere);
537 if (m_multibyte) {
538 mapbuf[0] = (unsigned char)(outc >> 8);
539 mapbuf[1] = (unsigned char)(outc & 0xff);
540 mapbuflen = 2;
541 } else {
542 mapbuf[0] = outc;
543 mapbuflen = 1;
544 }
545 }
546 }
547
548 ++texthere;
549 mapbufhere = 0;
550 }
551 }
552
553 if (texthere == textend && mapbuflen == 0) status = finished;
554 else status = unfinished;
555}
556
557
558bool simplemapconvert::loadmapfile (bool in) {
559 if (loaded) return true;
560 if (mapfile.empty()) return false;
561
562 char *cfilename = mapfile.getcstr();
563#ifdef GSDL_USE_IOS_H
564 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
565#else
566 ifstream mapfilein (cfilename, ios::in);
567#endif
568 delete []cfilename;
569 if (!mapfilein) return false;
570
571 char cline[2048];
572 text_t line;
573
574 while (!mapfilein.eof()) {
575 mapfilein.getline (cline, 2048);
576 line.clear();
577 line.appendcstr (cline);
578 if (line.empty()) continue;
579 // remove comments
580 text_t::iterator end = line.end();
581 text_t::iterator here = findchar (line.begin(), end, '#');
582 if (here != end) {
583 line.erase (here, end);
584 if (line.empty()) continue;
585 }
586
587 text_tarray parts;
588 splitchar (line.begin(), line.end(), '\t', parts);
589
590 // do some simple sanity checks
591 if (parts.size() < 2) continue;
592 text_t::iterator begin1 = parts[0].begin();
593 text_t::iterator begin2 = parts[1].begin();
594 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
595 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
596 char *from = parts[0].getcstr();
597 char *to = parts[1].getcstr();
598 unsigned int f = 0, t = 0;
599 sscanf (from, "%i", &f);
600 sscanf (to, "%i", &t);
601 delete []from;
602 delete []to;
603
604 if (in) mapping[(unsigned short)f] = (unsigned short)t;
605 else mapping[(unsigned short)t] = (unsigned short)f;
606 }
607
608 loaded = true;
609 return true;
610}
611
612unsigned short simplemapconvert::convert (unsigned short c, bool in) {
613
614 if (!loaded)
615 if (!loadmapfile(in)) return absentc;
616
617 return mapping[c];
618}
619
620
621void simplemapinconvertclass::convert (text_t &output, status_t &status) {
622 output.clear();
623
624 if (start == NULL || len == 0) {
625 status = finished;
626 return;
627 }
628
629 // don't want any funny sign conversions happening
630 unsigned char *here = (unsigned char *)start;
631 while (len > 0) {
632
633 if (*here < 0x80)
634 output.push_back (*here); // append this character
635 else
636 output.push_back (converter.convert(*here, true));
637
638 ++here;
639 --len;
640 }
641
642 start = (char *)here; // save current position
643 status = finished;
644}
645
646
647void simplemapoutconvertclass::convert (char *output, size_t maxlen,
648 size_t &len, status_t &status) {
649
650 if (input == NULL || output == NULL) {
651 status = finished;
652 return;
653 }
654
655 // don't want any funny sign conversions happening
656 unsigned char *uoutput = (unsigned char *)output;
657 text_t::iterator textend = input->end();
658 len = 0;
659 while ((len < maxlen) && (texthere != textend)) {
660
661 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
662 else *uoutput = converter.convert (*texthere, false);
663
664 ++uoutput;
665 ++len;
666 ++texthere;
667 }
668
669 if (texthere == textend) status = finished;
670 else status = unfinished;
671}
Note: See TracBrowser for help on using the repository browser.