source: main/trunk/greenstone2/common-src/src/lib/gsdlunicode.cpp@ 24162

Last change on this file since 24162 was 22141, checked in by davidb, 14 years ago

Was surprised to discover some classes that did not correctly specify virtual on their destructors, even though virtual was being used on other methods in the class, or else through inheritance. Now fixed up.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.7 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
29// unitool is currently in mg, if mg is not being used it should
30// be moved into GSDLHOME/lib
31// A copy of mgpp's unitool has now been moved into common-src/src/lib/
32#include "unitool.h"
33
34#include "fileutil.h"
35
36#include <stdio.h>
37
38#if defined(GSDL_USE_OBJECTSPACE)
39# include <ospace\std\iostream>
40# include <ospace\std\fstream>
41#elif defined(GSDL_USE_IOS_H)
42# include <iostream.h>
43# include <fstream.h>
44#else
45# include <iostream>
46# include <fstream>
47#endif
48
49
50// converts a unicode encode text_t string to a utf-8
51// encoded text_t string
52text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
53 text_t out;
54
55 unsigned char thischar[MAXUTF8CHARLEN];
56 int i, charlen;
57
58 while (here != end) {
59 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
60 for (i=0; i<charlen; ++i) out.push_back(thischar[i]);
61 ++here;
62 }
63
64 return out;
65}
66
67// converts a utf-8 encoded text_t string to a unicode
68// encoded text_t string
69text_t to_uni (const text_t &in) {
70 text_t out;
71 unsigned char *in_cstr = (unsigned char *)in.getcstr();
72 unsigned char *here = in_cstr;
73 unsigned char *end = in_cstr;
74
75 unsigned short unichar;
76 int charlen = 0;
77
78 // get the last valid character in the string
79 while (*end != '\0') ++end;
80 --end;
81
82 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
83 out.push_back(unichar);
84 here += charlen;
85 }
86
87 delete []in_cstr;
88
89 return out;
90}
91
92
93// this works for all unicode values < 65536...
94void utf16outconvertclass::convert (char *out, size_t maxlen, size_t &len, status_t &status) {
95 // we should already have text_t* input set...
96 if (input == NULL || out == NULL)
97 {
98 status = finished;
99 return;
100 }
101 unsigned char *output = (unsigned char *)out;
102 text_t::iterator textend = input->end();
103 len = 0;
104 if (maxlen % 2) --maxlen; // we need an even number of output bytes...
105 while ((len < maxlen) && (texthere != textend)) {
106 unsigned short int uni_char=(unsigned short int) *texthere;
107 // big endian utf-16...
108 if (uni_char < 256) {
109 out[len]=0;
110 out[len+1]=uni_char;
111 } else {
112 out[len]=uni_char >> 8;
113 out[len+1]=uni_char & 255;
114 }
115 len+=2;
116 ++texthere;
117 }
118 if (texthere==textend)
119 status=finished;
120 else
121 status=unfinished;
122}
123
124
125utf8inconvertclass::utf8inconvertclass () {
126 utf8buflen = 0;
127}
128
129utf8inconvertclass::~utf8inconvertclass () {
130 // nothing to do
131}
132
133void utf8inconvertclass::reset () {
134 start = NULL;
135 len = 0;
136 utf8buflen=0;
137}
138
139void utf8inconvertclass::convert (text_t &output, status_t &status) {
140 output.clear();
141 output.reserve (len/3);
142
143 if (start == NULL || len == 0) {
144 if (utf8buflen == 0) status = finished;
145 else status = stopped;
146 return;
147 }
148
149 // don't want any funny sign conversions happening
150 unsigned char *here = (unsigned char *)start;
151 unsigned char *end = here+len-1;
152 unsigned short c;
153 size_t realcharlen;
154
155 size_t charlen = getutf8charlen ();
156 while (len > 0) {
157 if (charlen == 0) {
158 // start parsing a new character
159 utf8buflen = 0;
160
161 // fast common case
162 while (len > 3) {
163 realcharlen = parse_utf8_char (here, end, &c);
164 output.push_back (c);
165 here += realcharlen;
166 len -= realcharlen;
167 }
168
169 utf8buf[utf8buflen++] = *here;
170 ++here;
171 --len;
172 charlen = getutf8charlen ();
173
174 } else if (utf8buflen < charlen) {
175 // assumes charlen is always less than MAXUTF8CHARLEN
176 utf8buf[utf8buflen++] = *here;
177 ++here;
178 --len;
179 }
180
181 if (utf8buflen == charlen) {
182 // got a complete character
183 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
184 output.push_back (c);
185
186 // move any unparsed characters. If an error occurred some of
187 // the characters might be unused.
188 int i;
189 int diff = utf8buflen - realcharlen;
190 for (i=0; i < diff; ++i) utf8buf[i] = utf8buf[i+diff];
191 utf8buflen = diff;
192 charlen = getutf8charlen ();
193 }
194 }
195
196 start = (char *)here; // save current position
197
198 if (utf8buflen == 0) status = finished;
199 else status = stopped;
200}
201
202
203// returns the length that the current contents of the
204// utf8buf should be
205size_t utf8inconvertclass::getutf8charlen () {
206 if (utf8buflen == 0) return 0;
207
208 // one byte character
209 if (utf8buf[0] < 0x80) return 1;
210
211 // error, is not the start of a utf-8 character
212 if (utf8buf[0] < 0xc0) return 1;
213
214 // two bute character
215 if (utf8buf[0] < 0xe0) return 2;
216
217 // three byte character
218 if (utf8buf[0] < 0xf0) return 3;
219
220 // error, character too long for unicode
221 return 1;
222}
223
224
225void utf8outconvertclass::reset () {
226 input = NULL;
227 outs = NULL;
228 utf8buflen = 0;
229 utf8bufhere = 0;
230}
231
232// note that convert does not null-terminate the
233// output array of characters
234void utf8outconvertclass::convert (char *output, size_t maxlen,
235 size_t &len, status_t &status) {
236 if (input == NULL || output == NULL) {
237 if (utf8buflen == 0) status = finished;
238 else status = unfinished;
239 return;
240 }
241
242 // don't want any funny sign conversions happening
243 unsigned char *uoutput = (unsigned char *)output;
244 text_t::iterator textend = input->end();
245 len = 0;
246 while (len < maxlen) {
247 // empty the contents of the internal buffer
248 if (utf8buflen > 0) {
249 while (len < maxlen && utf8bufhere < utf8buflen) {
250 *uoutput = utf8buf[utf8bufhere];
251 ++uoutput;
252 ++len;
253 ++utf8bufhere;
254 }
255
256 if (utf8bufhere == utf8buflen) {
257 utf8bufhere = 0;
258 utf8buflen = 0;
259 }
260 }
261
262 // fill up the buffer with the next character
263 if (utf8buflen == 0) {
264 if (texthere == textend) break; // finished!
265 if (!rzws || (*texthere != 0x200b))
266 utf8buflen = output_utf8_char (*texthere, utf8buf,
267 &utf8buf[MAXUTF8CHARLEN-1]);
268 ++texthere;
269 utf8bufhere = 0;
270 }
271 }
272
273 if (texthere == textend && utf8buflen == 0) status = finished;
274 else status = unfinished;
275}
276
277
278
279
280
281
282mapdata_t::mapdata_t () {
283
284 // reset all the map ptrs to be NULL
285 for (int i=0; i<256; ++i) {
286 ptrs[i] = (unsigned short *)NULL;
287 }
288
289 // say nothing has been loaded
290 loaded = false;
291}
292
293
294mapconvert::mapconvert () {
295 absentc = 0;
296}
297
298// setmapfile will cause loadmapfile to be called when conversion is
299// needed
300bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
301 // check to see if the mapfile has been already loaded
302 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
303
304 unloadmapfile ();
305 mapfile = themapfile;
306 absentc = theabsentc;
307
308 return true;
309}
310
311
312
313// loadmapfile should be called before any conversion is done
314bool mapconvert::loadmapfile (const text_t &themapfile,
315 unsigned short theabsentc) {
316 FILE *mapfilein = (FILE *)NULL;
317
318 // check to see if the mapfile has been already loaded
319 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
320
321 unloadmapfile ();
322 mapfile = themapfile;
323 absentc = theabsentc;
324
325 // open the map file
326 char *cfilename = mapfile.getcstr();
327 if (cfilename == (char *)NULL) return false;
328 mapfilein = fopen(cfilename, "rb");
329 delete []cfilename; cfilename = NULL;
330
331 if (mapfilein == (FILE *)NULL) return false;
332
333 unsigned char c, n1, n2;
334 unsigned short *arrptr;
335 int i;
336 c = fgetc (mapfilein);
337 while (!feof (mapfilein)) {
338 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
339 // allocate a new array
340 arrptr = new unsigned short[256];
341 mapdata.ptrs[c] = arrptr;
342 } else arrptr = mapdata.ptrs[c];
343
344 // clear the array
345 for (i=0; i<256; ++i) arrptr[i] = 0;
346
347 // read in this block
348 n1 = fgetc (mapfilein);
349 n2 = fgetc (mapfilein);
350 i=0;
351 while (!feof (mapfilein)) {
352 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
353
354 ++i;
355 if (i >= 256) break;
356 n1 = fgetc (mapfilein);
357 n2 = fgetc (mapfilein);
358 }
359
360 c = fgetc (mapfilein);
361 }
362
363 mapdata.loaded = true;
364
365 return true;
366}
367
368void mapconvert::unloadmapfile () {
369 if (!mapdata.loaded) return;
370
371 for (int i=0; i<256; ++i) {
372 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
373 delete [] mapdata.ptrs[i];
374 mapdata.ptrs[i] = (unsigned short *)NULL;
375 }
376 }
377
378 mapdata.loaded = false;
379}
380
381
382unsigned short mapconvert::convert (unsigned short c) {
383 if (!mapdata.loaded) {
384 if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
385 // do nothing, successfully loaded database
386 } else return absentc;
387 }
388
389 if (c == 0) return 0; // 0 always maps to 0...
390
391 unsigned short n1 = c >> 8;
392 unsigned short n2 = c & 0xff;
393
394 unsigned short *arrptr = mapdata.ptrs[n1];
395 if (arrptr == (unsigned short *)NULL) return absentc;
396
397 if (arrptr[n2] == 0) return absentc;
398 return arrptr[n2];
399}
400
401text_t mapconvert::convert (const text_t &instr) {
402 if (!mapdata.loaded) return absentc;
403
404 text_t outstr;
405 text_t::const_iterator here = instr.begin();
406 text_t::const_iterator end = instr.end();
407
408 while (here != end) {
409 outstr.push_back(this->convert(*here));
410 ++here;
411 }
412
413 return outstr;
414}
415
416
417
418
419mapinconvertclass::mapinconvertclass () {
420 m_multibyte = 0;
421 mapbuflen = 0;
422}
423
424void mapinconvertclass::reset () {
425 start = NULL;
426 len = 0;
427 mapbuflen=0;
428}
429
430void mapinconvertclass::convert (text_t &output, status_t &status) {
431 output.clear();
432
433 if (start == NULL || len == 0) {
434 if (mapbuflen == 0) status = finished;
435 else status = stopped;
436 return;
437 }
438
439 // don't want any funny sign conversions happening
440 unsigned char *here = (unsigned char *)start;
441
442 size_t charlen = getmapcharlen ();
443 while (len > 0) {
444 if (charlen == 0) {
445 // start parsing a new character
446 mapbuflen = 0;
447 mapbuf[mapbuflen++] = *here;
448 ++here;
449 --len;
450 charlen = getmapcharlen ();
451
452 } else if (mapbuflen < charlen) {
453 // assumes charlen is always less than MAXMAPCHARLEN
454 mapbuf[mapbuflen++] = *here;
455 ++here;
456 --len;
457 }
458
459 if (mapbuflen == charlen) {
460 // got a complete character
461 if (charlen == 1) {
462 if (mapbuf[0] < 0x80) {
463 // ascii character
464 output.push_back (mapbuf[0]);
465 } else {
466 output.push_back (converter.convert((unsigned short)mapbuf[0]));
467 }
468
469 } else {
470 // two byte character
471 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
472 (unsigned short)mapbuf[1]));
473 }
474
475 mapbuflen = 0;
476 charlen = 0;
477 }
478 }
479
480 start = (char *)here; // save current position
481
482 if (mapbuflen == 0) status = finished;
483 else status = stopped;
484}
485
486
487
488mapoutconvertclass::mapoutconvertclass () {
489 m_multibyte = 0;
490 mapbuflen=0;
491 mapbufhere=0;
492}
493
494void mapoutconvertclass::reset () {
495 input = NULL;
496 outs = NULL;
497 mapbuflen = 0;
498 mapbufhere = 0;
499}
500
501// note that convert does not null-terminate the
502// output array of characters
503void mapoutconvertclass::convert (char *output, size_t maxlen,
504 size_t &len, status_t &status) {
505 unsigned short outc;
506
507 if (input == NULL || output == NULL) {
508 if (mapbuflen == 0) status = finished;
509 else status = unfinished;
510 return;
511 }
512
513 // don't want any funny sign conversions happening
514 unsigned char *uoutput = (unsigned char *)output;
515 text_t::iterator textend = input->end();
516 len = 0;
517 while (len < maxlen) {
518 // empty the contents of the internal buffer
519 if (mapbuflen > 0) {
520 while (len < maxlen && mapbufhere < mapbuflen) {
521 *uoutput = mapbuf[mapbufhere];
522 ++uoutput;
523 ++len;
524 ++mapbufhere;
525 }
526
527 if (mapbufhere == mapbuflen) {
528 mapbufhere = 0;
529 mapbuflen = 0;
530 }
531 }
532
533 // fill up the buffer with the next character
534 if (mapbuflen == 0) {
535 if (texthere == textend) break; // finished!
536 if (!rzws || (*texthere != 0x200b)) {
537 if (*texthere < 0x80) {
538 mapbuf[0] = (unsigned char)*texthere;
539 mapbuflen = 1;
540 } else {
541 outc = converter.convert (*texthere);
542 if (m_multibyte) {
543 mapbuf[0] = (unsigned char)(outc >> 8);
544 mapbuf[1] = (unsigned char)(outc & 0xff);
545 mapbuflen = 2;
546 } else {
547 mapbuf[0] = outc;
548 mapbuflen = 1;
549 }
550 }
551 }
552
553 ++texthere;
554 mapbufhere = 0;
555 }
556 }
557
558 if (texthere == textend && mapbuflen == 0) status = finished;
559 else status = unfinished;
560}
561
562
563bool simplemapconvert::loadmapfile (bool in) {
564 if (loaded) return true;
565 if (mapfile.empty()) return false;
566
567 char *cfilename = mapfile.getcstr();
568#ifdef GSDL_USE_IOS_H
569 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
570#else
571 ifstream mapfilein (cfilename, ios::in);
572#endif
573 delete []cfilename;
574 if (!mapfilein) return false;
575
576 char cline[2048];
577 text_t line;
578
579 while (!mapfilein.eof()) {
580 mapfilein.getline (cline, 2048);
581 line.clear();
582 line.appendcstr (cline);
583 if (line.empty()) continue;
584 // remove comments
585 text_t::iterator end = line.end();
586 text_t::iterator here = findchar (line.begin(), end, '#');
587 if (here != end) {
588 line.erase (here, end);
589 if (line.empty()) continue;
590 }
591
592 text_tarray parts;
593 splitchar (line.begin(), line.end(), '\t', parts);
594
595 // do some simple sanity checks
596 if (parts.size() < 2) continue;
597 text_t::iterator begin1 = parts[0].begin();
598 text_t::iterator begin2 = parts[1].begin();
599 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
600 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
601 char *from = parts[0].getcstr();
602 char *to = parts[1].getcstr();
603 unsigned int f = 0, t = 0;
604 sscanf (from, "%i", &f);
605 sscanf (to, "%i", &t);
606 delete []from;
607 delete []to;
608
609 if (in) mapping[(unsigned short)f] = (unsigned short)t;
610 else mapping[(unsigned short)t] = (unsigned short)f;
611 }
612
613 loaded = true;
614 return true;
615}
616
617unsigned short simplemapconvert::convert (unsigned short c, bool in) {
618
619 if (!loaded)
620 if (!loadmapfile(in)) return absentc;
621
622 return mapping[c];
623}
624
625
626void simplemapinconvertclass::convert (text_t &output, status_t &status) {
627 output.clear();
628
629 if (start == NULL || len == 0) {
630 status = finished;
631 return;
632 }
633
634 // don't want any funny sign conversions happening
635 unsigned char *here = (unsigned char *)start;
636 while (len > 0) {
637
638 if (*here < 0x80)
639 output.push_back (*here); // append this character
640 else
641 output.push_back (converter.convert(*here, true));
642
643 ++here;
644 --len;
645 }
646
647 start = (char *)here; // save current position
648 status = finished;
649}
650
651
652void simplemapoutconvertclass::convert (char *output, size_t maxlen,
653 size_t &len, status_t &status) {
654
655 if (input == NULL || output == NULL) {
656 status = finished;
657 return;
658 }
659
660 // don't want any funny sign conversions happening
661 unsigned char *uoutput = (unsigned char *)output;
662 text_t::iterator textend = input->end();
663 len = 0;
664 while ((len < maxlen) && (texthere != textend)) {
665
666 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
667 else *uoutput = converter.convert (*texthere, false);
668
669 ++uoutput;
670 ++len;
671 ++texthere;
672 }
673
674 if (texthere == textend) status = finished;
675 else status = unfinished;
676}
Note: See TracBrowser for help on using the repository browser.