source: main/tags/2.23/gsdl/lib/gsdlunicode.cpp@ 31150

Last change on this file since 31150 was 1236, checked in by nzdl, 24 years ago

fixed a couple of compiler warnings created by the new encoding stuff

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.0 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: gsdlunicode.cpp 1236 2000-06-23 05:03:29Z nzdl $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.14 2000/06/23 05:03:29 nzdl
31 fixed a couple of compiler warnings created by the new encoding stuff
32
33 Revision 1.13 2000/06/23 03:21:38 sjboddie
34 Created converter classes for simple 8 bit encodings that use a
35 simple textual map file. Instances of these classes are used to handle
36 the Windows 1256 (Arabic) encoding.
37
38 Revision 1.12 2000/04/06 19:58:02 cs025
39 Correcting a correction - reinstated all lib files due to silly
40 CVS confusion.
41
42 Revision 1.10 1999/09/07 04:57:43 sjboddie
43 added gpl notice
44
45 Revision 1.9 1999/07/21 07:23:17 rjmcnab
46 Added setmapfile function to map conversion utilities so the map file
47 does not need to be loaded when map conversion object is created.
48
49 Revision 1.8 1999/07/01 04:03:45 rjmcnab
50 Optimised utf8inconvertclass::convert slightly.
51
52 Revision 1.7 1999/06/30 04:59:03 rjmcnab
53 Added a to_utf8 function that takes iterators as input.
54
55 Revision 1.6 1999/06/26 01:05:04 rjmcnab
56 No real changes.
57
58 Revision 1.5 1999/01/12 01:50:59 rjmcnab
59
60 Standard header.
61
62 Revision 1.4 1999/01/08 02:33:15 rjmcnab
63
64 Added standard header to source files.
65
66 */
67
68
69#include "gsdlunicode.h"
70
71
72// unitool is currently in mg, if mg is not being used it should
73// be moved into GSDLHOME/lib
74#include "unitool.h"
75
76#include "fileutil.h"
77
78#include <stdio.h>
79
80#if defined(GSDL_USE_OBJECTSPACE)
81# include <ospace\std\iostream>
82# include <ospace\std\fstream>
83#elif defined(GSDL_USE_IOS_H)
84# include <iostream.h>
85# include <fstream.h>
86#else
87# include <iostream>
88# include <fstream>
89#endif
90
91
92// converts a unicode encode text_t string to a utf-8
93// encoded text_t string
94text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
95 text_t out;
96
97 unsigned char thischar[MAXUTF8CHARLEN];
98 int i, charlen;
99
100 while (here != end) {
101 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
102 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
103 here++;
104 }
105
106 return out;
107}
108
109// converts a utf-8 encoded text_t string to a unicode
110// encoded text_t string
111text_t to_uni (const text_t &in) {
112 text_t out;
113 unsigned char *in_cstr = (unsigned char *)in.getcstr();
114 unsigned char *here = in_cstr;
115 unsigned char *end = in_cstr;
116
117 unsigned short unichar;
118 int charlen = 0;
119
120 // get the last valid character in the string
121 while (*end != '\0') end++;
122 end--;
123
124 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
125 out.push_back(unichar);
126 here += charlen;
127 }
128
129 delete in_cstr;
130
131 return out;
132}
133
134
135
136utf8inconvertclass::utf8inconvertclass () {
137 utf8buflen = 0;
138}
139
140void utf8inconvertclass::reset () {
141 start = NULL;
142 len = 0;
143 utf8buflen=0;
144}
145
146void utf8inconvertclass::convert (text_t &output, status_t &status) {
147 output.clear();
148 output.reserve (len/3);
149
150 if (start == NULL || len == 0) {
151 if (utf8buflen == 0) status = finished;
152 else status = stopped;
153 return;
154 }
155
156 // don't want any funny sign conversions happening
157 unsigned char *here = (unsigned char *)start;
158 unsigned char *end = here+len-1;
159 unsigned short c;
160 size_t realcharlen;
161
162 size_t charlen = getutf8charlen ();
163 while (len > 0) {
164 if (charlen == 0) {
165 // start parsing a new character
166 utf8buflen = 0;
167
168 // fast common case
169 while (len > 3) {
170 realcharlen = parse_utf8_char (here, end, &c);
171 output.push_back (c);
172 here += realcharlen;
173 len -= realcharlen;
174 }
175
176 utf8buf[utf8buflen++] = *here;
177 ++here;
178 --len;
179 charlen = getutf8charlen ();
180
181 } else if (utf8buflen < charlen) {
182 // assumes charlen is always less than MAXUTF8CHARLEN
183 utf8buf[utf8buflen++] = *here;
184 ++here;
185 --len;
186 }
187
188 if (utf8buflen == charlen) {
189 // got a complete character
190 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
191 output.push_back (c);
192
193 // move any unparsed characters. If an error occurred some of
194 // the characters might be unused.
195 int i;
196 int diff = utf8buflen - realcharlen;
197 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
198 utf8buflen = diff;
199 charlen = getutf8charlen ();
200 }
201 }
202
203 start = (char *)here; // save current position
204
205 if (utf8buflen == 0) status = finished;
206 else status = stopped;
207}
208
209
210// returns the length that the current contents of the
211// utf8buf should be
212size_t utf8inconvertclass::getutf8charlen () {
213 if (utf8buflen == 0) return 0;
214
215 // one byte character
216 if (utf8buf[0] < 0x80) return 1;
217
218 // error, is not the start of a utf-8 character
219 if (utf8buf[0] < 0xc0) return 1;
220
221 // two bute character
222 if (utf8buf[0] < 0xe0) return 2;
223
224 // three byte character
225 if (utf8buf[0] < 0xf0) return 3;
226
227 // error, character too long for unicode
228 return 1;
229}
230
231
232void utf8outconvertclass::reset () {
233 input = NULL;
234 outs = NULL;
235 utf8buflen = 0;
236 utf8bufhere = 0;
237}
238
239// note that convert does not null-terminate the
240// output array of characters
241void utf8outconvertclass::convert (char *output, size_t maxlen,
242 size_t &len, status_t &status) {
243 if (input == NULL || output == NULL) {
244 if (utf8buflen == 0) status = finished;
245 else status = unfinished;
246 return;
247 }
248
249 // don't want any funny sign conversions happening
250 unsigned char *uoutput = (unsigned char *)output;
251 text_t::iterator textend = input->end();
252 len = 0;
253 while (len < maxlen) {
254 // empty the contents of the internal buffer
255 if (utf8buflen > 0) {
256 while (len < maxlen && utf8bufhere < utf8buflen) {
257 *uoutput = utf8buf[utf8bufhere];
258 uoutput++;
259 len++;
260 utf8bufhere++;
261 }
262
263 if (utf8bufhere == utf8buflen) {
264 utf8bufhere = 0;
265 utf8buflen = 0;
266 }
267 }
268
269 // fill up the buffer with the next character
270 if (utf8buflen == 0) {
271 if (texthere == textend) break; // finished!
272 if (!rzws || (*texthere != 0x200b))
273 utf8buflen = output_utf8_char (*texthere, utf8buf,
274 &utf8buf[MAXUTF8CHARLEN-1]);
275 texthere++;
276 utf8bufhere = 0;
277 }
278 }
279
280 if (texthere == textend && utf8buflen == 0) status = finished;
281 else status = unfinished;
282}
283
284
285
286
287
288
289mapdata_t::mapdata_t () {
290 int i;
291
292 // reset all the map ptrs to be NULL
293 for (i=0; i<256; i++) {
294 ptrs[i] = (unsigned short *)NULL;
295 }
296
297 // say nothing has been loaded
298 loaded = false;
299}
300
301
302mapconvert::mapconvert () {
303 absentc = 0;
304}
305
306// setmapfile will cause loadmapfile to be called when conversion is
307// needed
308bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
309 unsigned short theabsentc) {
310 // check to see if the mapfile has been already loaded
311 if (mapdata.loaded && gsdlhome == thegsdlhome &&
312 encoding == theencoding && absentc == theabsentc)
313 return true;
314
315 unloadmapfile ();
316 gsdlhome = thegsdlhome;
317 encoding = theencoding;
318 absentc = theabsentc;
319
320 return true;
321}
322
323
324
325// loadmapfile should be called before any conversion is done
326bool mapconvert::loadmapfile (const text_t &thegsdlhome,
327 const text_t &theencoding,
328 unsigned short theabsentc) {
329 FILE *mapfilein = (FILE *)NULL;
330
331 // check to see if the mapfile has been already loaded
332 if (mapdata.loaded && gsdlhome == thegsdlhome &&
333 encoding == theencoding && absentc == theabsentc)
334 return true;
335
336 unloadmapfile ();
337 gsdlhome = thegsdlhome;
338 encoding = theencoding;
339 absentc = theabsentc;
340
341 // open the map file
342 text_t filename = filename_cat (gsdlhome, "unicode");
343 filename = filename_cat (filename, encoding);
344 filename += ".ump";
345 char *cfilename = filename.getcstr();
346 if (cfilename == (char *)NULL) return false;
347 mapfilein = fopen(cfilename, "rb");
348 delete cfilename;
349
350 if (mapfilein == (FILE *)NULL) return false;
351
352 unsigned char c, n1, n2;
353 unsigned short *arrptr;
354 int i;
355 c = fgetc (mapfilein);
356 while (!feof (mapfilein)) {
357 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
358 // allocate a new array
359 arrptr = new unsigned short[256];
360 mapdata.ptrs[c] = arrptr;
361 } else arrptr = mapdata.ptrs[c];
362
363 // clear the array
364 for (i=0; i<256; i++) arrptr[i] = 0;
365
366 // read in this block
367 n1 = fgetc (mapfilein);
368 n2 = fgetc (mapfilein);
369 i=0;
370 while (!feof (mapfilein)) {
371 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
372
373 i++;
374 if (i >= 256) break;
375 n1 = fgetc (mapfilein);
376 n2 = fgetc (mapfilein);
377 }
378
379 c = fgetc (mapfilein);
380 }
381
382 mapdata.loaded = true;
383
384 return true;
385}
386
387void mapconvert::unloadmapfile () {
388 if (!mapdata.loaded) return;
389
390 int i;
391 for (i=0; i<256; i++) {
392 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
393 delete [] mapdata.ptrs[i];
394 mapdata.ptrs[i] = (unsigned short *)NULL;
395 }
396 }
397
398 mapdata.loaded = false;
399}
400
401
402unsigned short mapconvert::convert (unsigned short c) {
403 if (!mapdata.loaded) {
404 if (!gsdlhome.empty() && !encoding.empty() &&
405 loadmapfile (gsdlhome, encoding, absentc)) {
406 // do nothing, successfully loaded database
407 } else return absentc;
408 }
409
410 if (c == 0) return 0; // 0 always maps to 0...
411
412 unsigned short n1 = c >> 8;
413 unsigned short n2 = c & 0xff;
414
415 unsigned short *arrptr = mapdata.ptrs[n1];
416 if (arrptr == (unsigned short *)NULL) return absentc;
417
418 if (arrptr[n2] == 0) return absentc;
419 return arrptr[n2];
420}
421
422text_t mapconvert::convert (const text_t &instr) {
423 if (!mapdata.loaded) return absentc;
424
425 text_t outstr;
426 text_t::const_iterator here = instr.begin();
427 text_t::const_iterator end = instr.end();
428
429 while (here != end) {
430 outstr.push_back(this->convert(*here));
431 here++;
432 }
433
434 return outstr;
435}
436
437
438
439
440mapinconvertclass::mapinconvertclass () {
441 mapbuflen = 0;
442}
443
444void mapinconvertclass::reset () {
445 start = NULL;
446 len = 0;
447 mapbuflen=0;
448}
449
450void mapinconvertclass::convert (text_t &output, status_t &status) {
451 output.clear();
452
453 if (start == NULL || len == 0) {
454 if (mapbuflen == 0) status = finished;
455 else status = stopped;
456 return;
457 }
458
459 // don't want any funny sign conversions happening
460 unsigned char *here = (unsigned char *)start;
461
462 size_t charlen = getmapcharlen ();
463 while (len > 0) {
464 if (charlen == 0) {
465 // start parsing a new character
466 mapbuflen = 0;
467 mapbuf[mapbuflen++] = *here;
468 ++here;
469 --len;
470 charlen = getmapcharlen ();
471
472 } else if (mapbuflen < charlen) {
473 // assumes charlen is always less than MAXMAPCHARLEN
474 mapbuf[mapbuflen++] = *here;
475 ++here;
476 --len;
477 }
478
479 if (mapbuflen == charlen) {
480 // got a complete character
481 if (charlen == 1) {
482 // ascii character
483 output.push_back (mapbuf[0]);
484
485 } else {
486 // two byte character
487 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
488 (unsigned short)mapbuf[1]));
489 }
490
491 mapbuflen = 0;
492 charlen = 0;
493 }
494 }
495
496 start = (char *)here; // save current position
497
498 if (mapbuflen == 0) status = finished;
499 else status = stopped;
500}
501
502
503
504mapoutconvertclass::mapoutconvertclass () {
505 mapbuflen=0;
506 mapbufhere=0;
507}
508
509void mapoutconvertclass::reset () {
510 input = NULL;
511 outs = NULL;
512 mapbuflen = 0;
513 mapbufhere = 0;
514}
515
516// note that convert does not null-terminate the
517// output array of characters
518void mapoutconvertclass::convert (char *output, size_t maxlen,
519 size_t &len, status_t &status) {
520 unsigned short outc;
521
522 if (input == NULL || output == NULL) {
523 if (mapbuflen == 0) status = finished;
524 else status = unfinished;
525 return;
526 }
527
528 // don't want any funny sign conversions happening
529 unsigned char *uoutput = (unsigned char *)output;
530 text_t::iterator textend = input->end();
531 len = 0;
532 while (len < maxlen) {
533 // empty the contents of the internal buffer
534 if (mapbuflen > 0) {
535 while (len < maxlen && mapbufhere < mapbuflen) {
536 *uoutput = mapbuf[mapbufhere];
537 uoutput++;
538 len++;
539 mapbufhere++;
540 }
541
542 if (mapbufhere == mapbuflen) {
543 mapbufhere = 0;
544 mapbuflen = 0;
545 }
546 }
547
548 // fill up the buffer with the next character
549 if (mapbuflen == 0) {
550 if (texthere == textend) break; // finished!
551 if (!rzws || (*texthere != 0x200b)) {
552 if (*texthere < 0x80) {
553 mapbuf[0] = (unsigned char)*texthere;
554 mapbuflen = 1;
555 } else {
556 outc = converter.convert (*texthere);
557 mapbuf[0] = (unsigned char)(outc >> 8);
558 mapbuf[1] = (unsigned char)(outc & 0xff);
559 mapbuflen = 2;
560 }
561 }
562
563 texthere++;
564 mapbufhere = 0;
565 }
566 }
567
568 if (texthere == textend && mapbuflen == 0) status = finished;
569 else status = unfinished;
570}
571
572
573bool simplemapconvert::loadmapfile (bool in) {
574 if (loaded) return true;
575 if (mapfile.empty()) return false;
576
577 char *cfilename = mapfile.getcstr();
578#ifdef GSDL_USE_IOS_H
579 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
580#else
581 ifstream mapfilein (cfilename, ios::in);
582#endif
583 delete cfilename;
584 if (!mapfilein) return false;
585
586 char cline[2048];
587 text_t line;
588
589 while (!mapfilein.eof()) {
590 mapfilein.getline (cline, 2048);
591 line.clear();
592 line.appendcstr (cline);
593 if (line.empty()) continue;
594 // remove comments
595 text_t::iterator end = line.end();
596 text_t::iterator here = findchar (line.begin(), end, '#');
597 if (here != end) {
598 line.erase (here, end);
599 if (line.empty()) continue;
600 }
601
602 text_tarray parts;
603 splitchar (line.begin(), line.end(), '\t', parts);
604
605 // do some simple sanity checks
606 if (parts.size() < 2) continue;
607 text_t::iterator begin1 = parts[0].begin();
608 text_t::iterator begin2 = parts[1].begin();
609 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
610 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
611 char *from = parts[0].getcstr();
612 char *to = parts[1].getcstr();
613 unsigned int f = 0, t = 0;
614 sscanf (from, "%i", &f);
615 sscanf (to, "%i", &t);
616 delete from;
617 delete to;
618
619 if (in) mapping[(unsigned short)f] = (unsigned short)t;
620 else mapping[(unsigned short)t] = (unsigned short)f;
621 }
622
623 loaded = true;
624 return true;
625}
626
627unsigned short simplemapconvert::convert (unsigned short c, bool in) {
628
629 if (!loaded)
630 if (!loadmapfile(in)) return absentc;
631
632 return mapping[c];
633}
634
635
636void simplemapinconvertclass::convert (text_t &output, status_t &status) {
637 output.clear();
638
639 if (start == NULL || len == 0) {
640 status = finished;
641 return;
642 }
643
644 // don't want any funny sign conversions happening
645 unsigned char *here = (unsigned char *)start;
646 while (len > 0) {
647
648 if (*here < 0x80)
649 output.push_back (*here); // append this character
650 else
651 output.push_back (converter.convert(*here, true));
652
653 ++here;
654 --len;
655 }
656
657 start = (char *)here; // save current position
658 status = finished;
659}
660
661
662void simplemapoutconvertclass::convert (char *output, size_t maxlen,
663 size_t &len, status_t &status) {
664
665 if (input == NULL || output == NULL) {
666 status = finished;
667 return;
668 }
669
670 // don't want any funny sign conversions happening
671 unsigned char *uoutput = (unsigned char *)output;
672 text_t::iterator textend = input->end();
673 len = 0;
674 while ((len < maxlen) && (texthere != textend)) {
675
676 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
677 else *uoutput = converter.convert (*texthere, false);
678
679 ++uoutput;
680 ++len;
681 ++texthere;
682 }
683
684 if (texthere == textend) status = finished;
685 else status = unfinished;
686}
Note: See TracBrowser for help on using the repository browser.