source: branches/New_Config_Format-branch/gsdl/lib/gsdlunicode.cpp@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.2 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: gsdlunicode.cpp 1279 2000-07-12 22:21:53Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.12.2.1 2000/07/12 22:20:55 sjboddie
31 merged changes to trunk into New_Config_Format branch
32
33 Revision 1.14 2000/06/23 05:03:29 nzdl
34 fixed a couple of compiler warnings created by the new encoding stuff
35
36 Revision 1.13 2000/06/23 03:21:38 sjboddie
37 Created converter classes for simple 8 bit encodings that use a
38 simple textual map file. Instances of these classes are used to handle
39 the Windows 1256 (Arabic) encoding.
40
41 Revision 1.12 2000/04/06 19:58:02 cs025
42 Correcting a correction - reinstated all lib files due to silly
43 CVS confusion.
44
45 Revision 1.10 1999/09/07 04:57:43 sjboddie
46 added gpl notice
47
48 Revision 1.9 1999/07/21 07:23:17 rjmcnab
49 Added setmapfile function to map conversion utilities so the map file
50 does not need to be loaded when map conversion object is created.
51
52 Revision 1.8 1999/07/01 04:03:45 rjmcnab
53 Optimised utf8inconvertclass::convert slightly.
54
55 Revision 1.7 1999/06/30 04:59:03 rjmcnab
56 Added a to_utf8 function that takes iterators as input.
57
58 Revision 1.6 1999/06/26 01:05:04 rjmcnab
59 No real changes.
60
61 Revision 1.5 1999/01/12 01:50:59 rjmcnab
62
63 Standard header.
64
65 Revision 1.4 1999/01/08 02:33:15 rjmcnab
66
67 Added standard header to source files.
68
69 */
70
71
72#include "gsdlunicode.h"
73
74
75// unitool is currently in mg, if mg is not being used it should
76// be moved into GSDLHOME/lib
77#include "unitool.h"
78
79#include "fileutil.h"
80
81#include <stdio.h>
82
83#if defined(GSDL_USE_OBJECTSPACE)
84# include <ospace\std\iostream>
85# include <ospace\std\fstream>
86#elif defined(GSDL_USE_IOS_H)
87# include <iostream.h>
88# include <fstream.h>
89#else
90# include <iostream>
91# include <fstream>
92#endif
93
94
95// converts a unicode encode text_t string to a utf-8
96// encoded text_t string
97text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
98 text_t out;
99
100 unsigned char thischar[MAXUTF8CHARLEN];
101 int i, charlen;
102
103 while (here != end) {
104 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
105 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
106 here++;
107 }
108
109 return out;
110}
111
112// converts a utf-8 encoded text_t string to a unicode
113// encoded text_t string
114text_t to_uni (const text_t &in) {
115 text_t out;
116 unsigned char *in_cstr = (unsigned char *)in.getcstr();
117 unsigned char *here = in_cstr;
118 unsigned char *end = in_cstr;
119
120 unsigned short unichar;
121 int charlen = 0;
122
123 // get the last valid character in the string
124 while (*end != '\0') end++;
125 end--;
126
127 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
128 out.push_back(unichar);
129 here += charlen;
130 }
131
132 delete in_cstr;
133
134 return out;
135}
136
137
138
139utf8inconvertclass::utf8inconvertclass () {
140 utf8buflen = 0;
141}
142
143void utf8inconvertclass::reset () {
144 start = NULL;
145 len = 0;
146 utf8buflen=0;
147}
148
149void utf8inconvertclass::convert (text_t &output, status_t &status) {
150 output.clear();
151 output.reserve (len/3);
152
153 if (start == NULL || len == 0) {
154 if (utf8buflen == 0) status = finished;
155 else status = stopped;
156 return;
157 }
158
159 // don't want any funny sign conversions happening
160 unsigned char *here = (unsigned char *)start;
161 unsigned char *end = here+len-1;
162 unsigned short c;
163 size_t realcharlen;
164
165 size_t charlen = getutf8charlen ();
166 while (len > 0) {
167 if (charlen == 0) {
168 // start parsing a new character
169 utf8buflen = 0;
170
171 // fast common case
172 while (len > 3) {
173 realcharlen = parse_utf8_char (here, end, &c);
174 output.push_back (c);
175 here += realcharlen;
176 len -= realcharlen;
177 }
178
179 utf8buf[utf8buflen++] = *here;
180 ++here;
181 --len;
182 charlen = getutf8charlen ();
183
184 } else if (utf8buflen < charlen) {
185 // assumes charlen is always less than MAXUTF8CHARLEN
186 utf8buf[utf8buflen++] = *here;
187 ++here;
188 --len;
189 }
190
191 if (utf8buflen == charlen) {
192 // got a complete character
193 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
194 output.push_back (c);
195
196 // move any unparsed characters. If an error occurred some of
197 // the characters might be unused.
198 int i;
199 int diff = utf8buflen - realcharlen;
200 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
201 utf8buflen = diff;
202 charlen = getutf8charlen ();
203 }
204 }
205
206 start = (char *)here; // save current position
207
208 if (utf8buflen == 0) status = finished;
209 else status = stopped;
210}
211
212
213// returns the length that the current contents of the
214// utf8buf should be
215size_t utf8inconvertclass::getutf8charlen () {
216 if (utf8buflen == 0) return 0;
217
218 // one byte character
219 if (utf8buf[0] < 0x80) return 1;
220
221 // error, is not the start of a utf-8 character
222 if (utf8buf[0] < 0xc0) return 1;
223
224 // two bute character
225 if (utf8buf[0] < 0xe0) return 2;
226
227 // three byte character
228 if (utf8buf[0] < 0xf0) return 3;
229
230 // error, character too long for unicode
231 return 1;
232}
233
234
235void utf8outconvertclass::reset () {
236 input = NULL;
237 outs = NULL;
238 utf8buflen = 0;
239 utf8bufhere = 0;
240}
241
242// note that convert does not null-terminate the
243// output array of characters
244void utf8outconvertclass::convert (char *output, size_t maxlen,
245 size_t &len, status_t &status) {
246 if (input == NULL || output == NULL) {
247 if (utf8buflen == 0) status = finished;
248 else status = unfinished;
249 return;
250 }
251
252 // don't want any funny sign conversions happening
253 unsigned char *uoutput = (unsigned char *)output;
254 text_t::iterator textend = input->end();
255 len = 0;
256 while (len < maxlen) {
257 // empty the contents of the internal buffer
258 if (utf8buflen > 0) {
259 while (len < maxlen && utf8bufhere < utf8buflen) {
260 *uoutput = utf8buf[utf8bufhere];
261 uoutput++;
262 len++;
263 utf8bufhere++;
264 }
265
266 if (utf8bufhere == utf8buflen) {
267 utf8bufhere = 0;
268 utf8buflen = 0;
269 }
270 }
271
272 // fill up the buffer with the next character
273 if (utf8buflen == 0) {
274 if (texthere == textend) break; // finished!
275 if (!rzws || (*texthere != 0x200b))
276 utf8buflen = output_utf8_char (*texthere, utf8buf,
277 &utf8buf[MAXUTF8CHARLEN-1]);
278 texthere++;
279 utf8bufhere = 0;
280 }
281 }
282
283 if (texthere == textend && utf8buflen == 0) status = finished;
284 else status = unfinished;
285}
286
287
288
289
290
291
292mapdata_t::mapdata_t () {
293 int i;
294
295 // reset all the map ptrs to be NULL
296 for (i=0; i<256; i++) {
297 ptrs[i] = (unsigned short *)NULL;
298 }
299
300 // say nothing has been loaded
301 loaded = false;
302}
303
304
305mapconvert::mapconvert () {
306 absentc = 0;
307}
308
309// setmapfile will cause loadmapfile to be called when conversion is
310// needed
311bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
312 unsigned short theabsentc) {
313 // check to see if the mapfile has been already loaded
314 if (mapdata.loaded && gsdlhome == thegsdlhome &&
315 encoding == theencoding && absentc == theabsentc)
316 return true;
317
318 unloadmapfile ();
319 gsdlhome = thegsdlhome;
320 encoding = theencoding;
321 absentc = theabsentc;
322
323 return true;
324}
325
326
327
328// loadmapfile should be called before any conversion is done
329bool mapconvert::loadmapfile (const text_t &thegsdlhome,
330 const text_t &theencoding,
331 unsigned short theabsentc) {
332 FILE *mapfilein = (FILE *)NULL;
333
334 // check to see if the mapfile has been already loaded
335 if (mapdata.loaded && gsdlhome == thegsdlhome &&
336 encoding == theencoding && absentc == theabsentc)
337 return true;
338
339 unloadmapfile ();
340 gsdlhome = thegsdlhome;
341 encoding = theencoding;
342 absentc = theabsentc;
343
344 // open the map file
345 text_t filename = filename_cat (gsdlhome, "unicode");
346 filename = filename_cat (filename, encoding);
347 filename += ".ump";
348 char *cfilename = filename.getcstr();
349 if (cfilename == (char *)NULL) return false;
350 mapfilein = fopen(cfilename, "rb");
351 delete cfilename;
352
353 if (mapfilein == (FILE *)NULL) return false;
354
355 unsigned char c, n1, n2;
356 unsigned short *arrptr;
357 int i;
358 c = fgetc (mapfilein);
359 while (!feof (mapfilein)) {
360 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
361 // allocate a new array
362 arrptr = new unsigned short[256];
363 mapdata.ptrs[c] = arrptr;
364 } else arrptr = mapdata.ptrs[c];
365
366 // clear the array
367 for (i=0; i<256; i++) arrptr[i] = 0;
368
369 // read in this block
370 n1 = fgetc (mapfilein);
371 n2 = fgetc (mapfilein);
372 i=0;
373 while (!feof (mapfilein)) {
374 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
375
376 i++;
377 if (i >= 256) break;
378 n1 = fgetc (mapfilein);
379 n2 = fgetc (mapfilein);
380 }
381
382 c = fgetc (mapfilein);
383 }
384
385 mapdata.loaded = true;
386
387 return true;
388}
389
390void mapconvert::unloadmapfile () {
391 if (!mapdata.loaded) return;
392
393 int i;
394 for (i=0; i<256; i++) {
395 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
396 delete [] mapdata.ptrs[i];
397 mapdata.ptrs[i] = (unsigned short *)NULL;
398 }
399 }
400
401 mapdata.loaded = false;
402}
403
404
405unsigned short mapconvert::convert (unsigned short c) {
406 if (!mapdata.loaded) {
407 if (!gsdlhome.empty() && !encoding.empty() &&
408 loadmapfile (gsdlhome, encoding, absentc)) {
409 // do nothing, successfully loaded database
410 } else return absentc;
411 }
412
413 if (c == 0) return 0; // 0 always maps to 0...
414
415 unsigned short n1 = c >> 8;
416 unsigned short n2 = c & 0xff;
417
418 unsigned short *arrptr = mapdata.ptrs[n1];
419 if (arrptr == (unsigned short *)NULL) return absentc;
420
421 if (arrptr[n2] == 0) return absentc;
422 return arrptr[n2];
423}
424
425text_t mapconvert::convert (const text_t &instr) {
426 if (!mapdata.loaded) return absentc;
427
428 text_t outstr;
429 text_t::const_iterator here = instr.begin();
430 text_t::const_iterator end = instr.end();
431
432 while (here != end) {
433 outstr.push_back(this->convert(*here));
434 here++;
435 }
436
437 return outstr;
438}
439
440
441
442
443mapinconvertclass::mapinconvertclass () {
444 mapbuflen = 0;
445}
446
447void mapinconvertclass::reset () {
448 start = NULL;
449 len = 0;
450 mapbuflen=0;
451}
452
453void mapinconvertclass::convert (text_t &output, status_t &status) {
454 output.clear();
455
456 if (start == NULL || len == 0) {
457 if (mapbuflen == 0) status = finished;
458 else status = stopped;
459 return;
460 }
461
462 // don't want any funny sign conversions happening
463 unsigned char *here = (unsigned char *)start;
464
465 size_t charlen = getmapcharlen ();
466 while (len > 0) {
467 if (charlen == 0) {
468 // start parsing a new character
469 mapbuflen = 0;
470 mapbuf[mapbuflen++] = *here;
471 ++here;
472 --len;
473 charlen = getmapcharlen ();
474
475 } else if (mapbuflen < charlen) {
476 // assumes charlen is always less than MAXMAPCHARLEN
477 mapbuf[mapbuflen++] = *here;
478 ++here;
479 --len;
480 }
481
482 if (mapbuflen == charlen) {
483 // got a complete character
484 if (charlen == 1) {
485 // ascii character
486 output.push_back (mapbuf[0]);
487
488 } else {
489 // two byte character
490 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
491 (unsigned short)mapbuf[1]));
492 }
493
494 mapbuflen = 0;
495 charlen = 0;
496 }
497 }
498
499 start = (char *)here; // save current position
500
501 if (mapbuflen == 0) status = finished;
502 else status = stopped;
503}
504
505
506
507mapoutconvertclass::mapoutconvertclass () {
508 mapbuflen=0;
509 mapbufhere=0;
510}
511
512void mapoutconvertclass::reset () {
513 input = NULL;
514 outs = NULL;
515 mapbuflen = 0;
516 mapbufhere = 0;
517}
518
519// note that convert does not null-terminate the
520// output array of characters
521void mapoutconvertclass::convert (char *output, size_t maxlen,
522 size_t &len, status_t &status) {
523 unsigned short outc;
524
525 if (input == NULL || output == NULL) {
526 if (mapbuflen == 0) status = finished;
527 else status = unfinished;
528 return;
529 }
530
531 // don't want any funny sign conversions happening
532 unsigned char *uoutput = (unsigned char *)output;
533 text_t::iterator textend = input->end();
534 len = 0;
535 while (len < maxlen) {
536 // empty the contents of the internal buffer
537 if (mapbuflen > 0) {
538 while (len < maxlen && mapbufhere < mapbuflen) {
539 *uoutput = mapbuf[mapbufhere];
540 uoutput++;
541 len++;
542 mapbufhere++;
543 }
544
545 if (mapbufhere == mapbuflen) {
546 mapbufhere = 0;
547 mapbuflen = 0;
548 }
549 }
550
551 // fill up the buffer with the next character
552 if (mapbuflen == 0) {
553 if (texthere == textend) break; // finished!
554 if (!rzws || (*texthere != 0x200b)) {
555 if (*texthere < 0x80) {
556 mapbuf[0] = (unsigned char)*texthere;
557 mapbuflen = 1;
558 } else {
559 outc = converter.convert (*texthere);
560 mapbuf[0] = (unsigned char)(outc >> 8);
561 mapbuf[1] = (unsigned char)(outc & 0xff);
562 mapbuflen = 2;
563 }
564 }
565
566 texthere++;
567 mapbufhere = 0;
568 }
569 }
570
571 if (texthere == textend && mapbuflen == 0) status = finished;
572 else status = unfinished;
573}
574
575
576bool simplemapconvert::loadmapfile (bool in) {
577 if (loaded) return true;
578 if (mapfile.empty()) return false;
579
580 char *cfilename = mapfile.getcstr();
581#ifdef GSDL_USE_IOS_H
582 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
583#else
584 ifstream mapfilein (cfilename, ios::in);
585#endif
586 delete cfilename;
587 if (!mapfilein) return false;
588
589 char cline[2048];
590 text_t line;
591
592 while (!mapfilein.eof()) {
593 mapfilein.getline (cline, 2048);
594 line.clear();
595 line.appendcstr (cline);
596 if (line.empty()) continue;
597 // remove comments
598 text_t::iterator end = line.end();
599 text_t::iterator here = findchar (line.begin(), end, '#');
600 if (here != end) {
601 line.erase (here, end);
602 if (line.empty()) continue;
603 }
604
605 text_tarray parts;
606 splitchar (line.begin(), line.end(), '\t', parts);
607
608 // do some simple sanity checks
609 if (parts.size() < 2) continue;
610 text_t::iterator begin1 = parts[0].begin();
611 text_t::iterator begin2 = parts[1].begin();
612 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
613 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
614 char *from = parts[0].getcstr();
615 char *to = parts[1].getcstr();
616 unsigned int f = 0, t = 0;
617 sscanf (from, "%i", &f);
618 sscanf (to, "%i", &t);
619 delete from;
620 delete to;
621
622 if (in) mapping[(unsigned short)f] = (unsigned short)t;
623 else mapping[(unsigned short)t] = (unsigned short)f;
624 }
625
626 loaded = true;
627 return true;
628}
629
630unsigned short simplemapconvert::convert (unsigned short c, bool in) {
631
632 if (!loaded)
633 if (!loadmapfile(in)) return absentc;
634
635 return mapping[c];
636}
637
638
639void simplemapinconvertclass::convert (text_t &output, status_t &status) {
640 output.clear();
641
642 if (start == NULL || len == 0) {
643 status = finished;
644 return;
645 }
646
647 // don't want any funny sign conversions happening
648 unsigned char *here = (unsigned char *)start;
649 while (len > 0) {
650
651 if (*here < 0x80)
652 output.push_back (*here); // append this character
653 else
654 output.push_back (converter.convert(*here, true));
655
656 ++here;
657 --len;
658 }
659
660 start = (char *)here; // save current position
661 status = finished;
662}
663
664
665void simplemapoutconvertclass::convert (char *output, size_t maxlen,
666 size_t &len, status_t &status) {
667
668 if (input == NULL || output == NULL) {
669 status = finished;
670 return;
671 }
672
673 // don't want any funny sign conversions happening
674 unsigned char *uoutput = (unsigned char *)output;
675 text_t::iterator textend = input->end();
676 len = 0;
677 while ((len < maxlen) && (texthere != textend)) {
678
679 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
680 else *uoutput = converter.convert (*texthere, false);
681
682 ++uoutput;
683 ++len;
684 ++texthere;
685 }
686
687 if (texthere == textend) status = finished;
688 else status = unfinished;
689}
Note: See TracBrowser for help on using the repository browser.