source: trunk/gsdl/lib/gsdlunicode.cpp@ 1076

Last change on this file since 1076 was 1076, checked in by cs025, 24 years ago

Correcting a correction - reinstated all lib files due to silly
CVS confusion.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: gsdlunicode.cpp 1076 2000-04-06 19:58:04Z cs025 $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.12 2000/04/06 19:58:02 cs025
31 Correcting a correction - reinstated all lib files due to silly
32 CVS confusion.
33
34 Revision 1.10 1999/09/07 04:57:43 sjboddie
35 added gpl notice
36
37 Revision 1.9 1999/07/21 07:23:17 rjmcnab
38 Added setmapfile function to map conversion utilities so the map file
39 does not need to be loaded when map conversion object is created.
40
41 Revision 1.8 1999/07/01 04:03:45 rjmcnab
42 Optimised utf8inconvertclass::convert slightly.
43
44 Revision 1.7 1999/06/30 04:59:03 rjmcnab
45 Added a to_utf8 function that takes iterators as input.
46
47 Revision 1.6 1999/06/26 01:05:04 rjmcnab
48 No real changes.
49
50 Revision 1.5 1999/01/12 01:50:59 rjmcnab
51
52 Standard header.
53
54 Revision 1.4 1999/01/08 02:33:15 rjmcnab
55
56 Added standard header to source files.
57
58 */
59
60
61#include "gsdlunicode.h"
62
63
64// unitool is currently in mg, if mg is not being used it should
65// be moved into GSDLHOME/lib
66#include "unitool.h"
67
68#include "fileutil.h"
69
70#include <stdio.h>
71
72
73
74// converts a unicode encode text_t string to a utf-8
75// encoded text_t string
76text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
77 text_t out;
78
79 unsigned char thischar[MAXUTF8CHARLEN];
80 int i, charlen;
81
82 while (here != end) {
83 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
84 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
85 here++;
86 }
87
88 return out;
89}
90
91// converts a utf-8 encoded text_t string to a unicode
92// encoded text_t string
93text_t to_uni (const text_t &in) {
94 text_t out;
95 unsigned char *in_cstr = (unsigned char *)in.getcstr();
96 unsigned char *here = in_cstr;
97 unsigned char *end = in_cstr;
98
99 unsigned short unichar;
100 int charlen = 0;
101
102 // get the last valid character in the string
103 while (*end != '\0') end++;
104 end--;
105
106 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
107 out.push_back(unichar);
108 here += charlen;
109 }
110
111 delete in_cstr;
112
113 return out;
114}
115
116
117
118utf8inconvertclass::utf8inconvertclass () {
119 utf8buflen = 0;
120}
121
122void utf8inconvertclass::reset () {
123 start = NULL;
124 len = 0;
125 utf8buflen=0;
126}
127
128void utf8inconvertclass::convert (text_t &output, status_t &status) {
129 output.clear();
130 output.reserve (len/3);
131
132 if (start == NULL || len == 0) {
133 if (utf8buflen == 0) status = finished;
134 else status = stopped;
135 return;
136 }
137
138 // don't want any funny sign conversions happening
139 unsigned char *here = (unsigned char *)start;
140 unsigned char *end = here+len-1;
141 unsigned short c;
142 size_t realcharlen;
143
144 size_t charlen = getutf8charlen ();
145 while (len > 0) {
146 if (charlen == 0) {
147 // start parsing a new character
148 utf8buflen = 0;
149
150 // fast common case
151 while (len > 3) {
152 realcharlen = parse_utf8_char (here, end, &c);
153 output.push_back (c);
154 here += realcharlen;
155 len -= realcharlen;
156 }
157
158 utf8buf[utf8buflen++] = *here;
159 ++here;
160 --len;
161 charlen = getutf8charlen ();
162
163 } else if (utf8buflen < charlen) {
164 // assumes charlen is always less than MAXUTF8CHARLEN
165 utf8buf[utf8buflen++] = *here;
166 ++here;
167 --len;
168 }
169
170 if (utf8buflen == charlen) {
171 // got a complete character
172 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
173 output.push_back (c);
174
175 // move any unparsed characters. If an error occurred some of
176 // the characters might be unused.
177 int i;
178 int diff = utf8buflen - realcharlen;
179 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
180 utf8buflen = diff;
181 charlen = getutf8charlen ();
182 }
183 }
184
185 start = (char *)here; // save current position
186
187 if (utf8buflen == 0) status = finished;
188 else status = stopped;
189}
190
191
192// returns the length that the current contents of the
193// utf8buf should be
194size_t utf8inconvertclass::getutf8charlen () {
195 if (utf8buflen == 0) return 0;
196
197 // one byte character
198 if (utf8buf[0] < 0x80) return 1;
199
200 // error, is not the start of a utf-8 character
201 if (utf8buf[0] < 0xc0) return 1;
202
203 // two bute character
204 if (utf8buf[0] < 0xe0) return 2;
205
206 // three byte character
207 if (utf8buf[0] < 0xf0) return 3;
208
209 // error, character too long for unicode
210 return 1;
211}
212
213
214void utf8outconvertclass::reset () {
215 input = NULL;
216 outs = NULL;
217 utf8buflen = 0;
218 utf8bufhere = 0;
219}
220
221// note that convert does not null-terminate the
222// output array of characters
223void utf8outconvertclass::convert (char *output, size_t maxlen,
224 size_t &len, status_t &status) {
225 if (input == NULL || output == NULL) {
226 if (utf8buflen == 0) status = finished;
227 else status = unfinished;
228 return;
229 }
230
231 // don't want any funny sign conversions happening
232 unsigned char *uoutput = (unsigned char *)output;
233 text_t::iterator textend = input->end();
234 len = 0;
235 while (len < maxlen) {
236 // empty the contents of the internal buffer
237 if (utf8buflen > 0) {
238 while (len < maxlen && utf8bufhere < utf8buflen) {
239 *uoutput = utf8buf[utf8bufhere];
240 uoutput++;
241 len++;
242 utf8bufhere++;
243 }
244
245 if (utf8bufhere == utf8buflen) {
246 utf8bufhere = 0;
247 utf8buflen = 0;
248 }
249 }
250
251 // fill up the buffer with the next character
252 if (utf8buflen == 0) {
253 if (texthere == textend) break; // finished!
254 if (!rzws || (*texthere != 0x200b))
255 utf8buflen = output_utf8_char (*texthere, utf8buf,
256 &utf8buf[MAXUTF8CHARLEN-1]);
257 texthere++;
258 utf8bufhere = 0;
259 }
260 }
261
262 if (texthere == textend && utf8buflen == 0) status = finished;
263 else status = unfinished;
264}
265
266
267
268
269
270
271mapdata_t::mapdata_t () {
272 int i;
273
274 // reset all the map ptrs to be NULL
275 for (i=0; i<256; i++) {
276 ptrs[i] = (unsigned short *)NULL;
277 }
278
279 // say nothing has been loaded
280 loaded = false;
281}
282
283
284mapconvert::mapconvert () {
285 absentc = 0;
286}
287
288// setmapfile will cause loadmapfile to be called when conversion is
289// needed
290bool mapconvert::setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
291 unsigned short theabsentc) {
292 // check to see if the mapfile has been already loaded
293 if (mapdata.loaded && gsdlhome == thegsdlhome &&
294 encoding == theencoding && absentc == theabsentc)
295 return true;
296
297 unloadmapfile ();
298 gsdlhome = thegsdlhome;
299 encoding = theencoding;
300 absentc = theabsentc;
301
302 return true;
303}
304
305
306
307// loadmapfile should be called before any conversion is done
308bool mapconvert::loadmapfile (const text_t &thegsdlhome,
309 const text_t &theencoding,
310 unsigned short theabsentc) {
311 FILE *mapfilein = (FILE *)NULL;
312
313 // check to see if the mapfile has been already loaded
314 if (mapdata.loaded && gsdlhome == thegsdlhome &&
315 encoding == theencoding && absentc == theabsentc)
316 return true;
317
318 unloadmapfile ();
319 gsdlhome = thegsdlhome;
320 encoding = theencoding;
321 absentc = theabsentc;
322
323 // open the map file
324 text_t filename = filename_cat (gsdlhome, "unicode");
325 filename = filename_cat (filename, encoding);
326 filename += ".ump";
327 char *cfilename = filename.getcstr();
328 if (cfilename == (char *)NULL) return false;
329 mapfilein = fopen(cfilename, "rb");
330 delete cfilename;
331
332 if (mapfilein == (FILE *)NULL) return false;
333
334 unsigned char c, n1, n2;
335 unsigned short *arrptr;
336 int i;
337 c = fgetc (mapfilein);
338 while (!feof (mapfilein)) {
339 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
340 // allocate a new array
341 arrptr = new unsigned short[256];
342 mapdata.ptrs[c] = arrptr;
343 } else arrptr = mapdata.ptrs[c];
344
345 // clear the array
346 for (i=0; i<256; i++) arrptr[i] = 0;
347
348 // read in this block
349 n1 = fgetc (mapfilein);
350 n2 = fgetc (mapfilein);
351 i=0;
352 while (!feof (mapfilein)) {
353 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
354
355 i++;
356 if (i >= 256) break;
357 n1 = fgetc (mapfilein);
358 n2 = fgetc (mapfilein);
359 }
360
361 c = fgetc (mapfilein);
362 }
363
364 mapdata.loaded = true;
365
366 return true;
367}
368
369void mapconvert::unloadmapfile () {
370 if (!mapdata.loaded) return;
371
372 int i;
373 for (i=0; i<256; i++) {
374 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
375 delete [] mapdata.ptrs[i];
376 mapdata.ptrs[i] = (unsigned short *)NULL;
377 }
378 }
379
380 mapdata.loaded = false;
381}
382
383
384unsigned short mapconvert::convert (unsigned short c) {
385 if (!mapdata.loaded) {
386 if (!gsdlhome.empty() && !encoding.empty() &&
387 loadmapfile (gsdlhome, encoding, absentc)) {
388 // do nothing, successfully loaded database
389 } else return absentc;
390 }
391
392 if (c == 0) return 0; // 0 always maps to 0...
393
394 unsigned short n1 = c >> 8;
395 unsigned short n2 = c & 0xff;
396
397 unsigned short *arrptr = mapdata.ptrs[n1];
398 if (arrptr == (unsigned short *)NULL) return absentc;
399
400 if (arrptr[n2] == 0) return absentc;
401 return arrptr[n2];
402}
403
404text_t mapconvert::convert (const text_t &instr) {
405 if (!mapdata.loaded) return absentc;
406
407 text_t outstr;
408 text_t::const_iterator here = instr.begin();
409 text_t::const_iterator end = instr.end();
410
411 while (here != end) {
412 outstr.push_back(this->convert(*here));
413 here++;
414 }
415
416 return outstr;
417}
418
419
420
421
422mapinconvertclass::mapinconvertclass () {
423 mapbuflen = 0;
424}
425
426void mapinconvertclass::reset () {
427 start = NULL;
428 len = 0;
429 mapbuflen=0;
430}
431
432void mapinconvertclass::convert (text_t &output, status_t &status) {
433 output.clear();
434
435 if (start == NULL || len == 0) {
436 if (mapbuflen == 0) status = finished;
437 else status = stopped;
438 return;
439 }
440
441 // don't want any funny sign conversions happening
442 unsigned char *here = (unsigned char *)start;
443
444 size_t charlen = getmapcharlen ();
445 while (len > 0) {
446 if (charlen == 0) {
447 // start parsing a new character
448 mapbuflen = 0;
449 mapbuf[mapbuflen++] = *here;
450 ++here;
451 --len;
452 charlen = getmapcharlen ();
453
454 } else if (mapbuflen < charlen) {
455 // assumes charlen is always less than MAXMAPCHARLEN
456 mapbuf[mapbuflen++] = *here;
457 ++here;
458 --len;
459 }
460
461 if (mapbuflen == charlen) {
462 // got a complete character
463 if (charlen == 1) {
464 // ascii character
465 output.push_back (mapbuf[0]);
466
467 } else {
468 // two byte character
469 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
470 (unsigned short)mapbuf[1]));
471 }
472
473 mapbuflen = 0;
474 charlen = 0;
475 }
476 }
477
478 start = (char *)here; // save current position
479
480 if (mapbuflen == 0) status = finished;
481 else status = stopped;
482}
483
484
485
486mapoutconvertclass::mapoutconvertclass () {
487 mapbuflen=0;
488 mapbufhere=0;
489}
490
491void mapoutconvertclass::reset () {
492 input = NULL;
493 outs = NULL;
494 mapbuflen = 0;
495 mapbufhere = 0;
496}
497
498// note that convert does not null-terminate the
499// output array of characters
500void mapoutconvertclass::convert (char *output, size_t maxlen,
501 size_t &len, status_t &status) {
502 unsigned short outc;
503
504 if (input == NULL || output == NULL) {
505 if (mapbuflen == 0) status = finished;
506 else status = unfinished;
507 return;
508 }
509
510 // don't want any funny sign conversions happening
511 unsigned char *uoutput = (unsigned char *)output;
512 text_t::iterator textend = input->end();
513 len = 0;
514 while (len < maxlen) {
515 // empty the contents of the internal buffer
516 if (mapbuflen > 0) {
517 while (len < maxlen && mapbufhere < mapbuflen) {
518 *uoutput = mapbuf[mapbufhere];
519 uoutput++;
520 len++;
521 mapbufhere++;
522 }
523
524 if (mapbufhere == mapbuflen) {
525 mapbufhere = 0;
526 mapbuflen = 0;
527 }
528 }
529
530 // fill up the buffer with the next character
531 if (mapbuflen == 0) {
532 if (texthere == textend) break; // finished!
533 if (!rzws || (*texthere != 0x200b)) {
534 if (*texthere < 0x80) {
535 mapbuf[0] = (unsigned char)*texthere;
536 mapbuflen = 1;
537 } else {
538 outc = converter.convert (*texthere);
539 mapbuf[0] = (unsigned char)(outc >> 8);
540 mapbuf[1] = (unsigned char)(outc & 0xff);
541 mapbuflen = 2;
542 }
543 }
544
545 texthere++;
546 mapbufhere = 0;
547 }
548 }
549
550 if (texthere == textend && mapbuflen == 0) status = finished;
551 else status = unfinished;
552}
Note: See TracBrowser for help on using the repository browser.