source: trunk/gsdl/lib/gsdlunicode.cpp@ 100

Last change on this file since 100 was 100, checked in by rjmcnab, 25 years ago

Added standard header to source files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: gsdlunicode.cpp 100 1999-01-08 02:33:16Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.4 1999/01/08 02:33:15 rjmcnab
15
16 Added standard header to source files.
17
18 */
19
// RCS identification string for this source file. Declared as a pointer
// to const because it is bound to a string literal.
static const char *RCSID = "$Id: gsdlunicode.cpp 100 1999-01-08 02:33:16Z rjmcnab $";
21
22
23#include "gsdlunicode.h"
24
25
26// unitool is currently in mg, if mg is not being used it should
27// be moved into GSDLHOME/lib
28#include "unitool.h"
29
30#include "fileutil.h"
31
32#include <stdio.h>
33
34
35
36
37
38
39
40
41// converts a unicode encode text_t string to a utf-8
42// encoded text_t string
43text_t to_utf8 (const text_t &in) {
44 text_t::const_iterator here = in.begin();
45 text_t::const_iterator end = in.end();
46 text_t out;
47
48 unsigned char thischar[MAXUTF8CHARLEN];
49 int i, charlen;
50
51 while (here != end) {
52 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
53 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
54 here++;
55 }
56
57 return out;
58}
59
60// converts a utf-8 encoded text_t string to a unicode
61// encoded text_t string
62text_t to_uni (const text_t &in) {
63 text_t out;
64 unsigned char *in_cstr = (unsigned char *)in.getcstr();
65 unsigned char *here = in_cstr;
66 unsigned char *end = in_cstr;
67
68 unsigned short unichar;
69 int charlen = 0;
70
71 // get the last valid character in the string
72 while (*end != '\0') end++;
73 end--;
74
75 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
76 out.push_back(unichar);
77 here += charlen;
78 }
79
80 delete in_cstr;
81
82 return out;
83}
84
85
86
87utf8inconvertclass::utf8inconvertclass () {
88 utf8buflen = 0;
89}
90
91void utf8inconvertclass::reset () {
92 start = NULL;
93 len = 0;
94 utf8buflen=0;
95}
96
97void utf8inconvertclass::convert (text_t &output, status_t &status) {
98 output.clear();
99
100 if (start == NULL || len == 0) {
101 if (utf8buflen == 0) status = finished;
102 else status = stopped;
103 return;
104 }
105
106 // don't want any funny sign conversions happening
107 unsigned char *here = (unsigned char *)start;
108
109 size_t charlen = getutf8charlen ();
110 unsigned short c;
111 size_t realcharlen;
112 while (len > 0) {
113 if (charlen == 0) {
114 // start parsing a new character
115 utf8buflen = 0;
116 utf8buf[utf8buflen++] = *here;
117 ++here;
118 --len;
119 charlen = getutf8charlen ();
120
121 } else if (utf8buflen < charlen) {
122 // assumes charlen is always less than MAXUTF8CHARLEN
123 utf8buf[utf8buflen++] = *here;
124 ++here;
125 --len;
126 }
127
128 if (utf8buflen == charlen) {
129 // got a complete character
130 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
131 output.push_back (c);
132
133 // move any unparsed characters. If an error occurred some of
134 // the characters might be unused.
135 int i;
136 int diff = utf8buflen - realcharlen;
137 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
138 utf8buflen = diff;
139 charlen = getutf8charlen ();
140 }
141 }
142
143 start = (char *)here; // save current position
144
145 if (utf8buflen == 0) status = finished;
146 else status = stopped;
147}
148
149
150// returns the length that the current contents of the
151// utf8buf should be
152size_t utf8inconvertclass::getutf8charlen () {
153 if (utf8buflen == 0) return 0;
154
155 // one byte character
156 if (utf8buf[0] < 0x80) return 1;
157
158 // error, is not the start of a utf-8 character
159 if (utf8buf[0] < 0xc0) return 1;
160
161 // two bute character
162 if (utf8buf[0] < 0xe0) return 2;
163
164 // three byte character
165 if (utf8buf[0] < 0xf0) return 3;
166
167 // error, character too long for unicode
168 return 1;
169}
170
171
172void utf8outconvertclass::reset () {
173 input = NULL;
174 outs = NULL;
175 utf8buflen = 0;
176 utf8bufhere = 0;
177}
178
179// note that convert does not null-terminate the
180// output array of characters
181void utf8outconvertclass::convert (char *output, size_t maxlen,
182 size_t &len, status_t &status) {
183 if (input == NULL || output == NULL) {
184 if (utf8buflen == 0) status = finished;
185 else status = unfinished;
186 return;
187 }
188
189 // don't want any funny sign conversions happening
190 unsigned char *uoutput = (unsigned char *)output;
191 text_t::iterator textend = input->end();
192 len = 0;
193 while (len < maxlen) {
194 // empty the contents of the internal buffer
195 if (utf8buflen > 0) {
196 while (len < maxlen && utf8bufhere < utf8buflen) {
197 *uoutput = utf8buf[utf8bufhere];
198 uoutput++;
199 len++;
200 utf8bufhere++;
201 }
202
203 if (utf8bufhere == utf8buflen) {
204 utf8bufhere = 0;
205 utf8buflen = 0;
206 }
207 }
208
209 // fill up the buffer with the next character
210 if (utf8buflen == 0) {
211 if (texthere == textend) break; // finished!
212 if (!rzws || (*texthere != 0x200b))
213 utf8buflen = output_utf8_char (*texthere, utf8buf,
214 &utf8buf[MAXUTF8CHARLEN-1]);
215 texthere++;
216 utf8bufhere = 0;
217 }
218 }
219
220 if (texthere == textend && utf8buflen == 0) status = finished;
221 else status = unfinished;
222}
223
224
225
226
227
228
229mapdata_t::mapdata_t () {
230 int i;
231
232 // reset all the map ptrs to be NULL
233 for (i=0; i<256; i++) {
234 ptrs[i] = (unsigned short *)NULL;
235 }
236
237 // say nothing has been loaded
238 loaded = false;
239}
240
241
242mapconvert::mapconvert () {
243 absentc = 0;
244}
245
246
247// loadmapfile should be called before any conversion is done
248bool mapconvert::loadmapfile (const text_t &thegsdlhome,
249 const text_t &theencoding,
250 unsigned short theabsentc) {
251 FILE *mapfilein = (FILE *)NULL;
252
253 // check to see if the mapfile has been already loaded
254 if (mapdata.loaded && gsdlhome == thegsdlhome &&
255 encoding == theencoding && absentc == theabsentc)
256 return true;
257
258 unloadmapfile ();
259 gsdlhome = thegsdlhome;
260 encoding = theencoding;
261 absentc = theabsentc;
262
263 // open the map file
264 text_t filename = filename_cat (gsdlhome, "unicode");
265 filename = filename_cat (filename, encoding);
266 filename += ".ump";
267 char *cfilename = filename.getcstr();
268 if (cfilename == (char *)NULL) return false;
269 mapfilein = fopen(cfilename, "rb");
270 delete cfilename;
271
272 if (mapfilein == (FILE *)NULL) return false;
273
274 unsigned char c, n1, n2;
275 unsigned short *arrptr;
276 int i;
277 c = fgetc (mapfilein);
278 while (!feof (mapfilein)) {
279 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
280 // allocate a new array
281 arrptr = new unsigned short[256];
282 mapdata.ptrs[c] = arrptr;
283 } else arrptr = mapdata.ptrs[c];
284
285 // clear the array
286 for (i=0; i<256; i++) arrptr[i] = 0;
287
288 // read in this block
289 n1 = fgetc (mapfilein);
290 n2 = fgetc (mapfilein);
291 i=0;
292 while (!feof (mapfilein)) {
293 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
294
295 i++;
296 if (i >= 256) break;
297 n1 = fgetc (mapfilein);
298 n2 = fgetc (mapfilein);
299 }
300
301 c = fgetc (mapfilein);
302 }
303
304 mapdata.loaded = true;
305
306 return true;
307}
308
309void mapconvert::unloadmapfile () {
310 if (!mapdata.loaded) return;
311
312 int i;
313 for (i=0; i<256; i++) {
314 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
315 delete [] mapdata.ptrs[i];
316 mapdata.ptrs[i] = (unsigned short *)NULL;
317 }
318 }
319
320 mapdata.loaded = false;
321}
322
323
324unsigned short mapconvert::convert (unsigned short c) {
325 if (!mapdata.loaded) return absentc;
326
327 if (c == 0) return 0; // 0 always maps to 0...
328
329 unsigned short n1 = c >> 8;
330 unsigned short n2 = c & 0xff;
331
332 unsigned short *arrptr = mapdata.ptrs[n1];
333 if (arrptr == (unsigned short *)NULL) return absentc;
334
335 if (arrptr[n2] == 0) return absentc;
336 return arrptr[n2];
337}
338
339text_t mapconvert::convert (const text_t &instr) {
340 if (!mapdata.loaded) return absentc;
341
342 text_t outstr;
343 text_t::const_iterator here = instr.begin();
344 text_t::const_iterator end = instr.end();
345
346 while (here != end) {
347 outstr.push_back(this->convert(*here));
348 here++;
349 }
350
351 return outstr;
352}
353
354
355
356
357mapinconvertclass::mapinconvertclass () {
358 mapbuflen = 0;
359}
360
361void mapinconvertclass::reset () {
362 start = NULL;
363 len = 0;
364 mapbuflen=0;
365}
366
367void mapinconvertclass::convert (text_t &output, status_t &status) {
368 output.clear();
369
370 if (start == NULL || len == 0) {
371 if (mapbuflen == 0) status = finished;
372 else status = stopped;
373 return;
374 }
375
376 // don't want any funny sign conversions happening
377 unsigned char *here = (unsigned char *)start;
378
379 size_t charlen = getmapcharlen ();
380 unsigned short c;
381 size_t realcharlen;
382 while (len > 0) {
383 if (charlen == 0) {
384 // start parsing a new character
385 mapbuflen = 0;
386 mapbuf[mapbuflen++] = *here;
387 ++here;
388 --len;
389 charlen = getmapcharlen ();
390
391 } else if (mapbuflen < charlen) {
392 // assumes charlen is always less than MAXMAPCHARLEN
393 mapbuf[mapbuflen++] = *here;
394 ++here;
395 --len;
396 }
397
398 if (mapbuflen == charlen) {
399 // got a complete character
400 if (charlen == 1) {
401 // ascii character
402 output.push_back (mapbuf[0]);
403
404 } else {
405 // two byte character
406 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
407 (unsigned short)mapbuf[1]));
408 }
409
410 mapbuflen = 0;
411 charlen = 0;
412 }
413 }
414
415 start = (char *)here; // save current position
416
417 if (mapbuflen == 0) status = finished;
418 else status = stopped;
419}
420
421
422
423mapoutconvertclass::mapoutconvertclass () {
424 mapbuflen=0;
425 mapbufhere=0;
426}
427
428void mapoutconvertclass::reset () {
429 input = NULL;
430 outs = NULL;
431 mapbuflen = 0;
432 mapbufhere = 0;
433}
434
435// note that convert does not null-terminate the
436// output array of characters
437void mapoutconvertclass::convert (char *output, size_t maxlen,
438 size_t &len, status_t &status) {
439 unsigned short outc;
440
441 if (input == NULL || output == NULL) {
442 if (mapbuflen == 0) status = finished;
443 else status = unfinished;
444 return;
445 }
446
447 // don't want any funny sign conversions happening
448 unsigned char *uoutput = (unsigned char *)output;
449 text_t::iterator textend = input->end();
450 len = 0;
451 while (len < maxlen) {
452 // empty the contents of the internal buffer
453 if (mapbuflen > 0) {
454 while (len < maxlen && mapbufhere < mapbuflen) {
455 *uoutput = mapbuf[mapbufhere];
456 uoutput++;
457 len++;
458 mapbufhere++;
459 }
460
461 if (mapbufhere == mapbuflen) {
462 mapbufhere = 0;
463 mapbuflen = 0;
464 }
465 }
466
467 // fill up the buffer with the next character
468 if (mapbuflen == 0) {
469 if (texthere == textend) break; // finished!
470 if (!rzws || (*texthere != 0x200b)) {
471 if (*texthere < 0x80) {
472 mapbuf[0] = (unsigned char)*texthere;
473 mapbuflen = 1;
474 } else {
475 outc = converter.convert (*texthere);
476 mapbuf[0] = (unsigned char)(outc >> 8);
477 mapbuf[1] = (unsigned char)(outc & 0xff);
478 mapbuflen = 2;
479 }
480 }
481
482 texthere++;
483 mapbufhere = 0;
484 }
485 }
486
487 if (texthere == textend && mapbuflen == 0) status = finished;
488 else status = unfinished;
489}
Note: See TracBrowser for help on using the repository browser.