source: trunk/gsdl/lib/gsdlunicode.cpp@ 115

Last change on this file since 115 was 111, checked in by rjmcnab, 25 years ago

Standard header.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.4 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: gsdlunicode.cpp 111 1999-01-12 01:51:02Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.5 1999/01/12 01:50:59 rjmcnab
15 Standard header.
16
17 Revision 1.4 1999/01/08 02:33:15 rjmcnab
18
19 Added standard header to source files.
20
21 */
22
23
24#include "gsdlunicode.h"
25
26
27// unitool is currently in mg, if mg is not being used it should
28// be moved into GSDLHOME/lib
29#include "unitool.h"
30
31#include "fileutil.h"
32
33#include <stdio.h>
34
35
36
37// converts a unicode encode text_t string to a utf-8
38// encoded text_t string
39text_t to_utf8 (const text_t &in) {
40 text_t::const_iterator here = in.begin();
41 text_t::const_iterator end = in.end();
42 text_t out;
43
44 unsigned char thischar[MAXUTF8CHARLEN];
45 int i, charlen;
46
47 while (here != end) {
48 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
49 for (i=0; i<charlen; i++) out.push_back(thischar[i]);
50 here++;
51 }
52
53 return out;
54}
55
56// converts a utf-8 encoded text_t string to a unicode
57// encoded text_t string
58text_t to_uni (const text_t &in) {
59 text_t out;
60 unsigned char *in_cstr = (unsigned char *)in.getcstr();
61 unsigned char *here = in_cstr;
62 unsigned char *end = in_cstr;
63
64 unsigned short unichar;
65 int charlen = 0;
66
67 // get the last valid character in the string
68 while (*end != '\0') end++;
69 end--;
70
71 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
72 out.push_back(unichar);
73 here += charlen;
74 }
75
76 delete in_cstr;
77
78 return out;
79}
80
81
82
83utf8inconvertclass::utf8inconvertclass () {
84 utf8buflen = 0;
85}
86
87void utf8inconvertclass::reset () {
88 start = NULL;
89 len = 0;
90 utf8buflen=0;
91}
92
93void utf8inconvertclass::convert (text_t &output, status_t &status) {
94 output.clear();
95
96 if (start == NULL || len == 0) {
97 if (utf8buflen == 0) status = finished;
98 else status = stopped;
99 return;
100 }
101
102 // don't want any funny sign conversions happening
103 unsigned char *here = (unsigned char *)start;
104
105 size_t charlen = getutf8charlen ();
106 unsigned short c;
107 size_t realcharlen;
108 while (len > 0) {
109 if (charlen == 0) {
110 // start parsing a new character
111 utf8buflen = 0;
112 utf8buf[utf8buflen++] = *here;
113 ++here;
114 --len;
115 charlen = getutf8charlen ();
116
117 } else if (utf8buflen < charlen) {
118 // assumes charlen is always less than MAXUTF8CHARLEN
119 utf8buf[utf8buflen++] = *here;
120 ++here;
121 --len;
122 }
123
124 if (utf8buflen == charlen) {
125 // got a complete character
126 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
127 output.push_back (c);
128
129 // move any unparsed characters. If an error occurred some of
130 // the characters might be unused.
131 int i;
132 int diff = utf8buflen - realcharlen;
133 for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
134 utf8buflen = diff;
135 charlen = getutf8charlen ();
136 }
137 }
138
139 start = (char *)here; // save current position
140
141 if (utf8buflen == 0) status = finished;
142 else status = stopped;
143}
144
145
146// returns the length that the current contents of the
147// utf8buf should be
148size_t utf8inconvertclass::getutf8charlen () {
149 if (utf8buflen == 0) return 0;
150
151 // one byte character
152 if (utf8buf[0] < 0x80) return 1;
153
154 // error, is not the start of a utf-8 character
155 if (utf8buf[0] < 0xc0) return 1;
156
157 // two bute character
158 if (utf8buf[0] < 0xe0) return 2;
159
160 // three byte character
161 if (utf8buf[0] < 0xf0) return 3;
162
163 // error, character too long for unicode
164 return 1;
165}
166
167
168void utf8outconvertclass::reset () {
169 input = NULL;
170 outs = NULL;
171 utf8buflen = 0;
172 utf8bufhere = 0;
173}
174
175// note that convert does not null-terminate the
176// output array of characters
177void utf8outconvertclass::convert (char *output, size_t maxlen,
178 size_t &len, status_t &status) {
179 if (input == NULL || output == NULL) {
180 if (utf8buflen == 0) status = finished;
181 else status = unfinished;
182 return;
183 }
184
185 // don't want any funny sign conversions happening
186 unsigned char *uoutput = (unsigned char *)output;
187 text_t::iterator textend = input->end();
188 len = 0;
189 while (len < maxlen) {
190 // empty the contents of the internal buffer
191 if (utf8buflen > 0) {
192 while (len < maxlen && utf8bufhere < utf8buflen) {
193 *uoutput = utf8buf[utf8bufhere];
194 uoutput++;
195 len++;
196 utf8bufhere++;
197 }
198
199 if (utf8bufhere == utf8buflen) {
200 utf8bufhere = 0;
201 utf8buflen = 0;
202 }
203 }
204
205 // fill up the buffer with the next character
206 if (utf8buflen == 0) {
207 if (texthere == textend) break; // finished!
208 if (!rzws || (*texthere != 0x200b))
209 utf8buflen = output_utf8_char (*texthere, utf8buf,
210 &utf8buf[MAXUTF8CHARLEN-1]);
211 texthere++;
212 utf8bufhere = 0;
213 }
214 }
215
216 if (texthere == textend && utf8buflen == 0) status = finished;
217 else status = unfinished;
218}
219
220
221
222
223
224
225mapdata_t::mapdata_t () {
226 int i;
227
228 // reset all the map ptrs to be NULL
229 for (i=0; i<256; i++) {
230 ptrs[i] = (unsigned short *)NULL;
231 }
232
233 // say nothing has been loaded
234 loaded = false;
235}
236
237
238mapconvert::mapconvert () {
239 absentc = 0;
240}
241
242
243// loadmapfile should be called before any conversion is done
244bool mapconvert::loadmapfile (const text_t &thegsdlhome,
245 const text_t &theencoding,
246 unsigned short theabsentc) {
247 FILE *mapfilein = (FILE *)NULL;
248
249 // check to see if the mapfile has been already loaded
250 if (mapdata.loaded && gsdlhome == thegsdlhome &&
251 encoding == theencoding && absentc == theabsentc)
252 return true;
253
254 unloadmapfile ();
255 gsdlhome = thegsdlhome;
256 encoding = theencoding;
257 absentc = theabsentc;
258
259 // open the map file
260 text_t filename = filename_cat (gsdlhome, "unicode");
261 filename = filename_cat (filename, encoding);
262 filename += ".ump";
263 char *cfilename = filename.getcstr();
264 if (cfilename == (char *)NULL) return false;
265 mapfilein = fopen(cfilename, "rb");
266 delete cfilename;
267
268 if (mapfilein == (FILE *)NULL) return false;
269
270 unsigned char c, n1, n2;
271 unsigned short *arrptr;
272 int i;
273 c = fgetc (mapfilein);
274 while (!feof (mapfilein)) {
275 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
276 // allocate a new array
277 arrptr = new unsigned short[256];
278 mapdata.ptrs[c] = arrptr;
279 } else arrptr = mapdata.ptrs[c];
280
281 // clear the array
282 for (i=0; i<256; i++) arrptr[i] = 0;
283
284 // read in this block
285 n1 = fgetc (mapfilein);
286 n2 = fgetc (mapfilein);
287 i=0;
288 while (!feof (mapfilein)) {
289 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
290
291 i++;
292 if (i >= 256) break;
293 n1 = fgetc (mapfilein);
294 n2 = fgetc (mapfilein);
295 }
296
297 c = fgetc (mapfilein);
298 }
299
300 mapdata.loaded = true;
301
302 return true;
303}
304
305void mapconvert::unloadmapfile () {
306 if (!mapdata.loaded) return;
307
308 int i;
309 for (i=0; i<256; i++) {
310 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
311 delete [] mapdata.ptrs[i];
312 mapdata.ptrs[i] = (unsigned short *)NULL;
313 }
314 }
315
316 mapdata.loaded = false;
317}
318
319
320unsigned short mapconvert::convert (unsigned short c) {
321 if (!mapdata.loaded) return absentc;
322
323 if (c == 0) return 0; // 0 always maps to 0...
324
325 unsigned short n1 = c >> 8;
326 unsigned short n2 = c & 0xff;
327
328 unsigned short *arrptr = mapdata.ptrs[n1];
329 if (arrptr == (unsigned short *)NULL) return absentc;
330
331 if (arrptr[n2] == 0) return absentc;
332 return arrptr[n2];
333}
334
335text_t mapconvert::convert (const text_t &instr) {
336 if (!mapdata.loaded) return absentc;
337
338 text_t outstr;
339 text_t::const_iterator here = instr.begin();
340 text_t::const_iterator end = instr.end();
341
342 while (here != end) {
343 outstr.push_back(this->convert(*here));
344 here++;
345 }
346
347 return outstr;
348}
349
350
351
352
353mapinconvertclass::mapinconvertclass () {
354 mapbuflen = 0;
355}
356
357void mapinconvertclass::reset () {
358 start = NULL;
359 len = 0;
360 mapbuflen=0;
361}
362
363void mapinconvertclass::convert (text_t &output, status_t &status) {
364 output.clear();
365
366 if (start == NULL || len == 0) {
367 if (mapbuflen == 0) status = finished;
368 else status = stopped;
369 return;
370 }
371
372 // don't want any funny sign conversions happening
373 unsigned char *here = (unsigned char *)start;
374
375 size_t charlen = getmapcharlen ();
376 while (len > 0) {
377 if (charlen == 0) {
378 // start parsing a new character
379 mapbuflen = 0;
380 mapbuf[mapbuflen++] = *here;
381 ++here;
382 --len;
383 charlen = getmapcharlen ();
384
385 } else if (mapbuflen < charlen) {
386 // assumes charlen is always less than MAXMAPCHARLEN
387 mapbuf[mapbuflen++] = *here;
388 ++here;
389 --len;
390 }
391
392 if (mapbuflen == charlen) {
393 // got a complete character
394 if (charlen == 1) {
395 // ascii character
396 output.push_back (mapbuf[0]);
397
398 } else {
399 // two byte character
400 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
401 (unsigned short)mapbuf[1]));
402 }
403
404 mapbuflen = 0;
405 charlen = 0;
406 }
407 }
408
409 start = (char *)here; // save current position
410
411 if (mapbuflen == 0) status = finished;
412 else status = stopped;
413}
414
415
416
417mapoutconvertclass::mapoutconvertclass () {
418 mapbuflen=0;
419 mapbufhere=0;
420}
421
422void mapoutconvertclass::reset () {
423 input = NULL;
424 outs = NULL;
425 mapbuflen = 0;
426 mapbufhere = 0;
427}
428
429// note that convert does not null-terminate the
430// output array of characters
431void mapoutconvertclass::convert (char *output, size_t maxlen,
432 size_t &len, status_t &status) {
433 unsigned short outc;
434
435 if (input == NULL || output == NULL) {
436 if (mapbuflen == 0) status = finished;
437 else status = unfinished;
438 return;
439 }
440
441 // don't want any funny sign conversions happening
442 unsigned char *uoutput = (unsigned char *)output;
443 text_t::iterator textend = input->end();
444 len = 0;
445 while (len < maxlen) {
446 // empty the contents of the internal buffer
447 if (mapbuflen > 0) {
448 while (len < maxlen && mapbufhere < mapbuflen) {
449 *uoutput = mapbuf[mapbufhere];
450 uoutput++;
451 len++;
452 mapbufhere++;
453 }
454
455 if (mapbufhere == mapbuflen) {
456 mapbufhere = 0;
457 mapbuflen = 0;
458 }
459 }
460
461 // fill up the buffer with the next character
462 if (mapbuflen == 0) {
463 if (texthere == textend) break; // finished!
464 if (!rzws || (*texthere != 0x200b)) {
465 if (*texthere < 0x80) {
466 mapbuf[0] = (unsigned char)*texthere;
467 mapbuflen = 1;
468 } else {
469 outc = converter.convert (*texthere);
470 mapbuf[0] = (unsigned char)(outc >> 8);
471 mapbuf[1] = (unsigned char)(outc & 0xff);
472 mapbuflen = 2;
473 }
474 }
475
476 texthere++;
477 mapbufhere = 0;
478 }
479 }
480
481 if (texthere == textend && mapbuflen == 0) status = finished;
482 else status = unfinished;
483}
Note: See TracBrowser for help on using the repository browser.