root/trunk/gsdl/lib/gsdlunicode.cpp @ 1927

Revision 1927, 14.7 KB (checked in by sjboddie, 19 years ago)

Fixed a bug in the C++ encoding support - 8 bit encodings like windows-1251
were being treated as 16 bit encodings in some places

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
29// unitool is currently in mg, if mg is not being used it should
30// be moved into GSDLHOME/lib
31#include "unitool.h"
32
33#include "fileutil.h"
34
35#include <stdio.h>
36
37#if defined(GSDL_USE_OBJECTSPACE)
38#  include <ospace\std\iostream>
39#  include <ospace\std\fstream>
40#elif defined(GSDL_USE_IOS_H)
41#  include <iostream.h>
42#  include <fstream.h>
43#else
44#  include <iostream>
45#  include <fstream>
46#endif
47
48
49// converts a unicode encode text_t string to a utf-8
50// encoded text_t string
51text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
52  text_t out;
53
54  unsigned char thischar[MAXUTF8CHARLEN];
55  int i, charlen;
56
57  while (here != end) {
58    charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
59    for (i=0; i<charlen; i++) out.push_back(thischar[i]);
60    here++;
61  }
62
63  return out;
64}
65
66// converts a utf-8 encoded text_t string to a unicode
67// encoded text_t string
68text_t to_uni (const text_t &in) {
69  text_t out;
70  unsigned char *in_cstr = (unsigned char *)in.getcstr();
71  unsigned char *here = in_cstr;
72  unsigned char *end = in_cstr;
73
74  unsigned short unichar;
75  int charlen = 0;
76
77  // get the last valid character in the string
78  while (*end != '\0') end++;
79  end--;
80
81  while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
82    out.push_back(unichar);
83    here += charlen;
84  }
85
86  delete in_cstr;
87
88  return out;
89}
90
91
92
93utf8inconvertclass::utf8inconvertclass () {
94  utf8buflen = 0;
95}
96
97void utf8inconvertclass::reset () {
98  start = NULL;
99  len = 0;
100  utf8buflen=0;
101}
102
103void utf8inconvertclass::convert (text_t &output, status_t &status) {
104  output.clear();
105  output.reserve (len/3);
106 
107  if (start == NULL || len == 0) {
108    if (utf8buflen == 0) status = finished;
109    else status = stopped;
110    return;
111  }
112
113  // don't want any funny sign conversions happening
114  unsigned char *here = (unsigned char *)start;
115  unsigned char *end = here+len-1;
116  unsigned short c;
117  size_t realcharlen;
118
119  size_t charlen = getutf8charlen ();
120  while (len > 0) {
121    if (charlen == 0) {
122      // start parsing a new character
123      utf8buflen = 0;
124
125      // fast common case
126      while (len > 3) {
127    realcharlen = parse_utf8_char (here, end, &c);
128    output.push_back (c);
129    here += realcharlen;
130    len -= realcharlen;
131      }
132
133      utf8buf[utf8buflen++] = *here;
134      ++here;
135      --len;
136      charlen = getutf8charlen ();
137
138    } else if (utf8buflen < charlen) {
139      // assumes charlen is always less than MAXUTF8CHARLEN
140      utf8buf[utf8buflen++] = *here;
141      ++here;
142      --len;
143    }
144
145    if (utf8buflen == charlen) {
146      // got a complete character
147      realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
148      output.push_back (c);
149     
150      // move any unparsed characters. If an error occurred some of
151      // the characters might be unused.
152      int i;
153      int diff = utf8buflen - realcharlen;
154      for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff];
155      utf8buflen = diff;
156      charlen = getutf8charlen ();
157    }
158  }
159
160  start = (char *)here; // save current position
161
162  if (utf8buflen == 0) status = finished;
163  else status = stopped;
164}
165
166
167// returns the length that the current contents of the
168// utf8buf should be
169size_t utf8inconvertclass::getutf8charlen () {
170  if (utf8buflen == 0) return 0;
171
172  // one byte character
173  if (utf8buf[0] < 0x80) return 1;
174
175  // error, is not the start of a utf-8 character
176  if (utf8buf[0] < 0xc0) return 1;
177
178  // two bute character
179  if (utf8buf[0] < 0xe0) return 2;
180
181  // three byte character
182  if (utf8buf[0] < 0xf0) return 3;
183
184  // error, character too long for unicode
185  return 1;
186}
187
188
189void utf8outconvertclass::reset () {
190  input = NULL;
191  outs = NULL;
192  utf8buflen = 0;
193  utf8bufhere = 0;
194}
195
196// note that convert does not null-terminate the
197// output array of characters
198void utf8outconvertclass::convert (char *output, size_t maxlen,
199                   size_t &len, status_t &status) {
200  if (input == NULL || output == NULL) {
201    if (utf8buflen == 0) status = finished;
202    else status = unfinished;
203    return;
204  }
205
206  // don't want any funny sign conversions happening
207  unsigned char *uoutput = (unsigned char *)output;
208  text_t::iterator textend = input->end();
209  len = 0;
210  while (len < maxlen) {
211    // empty the contents of the internal buffer
212    if (utf8buflen > 0) {
213      while (len < maxlen && utf8bufhere < utf8buflen) {
214    *uoutput = utf8buf[utf8bufhere];
215    uoutput++;
216    len++;
217    utf8bufhere++;
218      }
219
220      if (utf8bufhere == utf8buflen) {
221    utf8bufhere = 0;
222    utf8buflen = 0;
223      }
224    }
225
226    // fill up the buffer with the next character
227    if (utf8buflen == 0) {
228      if (texthere == textend) break; // finished!
229      if (!rzws || (*texthere != 0x200b))
230    utf8buflen = output_utf8_char (*texthere, utf8buf,
231                       &utf8buf[MAXUTF8CHARLEN-1]);
232      texthere++;
233      utf8bufhere = 0;
234    }
235  }
236 
237  if (texthere == textend && utf8buflen == 0) status = finished;
238  else status = unfinished;
239}
240
241
242
243
244
245
246mapdata_t::mapdata_t () {
247  int i;
248
249  // reset all the map ptrs to be NULL
250  for (i=0; i<256; i++) {
251    ptrs[i] = (unsigned short *)NULL;
252  }
253
254  // say nothing has been loaded
255  loaded = false;
256}
257
258
259mapconvert::mapconvert () {
260  absentc = 0;
261}
262
263// setmapfile will cause loadmapfile to be called when conversion is
264// needed
265bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
266  // check to see if the mapfile has been already loaded
267  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
268
269  unloadmapfile ();
270  mapfile = themapfile;
271  absentc = theabsentc;
272 
273  return true;
274}
275
276
277
278// loadmapfile should be called before any conversion is done
279bool mapconvert::loadmapfile (const text_t &themapfile,
280                  unsigned short theabsentc) {
281  FILE *mapfilein = (FILE *)NULL;
282
283  // check to see if the mapfile has been already loaded
284  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
285
286  unloadmapfile ();
287  mapfile = themapfile;
288  absentc = theabsentc;
289
290  // open the map file
291  char *cfilename = mapfile.getcstr();
292  if (cfilename == (char *)NULL) return false;
293  mapfilein = fopen(cfilename, "rb");
294  delete cfilename;
295
296  if (mapfilein == (FILE *)NULL) return false;
297
298  unsigned char c, n1, n2;
299  unsigned short *arrptr;
300  int i;
301  c = fgetc (mapfilein);
302  while (!feof (mapfilein)) {
303    if (mapdata.ptrs[c] == (unsigned short *)NULL) {
304      // allocate a new array
305      arrptr = new unsigned short[256];
306      mapdata.ptrs[c] = arrptr;
307    } else arrptr = mapdata.ptrs[c];
308
309    // clear the array
310    for (i=0; i<256; i++) arrptr[i] = 0;
311   
312    // read in this block
313    n1 = fgetc (mapfilein);
314    n2 = fgetc (mapfilein);
315    i=0;
316    while (!feof (mapfilein)) {
317      arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
318
319      i++;
320      if (i >= 256) break;
321      n1 = fgetc (mapfilein);
322      n2 = fgetc (mapfilein);
323    }
324
325    c = fgetc (mapfilein);
326  }
327
328  mapdata.loaded = true;
329
330  return true;
331}
332
333void mapconvert::unloadmapfile () {
334  if (!mapdata.loaded) return;
335
336  int i;
337  for (i=0; i<256; i++) {
338    if (mapdata.ptrs[i] != (unsigned short *)NULL) {
339      delete [] mapdata.ptrs[i];
340      mapdata.ptrs[i] = (unsigned short *)NULL;
341    }
342  }
343
344  mapdata.loaded = false;
345}
346
347
348unsigned short mapconvert::convert (unsigned short c) {
349  if (!mapdata.loaded) {
350    if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
351      // do nothing, successfully loaded database
352    } else return absentc;
353  }
354
355  if (c == 0) return 0; // 0 always maps to 0...
356
357  unsigned short n1 = c >> 8;
358  unsigned short n2 = c & 0xff;
359
360  unsigned short *arrptr = mapdata.ptrs[n1];
361  if (arrptr == (unsigned short *)NULL) return absentc;
362
363  if (arrptr[n2] == 0) return absentc;
364  return arrptr[n2];
365}
366
367text_t mapconvert::convert (const text_t &instr) {
368  if (!mapdata.loaded) return absentc;
369
370  text_t outstr;
371  text_t::const_iterator here = instr.begin();
372  text_t::const_iterator end = instr.end();
373
374  while (here != end) {
375    outstr.push_back(this->convert(*here));
376    here++;
377  }
378 
379  return outstr;
380}
381
382
383
384
385mapinconvertclass::mapinconvertclass () {
386  multibyte = 0;
387  mapbuflen = 0;
388}
389
390void mapinconvertclass::reset () {
391  start = NULL;
392  len = 0;
393  mapbuflen=0;
394}
395
396void mapinconvertclass::convert (text_t &output, status_t &status) {
397  output.clear();
398
399  if (start == NULL || len == 0) {
400    if (mapbuflen == 0) status = finished;
401    else status = stopped;
402    return;
403  }
404
405  // don't want any funny sign conversions happening
406  unsigned char *here = (unsigned char *)start;
407
408  size_t charlen = getmapcharlen ();
409  while (len > 0) {
410    if (charlen == 0) {
411      // start parsing a new character
412      mapbuflen = 0;
413      mapbuf[mapbuflen++] = *here;
414      ++here;
415      --len;
416      charlen = getmapcharlen ();
417
418    } else if (mapbuflen < charlen) {
419      // assumes charlen is always less than MAXMAPCHARLEN
420      mapbuf[mapbuflen++] = *here;
421      ++here;
422      --len;
423    }
424
425    if (mapbuflen == charlen) {
426      // got a complete character
427      if (charlen == 1) {
428    if (mapbuf[0] < 0x80) {
429      // ascii character
430      output.push_back (mapbuf[0]);
431    } else {
432      output.push_back (converter.convert((unsigned short)mapbuf[0]));
433    }
434
435      } else {
436    // two byte character
437    output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
438                        (unsigned short)mapbuf[1]));
439      }
440
441      mapbuflen = 0;
442      charlen = 0;
443    }
444  }
445
446  start = (char *)here; // save current position
447
448  if (mapbuflen == 0) status = finished;
449  else status = stopped;
450}
451
452
453
454mapoutconvertclass::mapoutconvertclass () {
455  multibyte = 0;
456  mapbuflen=0;
457  mapbufhere=0;
458}
459
460void mapoutconvertclass::reset () {
461  input = NULL;
462  outs = NULL;
463  mapbuflen = 0;
464  mapbufhere = 0;
465}
466
467// note that convert does not null-terminate the
468// output array of characters
469void mapoutconvertclass::convert (char *output, size_t maxlen,
470                 size_t &len, status_t &status) {
471  unsigned short outc;
472
473  if (input == NULL || output == NULL) {
474    if (mapbuflen == 0) status = finished;
475    else status = unfinished;
476    return;
477  }
478
479  // don't want any funny sign conversions happening
480  unsigned char *uoutput = (unsigned char *)output;
481  text_t::iterator textend = input->end();
482  len = 0;
483  while (len < maxlen) {
484    // empty the contents of the internal buffer
485    if (mapbuflen > 0) {
486      while (len < maxlen && mapbufhere < mapbuflen) {
487    *uoutput = mapbuf[mapbufhere];
488    uoutput++;
489    len++;
490    mapbufhere++;
491      }
492
493      if (mapbufhere == mapbuflen) {
494    mapbufhere = 0;
495    mapbuflen = 0;
496      }
497    }
498
499    // fill up the buffer with the next character
500    if (mapbuflen == 0) {
501      if (texthere == textend) break; // finished!
502      if (!rzws || (*texthere != 0x200b)) {
503    if (*texthere < 0x80) {
504      mapbuf[0] = (unsigned char)*texthere;
505      mapbuflen = 1;
506    } else {
507      outc = converter.convert (*texthere);
508      if (multibyte) {
509        mapbuf[0] = (unsigned char)(outc >> 8);
510        mapbuf[1] = (unsigned char)(outc & 0xff);
511        mapbuflen = 2;
512      } else {
513        mapbuf[0] = outc;
514        mapbuflen = 1;
515      }
516    }
517      }
518
519      texthere++;
520      mapbufhere = 0;
521    }
522  }
523 
524  if (texthere == textend && mapbuflen == 0) status = finished;
525  else status = unfinished;
526}
527
528
529bool simplemapconvert::loadmapfile (bool in) {
530  if (loaded) return true;
531  if (mapfile.empty()) return false;
532
533  char *cfilename = mapfile.getcstr();
534#ifdef GSDL_USE_IOS_H
535  ifstream mapfilein (cfilename, ios::in | ios::nocreate);
536#else
537  ifstream mapfilein (cfilename, ios::in);
538#endif
539  delete cfilename;
540  if (!mapfilein) return false;
541
542  char cline[2048];
543  text_t line;
544
545  while (!mapfilein.eof()) {
546    mapfilein.getline (cline, 2048);
547    line.clear();
548    line.appendcstr (cline);
549    if (line.empty()) continue;
550    // remove comments
551    text_t::iterator end = line.end();
552    text_t::iterator here = findchar (line.begin(), end, '#');
553    if (here != end) {
554      line.erase (here, end);
555      if (line.empty()) continue;
556    }
557   
558    text_tarray parts;
559    splitchar (line.begin(), line.end(), '\t', parts);
560   
561    // do some simple sanity checks
562    if (parts.size() < 2) continue;
563    text_t::iterator begin1 = parts[0].begin();
564    text_t::iterator begin2 = parts[1].begin();
565    if (*begin1 != '0' || *(begin1+1) != 'x') continue;
566    if (*begin2 != '0' || *(begin2+1) != 'x') continue;
567    char *from = parts[0].getcstr();
568    char *to = parts[1].getcstr();
569    unsigned int f = 0, t = 0;
570    sscanf (from, "%i", &f);
571    sscanf (to, "%i", &t);
572    delete from;
573    delete to;
574   
575    if (in) mapping[(unsigned short)f] = (unsigned short)t;
576    else mapping[(unsigned short)t] = (unsigned short)f;
577  }
578
579  loaded = true;
580  return true;
581}
582
583unsigned short simplemapconvert::convert (unsigned short c, bool in) {
584
585  if (!loaded)
586    if (!loadmapfile(in)) return absentc;
587 
588  return mapping[c];
589}
590
591
592void simplemapinconvertclass::convert (text_t &output, status_t &status) {
593  output.clear();
594 
595  if (start == NULL || len == 0) {
596    status = finished;
597    return;
598  }
599
600  // don't want any funny sign conversions happening
601  unsigned char *here = (unsigned char *)start;
602  while (len > 0) {
603
604    if (*here < 0x80)
605      output.push_back (*here); // append this character
606    else
607      output.push_back (converter.convert(*here, true));
608
609    ++here;
610    --len;
611  }
612
613  start = (char *)here; // save current position
614  status = finished;
615}
616
617
618void simplemapoutconvertclass::convert (char *output, size_t maxlen,
619                    size_t &len, status_t &status) {
620
621  if (input == NULL || output == NULL) {
622    status = finished;
623    return;
624  }
625
626  // don't want any funny sign conversions happening
627  unsigned char *uoutput = (unsigned char *)output;
628  text_t::iterator textend = input->end();
629  len = 0;
630  while ((len < maxlen) && (texthere != textend)) {
631
632    if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
633    else *uoutput = converter.convert (*texthere, false);
634
635    ++uoutput;
636    ++len;
637    ++texthere;
638  }
639 
640  if (texthere == textend) status = finished;
641  else status = unfinished;
642}
Note: See TracBrowser for help on using the browser.