root/trunk/gsdl/lib/gsdlunicode.h @ 1927

Revision 1927, 7.4 KB (checked in by sjboddie, 19 years ago)

Fixed a bug in the C++ encoding support - 8 bit encodings like windows-1251
were being treated as 16 bit encodings in some places

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#ifndef GSDLUNICODE_H
28#define GSDLUNICODE_H
29
30#include "text_t.h"
31
32
33// converts a unicode encode text_t string to a utf-8
34// encoded text_t string
35text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
36inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
37
38// converts a utf-8 encoded text_t string to a unicode
39// encoded text_t string
40text_t to_uni (const text_t &in);
41
42
43
44#define MAXUTF8CHARLEN 3
45
46// convert from a utf-8 char stream to the text_t class
47class utf8inconvertclass : public inconvertclass {
48public:
49  utf8inconvertclass();
50  void reset ();
51  void convert (text_t &output, status_t &status);
52
53protected:
54  // buffer to hold unconverted characters in a stream
55  unsigned char utf8buf[MAXUTF8CHARLEN];
56  size_t utf8buflen;
57
58  // returns the length that the current contents of the
59  // utf8buf should be
60  size_t getutf8charlen ();
61};
62
63
64// This class provides the option of removing zero width
65// spaces (U+200B) during the output. By default this
66// option is turned off. The functionality is actually
67// implemented by the sub-classes, this class just provides
68// the framework for these classes.
69//
70// Note: by convention reset() should not reset the rzws flag.
71class rzwsoutconvertclass : public outconvertclass {
72public:
73  rzwsoutconvertclass () {rzws = 0;};
74  void set_rzws (int new_rzws) {rzws = new_rzws;};
75
76protected:
77  int rzws;
78};
79
80
81// Convert from a text_t class to a utf-8 char stream
82class utf8outconvertclass : public rzwsoutconvertclass {
83public:
84  utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
85  void reset ();
86  // note that convert does not null-terminate the
87  // output array of characters
88  void convert (char *output, size_t maxlen,
89        size_t &len, status_t &status);
90
91protected:
92  unsigned char utf8buf[MAXUTF8CHARLEN];
93  size_t utf8buflen;
94  size_t utf8bufhere;
95};
96
97
// Storage for the contents of a map file; mapconvert keeps one of
// these as its mapdata member.
class mapdata_t {
public:
  mapdata_t();

  // NOTE(review): presumably true once a map file has been read into
  // ptrs -- confirm against the implementation file.
  bool loaded;

  // table of 256 pointers to unsigned short data
  unsigned short *ptrs[256];
};
105
106// mapconvert is used in situations where conversion is best
107// done using a map file. The mapfile should reside in
108// gsdlhome/unicode.
109class mapconvert {
110public:
111  mapconvert ();
112  ~mapconvert () {unloadmapfile();};
113
114  // setmapfile will cause loadmapfile to be called when conversion is
115  // needed
116  bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
117
118  // loadmapfile should be called before any conversion is done
119  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
120  void unloadmapfile ();
121
122  unsigned short convert (unsigned short c);
123
124  // note that this version of convert has different semantics to
125  // the convertclass version.
126  text_t convert (const text_t &instr);
127
128protected:
129  text_t mapfile;
130  unsigned short absentc;
131  mapdata_t mapdata;
132};
133
134
135
136#define MAXMAPCHARLEN 2
137
138// convert from a gb char stream to the unicode text_t class
139class mapinconvertclass : public inconvertclass {
140public:
141  mapinconvertclass();
142  virtual ~mapinconvertclass() {};
143
144  // setmapfile will cause loadmapfile to be called when conversion is needed
145  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
146    return converter.setmapfile (themapfile, theabsentc);
147  };
148
149  // loadmapfile should be called before any conversion takes
150  // place
151  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
152    return converter.loadmapfile (themapfile, theabsentc);
153  };
154
155  void set_multibyte (int new_multibyte) {multibyte = new_multibyte;};
156
157  void reset ();
158  void convert (text_t &output, status_t &status);
159
160protected:
161  // buffer to hold unconverted characters in a stream
162  unsigned char mapbuf[MAXMAPCHARLEN];
163  size_t mapbuflen;
164  int multibyte;
165
166  // note: multiple instances of mapinconvert class are expensive
167  // as each will have its own copy of the map file data. This
168  // could be reduced by making map2unimap static, but then it
169  // wouldn't be thread safe.
170  mapconvert converter;
171
172  // returns the length that the current contents of the
173  // mapbuf should be
174  inline size_t getmapcharlen () {
175    if (mapbuflen == 0) return 0;
176    if (mapbuf[0] < 0x80) return 1;
177    if (!multibyte) return 1;
178    return 2;
179  }
180};
181
182
183// Convert from a text_t class to a map char stream
184class mapoutconvertclass : public rzwsoutconvertclass {
185public:
186  mapoutconvertclass ();
187  virtual ~mapoutconvertclass() {};
188
189  // setmapfile will cause loadmapfile to be called when conversion is needed
190  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
191    return converter.setmapfile (themapfile, theabsentc);
192  };
193
194  // loadmapfile should be called before any conversion takes
195  // place
196  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
197    return converter.loadmapfile (themapfile, theabsentc);
198  };
199
200  void set_multibyte (int new_multibyte) {multibyte = new_multibyte;};
201
202  void reset ();
203  void convert (char *output, size_t maxlen,
204        size_t &len, status_t &status);
205
206protected:
207  unsigned char mapbuf[MAXMAPCHARLEN];
208  size_t mapbuflen;
209  size_t mapbufhere;
210  int multibyte;
211
212  mapconvert converter;
213};
214
215
216// Simple input and output converter classes for use with 8 bit encodings
217// using simple textual map files. Map files should contain (at least) two
218// tab-separated fields. The first field is the mapped value and the second
219// field is the unicode value.
220
// Strict "less than" ordering on unsigned shorts; used as the
// comparison type of the map held by simplemapconvert.
struct ltus_t
{
  bool operator()(const unsigned short &t1, const unsigned short &t2) const
  {
    return t2 > t1;
  }
};
226
227
228class simplemapconvert {
229public:
230  simplemapconvert () {absentc=0; loaded=false;}
231  unsigned short convert (unsigned short c, bool in);
232  void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
233
234protected:
235  bool loadmapfile (bool in);
236
237  map <unsigned short, unsigned short, ltus_t> mapping;
238  bool loaded;
239  text_t mapfile;
240  unsigned short absentc;
241};
242
243
244class simplemapinconvertclass : public inconvertclass {
245public:
246  virtual ~simplemapinconvertclass () {}
247
248  void convert (text_t &output, status_t &status);
249
250  void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
251 
252protected:
253  simplemapconvert converter;
254};
255
256class simplemapoutconvertclass : public rzwsoutconvertclass {
257public:
258  virtual ~simplemapoutconvertclass () {}
259
260  void convert (char *output, size_t maxlen,
261        size_t &len, status_t &status);
262
263  void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
264 
265protected:
266  simplemapconvert converter;
267};
268
269
270
271
272#endif
Note: See TracBrowser for help on using the browser.