source: main/trunk/greenstone2/common-src/src/lib/gsdlunicode.h@ 22141

Last change on this file since 22141 was 22141, checked in by davidb, 14 years ago

Was surprised to discover some classes that did not correctly specify virtual on their destructors, even though virtual was being used on other methods in the class, or else through inheritance. Now fixed up.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#ifndef GSDLUNICODE_H
28#define GSDLUNICODE_H
29
30#include "text_t.h"
31
32
33// converts a unicode encode text_t string to a utf-8
34// encoded text_t string
35text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
36inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
37
38// converts a utf-8 encoded text_t string to a unicode
39// encoded text_t string
40text_t to_uni (const text_t &in);
41
42#define MAXUTF8CHARLEN 3
43
44// convert from a utf-8 char stream to the text_t class
45class utf8inconvertclass : public inconvertclass {
46public:
47 utf8inconvertclass();
48 virtual ~utf8inconvertclass();
49
50 virtual void reset ();
51 virtual void convert (text_t &output, status_t &status);
52
53protected:
54 // buffer to hold unconverted characters in a stream
55 unsigned char utf8buf[MAXUTF8CHARLEN];
56 size_t utf8buflen;
57
58 // returns the length that the current contents of the
59 // utf8buf should be
60 size_t getutf8charlen ();
61};
62
63
64// This class provides the option of removing zero width
65// spaces (U+200B) during the output. By default this
66// option is turned off. The functionality is actually
67// implemented by the sub-classes, this class just provides
68// the framework for these classes.
69//
70// Note: by convention reset() should not reset the rzws flag.
71class rzwsoutconvertclass : public outconvertclass {
72public:
73 rzwsoutconvertclass () {rzws = 0;};
74 virtual ~rzwsoutconvertclass () {};
75
76 void set_rzws (int new_rzws) {rzws = new_rzws;};
77
78protected:
79 int rzws;
80};
81
82// utf16 is almost the same as unicode, except for unicode values > 65535.
83class utf16outconvertclass : public rzwsoutconvertclass {
84public:
85 utf16outconvertclass () {};
86 virtual ~utf16outconvertclass () {};
87
88 virtual void convert (char *out, size_t maxlen, size_t &len, status_t &status);
89};
90
91
92// Convert from a text_t class to a utf-8 char stream
93class utf8outconvertclass : public rzwsoutconvertclass {
94public:
95 utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
96 virtual ~utf8outconvertclass () {};
97
98 virtual void reset ();
99 // note that convert does not null-terminate the
100 // output array of characters
101 void convert (char *output, size_t maxlen,
102 size_t &len, status_t &status);
103
104protected:
105 unsigned char utf8buf[MAXUTF8CHARLEN];
106 size_t utf8buflen;
107 size_t utf8bufhere;
108};
109
110
// mapdata_t is used by mapconvert to hold the map file data.
class mapdata_t {
public:
  mapdata_t ();

  // Whether map file data is currently held (presumably set once the
  // map file has been read -- confirm in the implementation file).
  bool loaded;

  // Table of 256 pointers to unsigned short data.
  unsigned short *ptrs[256];
};
118
119// mapconvert is used in situations where conversion is best
120// done using a map file. The mapfile should reside in
121// gsdlhome/unicode.
122class mapconvert {
123public:
124 mapconvert ();
125 ~mapconvert () {unloadmapfile();};
126
127 // setmapfile will cause loadmapfile to be called when conversion is
128 // needed
129 bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
130
131 // loadmapfile should be called before any conversion is done
132 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
133 void unloadmapfile ();
134
135 unsigned short convert (unsigned short c);
136
137 // note that this version of convert has different semantics to
138 // the convertclass version.
139 text_t convert (const text_t &instr);
140
141protected:
142 text_t mapfile;
143 unsigned short absentc;
144 mapdata_t mapdata;
145};
146
147
148
149#define MAXMAPCHARLEN 2
150
151// convert from a gb char stream to the unicode text_t class
152class mapinconvertclass : public inconvertclass {
153public:
154 mapinconvertclass();
155 virtual ~mapinconvertclass() {};
156
157 // setmapfile will cause loadmapfile to be called when conversion is needed
158 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
159 return converter.setmapfile (themapfile, theabsentc);
160 };
161
162 // loadmapfile should be called before any conversion takes
163 // place
164 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
165 return converter.loadmapfile (themapfile, theabsentc);
166 };
167
168 void set_multibyte (int new_multibyte) {m_multibyte = new_multibyte;};
169
170 void reset ();
171 void convert (text_t &output, status_t &status);
172
173protected:
174 // buffer to hold unconverted characters in a stream
175 unsigned char mapbuf[MAXMAPCHARLEN];
176 size_t mapbuflen;
177 int m_multibyte;
178
179 // note: multiple instances of mapinconvert class are expensive
180 // as each will have its own copy of the map file data. This
181 // could be reduced by making map2unimap static, but then it
182 // wouldn't be thread safe.
183 mapconvert converter;
184
185 // returns the length that the current contents of the
186 // mapbuf should be
187 inline size_t getmapcharlen () {
188 if (mapbuflen == 0) return 0;
189 if (mapbuf[0] < 0x80) return 1;
190 if (!m_multibyte) return 1;
191 return 2;
192 }
193};
194
195
196// Convert from a text_t class to a map char stream
197class mapoutconvertclass : public rzwsoutconvertclass {
198public:
199 mapoutconvertclass ();
200 virtual ~mapoutconvertclass() {};
201
202 // setmapfile will cause loadmapfile to be called when conversion is needed
203 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
204 return converter.setmapfile (themapfile, theabsentc);
205 };
206
207 // loadmapfile should be called before any conversion takes
208 // place
209 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
210 return converter.loadmapfile (themapfile, theabsentc);
211 };
212
213 void set_multibyte (int new_multibyte) {m_multibyte = new_multibyte;};
214
215 void reset ();
216 void convert (char *output, size_t maxlen,
217 size_t &len, status_t &status);
218
219protected:
220 unsigned char mapbuf[MAXMAPCHARLEN];
221 size_t mapbuflen;
222 size_t mapbufhere;
223 int m_multibyte;
224
225 mapconvert converter;
226};
227
228
229// Simple input and output converter classes for use with 8 bit encodings
230// using simple textual map files. Map files should contain (at least) two
231// tab-separated fields. The first field is the mapped value and the second
232// field is the unicode value.
233
// Strict "less than" ordering on unsigned shorts, used as the
// comparison object of the map in simplemapconvert.
struct ltus_t
{
  bool operator()(const unsigned short &lhs, const unsigned short &rhs) const
  {
    const bool lhs_is_smaller = (lhs < rhs);
    return lhs_is_smaller;
  }
};
239
240
241class simplemapconvert {
242public:
243 simplemapconvert () {absentc=0; loaded=false;}
244 unsigned short convert (unsigned short c, bool in);
245 void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
246
247protected:
248 bool loadmapfile (bool in);
249
250 map <unsigned short, unsigned short, ltus_t> mapping;
251 bool loaded;
252 text_t mapfile;
253 unsigned short absentc;
254};
255
256
257class simplemapinconvertclass : public inconvertclass {
258public:
259 virtual ~simplemapinconvertclass () {}
260
261 void convert (text_t &output, status_t &status);
262
263 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
264
265protected:
266 simplemapconvert converter;
267};
268
269class simplemapoutconvertclass : public rzwsoutconvertclass {
270public:
271 virtual ~simplemapoutconvertclass () {}
272
273 void convert (char *output, size_t maxlen,
274 size_t &len, status_t &status);
275
276 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
277
278protected:
279 simplemapconvert converter;
280};
281
282
283
284
285#endif
Note: See TracBrowser for help on using the repository browser.