source: trunk/gsdl/lib/gsdlunicode.h@ 8666

Last change on this file since 8666 was 8666, checked in by jrm21, 19 years ago

renamed multibyte variable since that is the name of a macro on solaris.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#ifndef GSDLUNICODE_H
28#define GSDLUNICODE_H
29
30#include "text_t.h"
31
32
33// converts a unicode encode text_t string to a utf-8
34// encoded text_t string
35text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
36inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
37
38// converts a utf-8 encoded text_t string to a unicode
39// encoded text_t string
40text_t to_uni (const text_t &in);
41
42#define MAXUTF8CHARLEN 3
43
44// convert from a utf-8 char stream to the text_t class
45class utf8inconvertclass : public inconvertclass {
46public:
47 utf8inconvertclass();
48 void reset ();
49 void convert (text_t &output, status_t &status);
50
51protected:
52 // buffer to hold unconverted characters in a stream
53 unsigned char utf8buf[MAXUTF8CHARLEN];
54 size_t utf8buflen;
55
56 // returns the length that the current contents of the
57 // utf8buf should be
58 size_t getutf8charlen ();
59};
60
61
62// This class provides the option of removing zero width
63// spaces (U+200B) during the output. By default this
64// option is turned off. The functionality is actually
65// implemented by the sub-classes, this class just provides
66// the framework for these classes.
67//
68// Note: by convention reset() should not reset the rzws flag.
69class rzwsoutconvertclass : public outconvertclass {
70public:
71 rzwsoutconvertclass () {rzws = 0;};
72 void set_rzws (int new_rzws) {rzws = new_rzws;};
73
74protected:
75 int rzws;
76};
77
78// utf16 is almost the same as unicode, except for unicode values > 65535.
79class utf16outconvertclass : public rzwsoutconvertclass {
80public:
81 utf16outconvertclass () {};
82 void convert (char *out, size_t maxlen, size_t &len, status_t &status);
83};
84
85
86// Convert from a text_t class to a utf-8 char stream
87class utf8outconvertclass : public rzwsoutconvertclass {
88public:
89 utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
90 void reset ();
91 // note that convert does not null-terminate the
92 // output array of characters
93 void convert (char *output, size_t maxlen,
94 size_t &len, status_t &status);
95
96protected:
97 unsigned char utf8buf[MAXUTF8CHARLEN];
98 size_t utf8buflen;
99 size_t utf8bufhere;
100};
101
102
// mapdata_t is used by mapconvert to hold the map file data
class mapdata_t {
public:
  mapdata_t();

  // NOTE(review): presumably true once a map file has been
  // read into ptrs -- confirm against mapconvert
  bool loaded;

  // 256 table pointers.
  // NOTE(review): presumably one table per high byte of the
  // character being mapped -- confirm against mapconvert
  unsigned short *ptrs[256];
};
110
111// mapconvert is used in situations where conversion is best
112// done using a map file. The mapfile should reside in
113// gsdlhome/unicode.
114class mapconvert {
115public:
116 mapconvert ();
117 ~mapconvert () {unloadmapfile();};
118
119 // setmapfile will cause loadmapfile to be called when conversion is
120 // needed
121 bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
122
123 // loadmapfile should be called before any conversion is done
124 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
125 void unloadmapfile ();
126
127 unsigned short convert (unsigned short c);
128
129 // note that this version of convert has different semantics to
130 // the convertclass version.
131 text_t convert (const text_t &instr);
132
133protected:
134 text_t mapfile;
135 unsigned short absentc;
136 mapdata_t mapdata;
137};
138
139
140
141#define MAXMAPCHARLEN 2
142
143// convert from a gb char stream to the unicode text_t class
144class mapinconvertclass : public inconvertclass {
145public:
146 mapinconvertclass();
147 virtual ~mapinconvertclass() {};
148
149 // setmapfile will cause loadmapfile to be called when conversion is needed
150 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
151 return converter.setmapfile (themapfile, theabsentc);
152 };
153
154 // loadmapfile should be called before any conversion takes
155 // place
156 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
157 return converter.loadmapfile (themapfile, theabsentc);
158 };
159
160 void set_multibyte (int new_multibyte) {m_multibyte = new_multibyte;};
161
162 void reset ();
163 void convert (text_t &output, status_t &status);
164
165protected:
166 // buffer to hold unconverted characters in a stream
167 unsigned char mapbuf[MAXMAPCHARLEN];
168 size_t mapbuflen;
169 int m_multibyte;
170
171 // note: multiple instances of mapinconvert class are expensive
172 // as each will have its own copy of the map file data. This
173 // could be reduced by making map2unimap static, but then it
174 // wouldn't be thread safe.
175 mapconvert converter;
176
177 // returns the length that the current contents of the
178 // mapbuf should be
179 inline size_t getmapcharlen () {
180 if (mapbuflen == 0) return 0;
181 if (mapbuf[0] < 0x80) return 1;
182 if (!m_multibyte) return 1;
183 return 2;
184 }
185};
186
187
188// Convert from a text_t class to a map char stream
189class mapoutconvertclass : public rzwsoutconvertclass {
190public:
191 mapoutconvertclass ();
192 virtual ~mapoutconvertclass() {};
193
194 // setmapfile will cause loadmapfile to be called when conversion is needed
195 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
196 return converter.setmapfile (themapfile, theabsentc);
197 };
198
199 // loadmapfile should be called before any conversion takes
200 // place
201 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
202 return converter.loadmapfile (themapfile, theabsentc);
203 };
204
205 void set_multibyte (int new_multibyte) {m_multibyte = new_multibyte;};
206
207 void reset ();
208 void convert (char *output, size_t maxlen,
209 size_t &len, status_t &status);
210
211protected:
212 unsigned char mapbuf[MAXMAPCHARLEN];
213 size_t mapbuflen;
214 size_t mapbufhere;
215 int m_multibyte;
216
217 mapconvert converter;
218};
219
220
// Simple input and output converter classes for use with 8 bit encodings
// using simple textual map files. Map files should contain (at least) two
// tab-separated fields. The first field is the mapped value and the second
// field is the unicode value.

// Strict "less than" ordering on unsigned shorts; used as the key
// comparison for the mapping held by simplemapconvert.
struct ltus_t
{
  bool operator()(const unsigned short &t1, const unsigned short &t2) const
  {
    return t1 < t2;
  }
};
231
232
233class simplemapconvert {
234public:
235 simplemapconvert () {absentc=0; loaded=false;}
236 unsigned short convert (unsigned short c, bool in);
237 void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
238
239protected:
240 bool loadmapfile (bool in);
241
242 map <unsigned short, unsigned short, ltus_t> mapping;
243 bool loaded;
244 text_t mapfile;
245 unsigned short absentc;
246};
247
248
249class simplemapinconvertclass : public inconvertclass {
250public:
251 virtual ~simplemapinconvertclass () {}
252
253 void convert (text_t &output, status_t &status);
254
255 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
256
257protected:
258 simplemapconvert converter;
259};
260
261class simplemapoutconvertclass : public rzwsoutconvertclass {
262public:
263 virtual ~simplemapoutconvertclass () {}
264
265 void convert (char *output, size_t maxlen,
266 size_t &len, status_t &status);
267
268 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
269
270protected:
271 simplemapconvert converter;
272};
273
274
275
276
277#endif
Note: See TracBrowser for help on using the repository browser.