source: trunk/gsdl/lib/gsdlunicode.h@ 1927

Last change on this file since 1927 was 1927, checked in by sjboddie, 23 years ago

Fixed a bug in the C++ encoding support - 8 bit encodings like windows-1251
were being treated as 16 bit encodings in some places

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#ifndef GSDLUNICODE_H
28#define GSDLUNICODE_H
29
30#include "text_t.h"
31
32
33// converts a unicode encode text_t string to a utf-8
34// encoded text_t string
35text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
36inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
37
38// converts a utf-8 encoded text_t string to a unicode
39// encoded text_t string
40text_t to_uni (const text_t &in);
41
42
43
44#define MAXUTF8CHARLEN 3
45
46// convert from a utf-8 char stream to the text_t class
47class utf8inconvertclass : public inconvertclass {
48public:
49 utf8inconvertclass();
50 void reset ();
51 void convert (text_t &output, status_t &status);
52
53protected:
54 // buffer to hold unconverted characters in a stream
55 unsigned char utf8buf[MAXUTF8CHARLEN];
56 size_t utf8buflen;
57
58 // returns the length that the current contents of the
59 // utf8buf should be
60 size_t getutf8charlen ();
61};
62
63
64// This class provides the option of removing zero width
65// spaces (U+200B) during the output. By default this
66// option is turned off. The functionality is actually
67// implemented by the sub-classes, this class just provides
68// the framework for these classes.
69//
70// Note: by convention reset() should not reset the rzws flag.
71class rzwsoutconvertclass : public outconvertclass {
72public:
73 rzwsoutconvertclass () {rzws = 0;};
74 void set_rzws (int new_rzws) {rzws = new_rzws;};
75
76protected:
77 int rzws;
78};
79
80
81// Convert from a text_t class to a utf-8 char stream
82class utf8outconvertclass : public rzwsoutconvertclass {
83public:
84 utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
85 void reset ();
86 // note that convert does not null-terminate the
87 // output array of characters
88 void convert (char *output, size_t maxlen,
89 size_t &len, status_t &status);
90
91protected:
92 unsigned char utf8buf[MAXUTF8CHARLEN];
93 size_t utf8buflen;
94 size_t utf8bufhere;
95};
96
97
// Holder for map file data; mapconvert keeps one of these as its
// mapdata member.
class mapdata_t {
public:
  bool loaded;
  // 256 table pointers (presumably one per high byte of a 16 bit
  // value -- confirm against the implementation of mapconvert).
  unsigned short *ptrs[256];

  mapdata_t();
};
105
106// mapconvert is used in situations where conversion is best
107// done using a map file. The mapfile should reside in
108// gsdlhome/unicode.
109class mapconvert {
110public:
111 mapconvert ();
112 ~mapconvert () {unloadmapfile();};
113
114 // setmapfile will cause loadmapfile to be called when conversion is
115 // needed
116 bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
117
118 // loadmapfile should be called before any conversion is done
119 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
120 void unloadmapfile ();
121
122 unsigned short convert (unsigned short c);
123
124 // note that this version of convert has different semantics to
125 // the convertclass version.
126 text_t convert (const text_t &instr);
127
128protected:
129 text_t mapfile;
130 unsigned short absentc;
131 mapdata_t mapdata;
132};
133
134
135
136#define MAXMAPCHARLEN 2
137
138// convert from a gb char stream to the unicode text_t class
139class mapinconvertclass : public inconvertclass {
140public:
141 mapinconvertclass();
142 virtual ~mapinconvertclass() {};
143
144 // setmapfile will cause loadmapfile to be called when conversion is needed
145 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
146 return converter.setmapfile (themapfile, theabsentc);
147 };
148
149 // loadmapfile should be called before any conversion takes
150 // place
151 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
152 return converter.loadmapfile (themapfile, theabsentc);
153 };
154
155 void set_multibyte (int new_multibyte) {multibyte = new_multibyte;};
156
157 void reset ();
158 void convert (text_t &output, status_t &status);
159
160protected:
161 // buffer to hold unconverted characters in a stream
162 unsigned char mapbuf[MAXMAPCHARLEN];
163 size_t mapbuflen;
164 int multibyte;
165
166 // note: multiple instances of mapinconvert class are expensive
167 // as each will have its own copy of the map file data. This
168 // could be reduced by making map2unimap static, but then it
169 // wouldn't be thread safe.
170 mapconvert converter;
171
172 // returns the length that the current contents of the
173 // mapbuf should be
174 inline size_t getmapcharlen () {
175 if (mapbuflen == 0) return 0;
176 if (mapbuf[0] < 0x80) return 1;
177 if (!multibyte) return 1;
178 return 2;
179 }
180};
181
182
183// Convert from a text_t class to a map char stream
184class mapoutconvertclass : public rzwsoutconvertclass {
185public:
186 mapoutconvertclass ();
187 virtual ~mapoutconvertclass() {};
188
189 // setmapfile will cause loadmapfile to be called when conversion is needed
190 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
191 return converter.setmapfile (themapfile, theabsentc);
192 };
193
194 // loadmapfile should be called before any conversion takes
195 // place
196 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
197 return converter.loadmapfile (themapfile, theabsentc);
198 };
199
200 void set_multibyte (int new_multibyte) {multibyte = new_multibyte;};
201
202 void reset ();
203 void convert (char *output, size_t maxlen,
204 size_t &len, status_t &status);
205
206protected:
207 unsigned char mapbuf[MAXMAPCHARLEN];
208 size_t mapbuflen;
209 size_t mapbufhere;
210 int multibyte;
211
212 mapconvert converter;
213};
214
215
216// Simple input and output converter classes for use with 8 bit encodings
217// using simple textual map files. Map files should contain (at least) two
218// tab-separated fields. The first field is the mapped value and the second
219// field is the unicode value.
220
// Less-than comparison functor for unsigned short values; it is the
// ordering used by the map held in simplemapconvert.
struct ltus_t
{
  bool operator()(const unsigned short &lhs, const unsigned short &rhs) const
  {
    return rhs > lhs;
  }
};
226
227
228class simplemapconvert {
229public:
230 simplemapconvert () {absentc=0; loaded=false;}
231 unsigned short convert (unsigned short c, bool in);
232 void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
233
234protected:
235 bool loadmapfile (bool in);
236
237 map <unsigned short, unsigned short, ltus_t> mapping;
238 bool loaded;
239 text_t mapfile;
240 unsigned short absentc;
241};
242
243
244class simplemapinconvertclass : public inconvertclass {
245public:
246 virtual ~simplemapinconvertclass () {}
247
248 void convert (text_t &output, status_t &status);
249
250 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
251
252protected:
253 simplemapconvert converter;
254};
255
256class simplemapoutconvertclass : public rzwsoutconvertclass {
257public:
258 virtual ~simplemapoutconvertclass () {}
259
260 void convert (char *output, size_t maxlen,
261 size_t &len, status_t &status);
262
263 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
264
265protected:
266 simplemapconvert converter;
267};
268
269
270
271
272#endif
Note: See TracBrowser for help on using the repository browser.