source: trunk/gsdl/lib/gsdlunicode.h@ 1310

Last change on this file since 1310 was 1310, checked in by sjboddie, 24 years ago

Removed CVS logging information from source files

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#ifndef GSDLUNICODE_H
28#define GSDLUNICODE_H
29
30#include "text_t.h"
31
32
33// converts a unicode encode text_t string to a utf-8
34// encoded text_t string
35text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
36inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
37
38// converts a utf-8 encoded text_t string to a unicode
39// encoded text_t string
40text_t to_uni (const text_t &in);
41
42
43
44#define MAXUTF8CHARLEN 3
45
46// convert from a utf-8 char stream to the text_t class
47class utf8inconvertclass : public inconvertclass {
48public:
49 utf8inconvertclass();
50 void reset ();
51 void convert (text_t &output, status_t &status);
52
53protected:
54 // buffer to hold unconverted characters in a stream
55 unsigned char utf8buf[MAXUTF8CHARLEN];
56 size_t utf8buflen;
57
58 // returns the length that the current contents of the
59 // utf8buf should be
60 size_t getutf8charlen ();
61};
62
63
64// This class provides the option of removing zero width
65// spaces (U+200B) during the output. By default this
66// option is turned off. The functionality is actually
67// implemented by the sub-classes, this class just provides
68// the framework for these classes.
69//
70// Note: by convention reset() should not reset the rzws flag.
71class rzwsoutconvertclass : public outconvertclass {
72public:
73 rzwsoutconvertclass () {rzws = 0;};
74 void set_rzws (int new_rzws) {rzws = new_rzws;};
75
76protected:
77 int rzws;
78};
79
80
81// Convert from a text_t class to a utf-8 char stream
82class utf8outconvertclass : public rzwsoutconvertclass {
83public:
84 utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
85 void reset ();
86 // note that convert does not null-terminate the
87 // output array of characters
88 void convert (char *output, size_t maxlen,
89 size_t &len, status_t &status);
90
91protected:
92 unsigned char utf8buf[MAXUTF8CHARLEN];
93 size_t utf8buflen;
94 size_t utf8bufhere;
95};
96
97
// mapdata_t holds the map file data on behalf of mapconvert
class mapdata_t {
public:
  mapdata_t();

  // set once the map file data is available (managed elsewhere)
  bool loaded;

  // table of 256 pointers to unsigned short data
  // NOTE(review): presumably one sub-table per high byte of a 16 bit
  // character -- confirm against the map file loader
  unsigned short *ptrs[256];
};
105
106// mapconvert is used in situations where conversion is best
107// done using a map file. The mapfile should reside in
108// gsdlhome/unicode.
109class mapconvert {
110public:
111 mapconvert ();
112 ~mapconvert () {unloadmapfile();};
113
114 // setmapfile will cause loadmapfile to be called when conversion is
115 // needed
116 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
117 unsigned short theabsentc);
118
119 // loadmapfile should be called before any conversion is done
120 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
121 unsigned short theabsentc);
122 void unloadmapfile ();
123
124 unsigned short convert (unsigned short c);
125
126 // note that this version of convert has different semantics to
127 // the convertclass version.
128 text_t convert (const text_t &instr);
129
130protected:
131 text_t gsdlhome;
132 text_t encoding;
133 unsigned short absentc;
134 mapdata_t mapdata;
135};
136
137
138
139#define MAXMAPCHARLEN 2
140
141// convert from a gb char stream to the unicode text_t class
142class mapinconvertclass : public inconvertclass {
143public:
144 mapinconvertclass();
145 virtual ~mapinconvertclass() {};
146
147 // setmapfile will cause loadmapfile to be called when conversion is needed
148 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
149 unsigned short theabsentc) {
150 return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
151 };
152
153 // loadmapfile should be called before any conversion takes
154 // place
155 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
156 unsigned short theabsentc) {
157 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
158 };
159
160 void reset ();
161 void convert (text_t &output, status_t &status);
162
163protected:
164 // buffer to hold unconverted characters in a stream
165 unsigned char mapbuf[MAXMAPCHARLEN];
166 size_t mapbuflen;
167
168 // note: multiple instances of mapinconvert class are expensive
169 // as each will have its own copy of the map file data. This
170 // could be reduced by making map2unimap static, but then it
171 // wouldn't be thread safe.
172 mapconvert converter;
173
174 // returns the length that the current contents of the
175 // mapbuf should be
176 inline size_t getmapcharlen () {
177 if (mapbuflen == 0) return 0;
178 if (mapbuf[0] < 0x80) return 1;
179 return 2;
180 }
181};
182
183
184// Convert from a text_t class to a map char stream
185class mapoutconvertclass : public rzwsoutconvertclass {
186public:
187 mapoutconvertclass ();
188 virtual ~mapoutconvertclass() {};
189
190 // setmapfile will cause loadmapfile to be called when conversion is needed
191 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
192 unsigned short theabsentc) {
193 return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
194 };
195
196 // loadmapfile should be called before any conversion takes
197 // place
198 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
199 unsigned short theabsentc) {
200 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
201 };
202
203 void reset ();
204 void convert (char *output, size_t maxlen,
205 size_t &len, status_t &status);
206
207protected:
208 unsigned char mapbuf[MAXMAPCHARLEN];
209 size_t mapbuflen;
210 size_t mapbufhere;
211
212 mapconvert converter;
213};
214
215
216// Simple input and output converter classes for use with 8 bit encodings
217// using simple textual map files. Map files should contain (at least) two
218// tab-separated fields. The first field is the mapped value and the second
219// field is the unicode value.
220
// Strict "less than" ordering on unsigned short values, used as the
// comparison object for maps keyed on unsigned short.
struct ltus_t
{
  bool operator()(const unsigned short &t1, const unsigned short &t2) const
  {
    return t2 > t1;
  }
};
226
227
228class simplemapconvert {
229public:
230 simplemapconvert () {absentc=0; loaded=false;}
231 unsigned short convert (unsigned short c, bool in);
232 void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
233
234protected:
235 bool loadmapfile (bool in);
236
237 map <unsigned short, unsigned short, ltus_t> mapping;
238 bool loaded;
239 text_t mapfile;
240 unsigned short absentc;
241};
242
243
244class simplemapinconvertclass : public inconvertclass {
245public:
246 virtual ~simplemapinconvertclass () {}
247
248 void convert (text_t &output, status_t &status);
249
250 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
251
252protected:
253 simplemapconvert converter;
254};
255
256class simplemapoutconvertclass : public rzwsoutconvertclass {
257public:
258 virtual ~simplemapoutconvertclass () {}
259
260 void convert (char *output, size_t maxlen,
261 size_t &len, status_t &status);
262
263 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
264
265protected:
266 simplemapconvert converter;
267};
268
269
270
271
272#endif
Note: See TracBrowser for help on using the repository browser.