source: trunk/gsdl/lib/gsdlunicode.h@ 1870

Last change on this file since 1870 was 1870, checked in by sjboddie, 23 years ago

Tidied up language support stuff.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
/**********************************************************************
 *
 * gsdlunicode.h --
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
25
26
27#ifndef GSDLUNICODE_H
28#define GSDLUNICODE_H
29
30#include "text_t.h"
31
32
33// converts a unicode encode text_t string to a utf-8
34// encoded text_t string
35text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
36inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
37
38// converts a utf-8 encoded text_t string to a unicode
39// encoded text_t string
40text_t to_uni (const text_t &in);
41
42
43
44#define MAXUTF8CHARLEN 3
45
46// convert from a utf-8 char stream to the text_t class
47class utf8inconvertclass : public inconvertclass {
48public:
49 utf8inconvertclass();
50 void reset ();
51 void convert (text_t &output, status_t &status);
52
53protected:
54 // buffer to hold unconverted characters in a stream
55 unsigned char utf8buf[MAXUTF8CHARLEN];
56 size_t utf8buflen;
57
58 // returns the length that the current contents of the
59 // utf8buf should be
60 size_t getutf8charlen ();
61};
62
63
64// This class provides the option of removing zero width
65// spaces (U+200B) during the output. By default this
66// option is turned off. The functionality is actually
67// implemented by the sub-classes, this class just provides
68// the framework for these classes.
69//
70// Note: by convention reset() should not reset the rzws flag.
71class rzwsoutconvertclass : public outconvertclass {
72public:
73 rzwsoutconvertclass () {rzws = 0;};
74 void set_rzws (int new_rzws) {rzws = new_rzws;};
75
76protected:
77 int rzws;
78};
79
80
81// Convert from a text_t class to a utf-8 char stream
82class utf8outconvertclass : public rzwsoutconvertclass {
83public:
84 utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
85 void reset ();
86 // note that convert does not null-terminate the
87 // output array of characters
88 void convert (char *output, size_t maxlen,
89 size_t &len, status_t &status);
90
91protected:
92 unsigned char utf8buf[MAXUTF8CHARLEN];
93 size_t utf8buflen;
94 size_t utf8bufhere;
95};
96
97
// mapdata_t is used by mapconvert to hold the map file data.
class mapdata_t {
public:
  mapdata_t();

  // presumably true once the map file data has been read in
  // (TODO confirm against the implementation)
  bool loaded;

  // 256 table pointers; how they are indexed and who allocates
  // them is not visible in this header
  unsigned short *ptrs[256];
};

106// mapconvert is used in situations where conversion is best
107// done using a map file. The mapfile should reside in
108// gsdlhome/unicode.
109class mapconvert {
110public:
111 mapconvert ();
112 ~mapconvert () {unloadmapfile();};
113
114 // setmapfile will cause loadmapfile to be called when conversion is
115 // needed
116 bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
117
118 // loadmapfile should be called before any conversion is done
119 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
120 void unloadmapfile ();
121
122 unsigned short convert (unsigned short c);
123
124 // note that this version of convert has different semantics to
125 // the convertclass version.
126 text_t convert (const text_t &instr);
127
128protected:
129 text_t mapfile;
130 unsigned short absentc;
131 mapdata_t mapdata;
132};
133
134
135
136#define MAXMAPCHARLEN 2
137
138// convert from a gb char stream to the unicode text_t class
139class mapinconvertclass : public inconvertclass {
140public:
141 mapinconvertclass();
142 virtual ~mapinconvertclass() {};
143
144 // setmapfile will cause loadmapfile to be called when conversion is needed
145 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
146 return converter.setmapfile (themapfile, theabsentc);
147 };
148
149 // loadmapfile should be called before any conversion takes
150 // place
151 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
152 return converter.loadmapfile (themapfile, theabsentc);
153 };
154
155 void reset ();
156 void convert (text_t &output, status_t &status);
157
158protected:
159 // buffer to hold unconverted characters in a stream
160 unsigned char mapbuf[MAXMAPCHARLEN];
161 size_t mapbuflen;
162
163 // note: multiple instances of mapinconvert class are expensive
164 // as each will have its own copy of the map file data. This
165 // could be reduced by making map2unimap static, but then it
166 // wouldn't be thread safe.
167 mapconvert converter;
168
169 // returns the length that the current contents of the
170 // mapbuf should be
171 inline size_t getmapcharlen () {
172 if (mapbuflen == 0) return 0;
173 if (mapbuf[0] < 0x80) return 1;
174 return 2;
175 }
176};
177
178
179// Convert from a text_t class to a map char stream
180class mapoutconvertclass : public rzwsoutconvertclass {
181public:
182 mapoutconvertclass ();
183 virtual ~mapoutconvertclass() {};
184
185 // setmapfile will cause loadmapfile to be called when conversion is needed
186 bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
187 return converter.setmapfile (themapfile, theabsentc);
188 };
189
190 // loadmapfile should be called before any conversion takes
191 // place
192 bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
193 return converter.loadmapfile (themapfile, theabsentc);
194 };
195
196 void reset ();
197 void convert (char *output, size_t maxlen,
198 size_t &len, status_t &status);
199
200protected:
201 unsigned char mapbuf[MAXMAPCHARLEN];
202 size_t mapbuflen;
203 size_t mapbufhere;
204
205 mapconvert converter;
206};
207
208
// Simple input and output converter classes for use with 8 bit encodings
// using simple textual map files. Map files should contain (at least) two
// tab-separated fields. The first field is the mapped value and the second
// field is the unicode value.

// Strict "less than" comparison of two unsigned short values, usable
// as the ordering predicate of an associative container.
struct ltus_t
{
  // returns true when t1 is strictly smaller than t2
  bool operator()(const unsigned short &t1, const unsigned short &t2) const
  { return t1 < t2; }
};


221class simplemapconvert {
222public:
223 simplemapconvert () {absentc=0; loaded=false;}
224 unsigned short convert (unsigned short c, bool in);
225 void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
226
227protected:
228 bool loadmapfile (bool in);
229
230 map <unsigned short, unsigned short, ltus_t> mapping;
231 bool loaded;
232 text_t mapfile;
233 unsigned short absentc;
234};
235
236
237class simplemapinconvertclass : public inconvertclass {
238public:
239 virtual ~simplemapinconvertclass () {}
240
241 void convert (text_t &output, status_t &status);
242
243 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
244
245protected:
246 simplemapconvert converter;
247};
248
249class simplemapoutconvertclass : public rzwsoutconvertclass {
250public:
251 virtual ~simplemapoutconvertclass () {}
252
253 void convert (char *output, size_t maxlen,
254 size_t &len, status_t &status);
255
256 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
257
258protected:
259 simplemapconvert converter;
260};
261
262
263
264
265#endif
Note: See TracBrowser for help on using the repository browser.