source: branches/New_Config_Format-branch/gsdl/lib/gsdlunicode.h@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: gsdlunicode.h 1279 2000-07-12 22:21:53Z sjboddie $
25 *
26 *********************************************************************/
27
28
29#ifndef GSDLUNICODE_H
30#define GSDLUNICODE_H
31
32#include "text_t.h"
33
34
35// converts a unicode encode text_t string to a utf-8
36// encoded text_t string
37text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
38inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
39
40// converts a utf-8 encoded text_t string to a unicode
41// encoded text_t string
42text_t to_uni (const text_t &in);
43
44
45
46#define MAXUTF8CHARLEN 3
47
48// convert from a utf-8 char stream to the text_t class
49class utf8inconvertclass : public inconvertclass {
50public:
51 utf8inconvertclass();
52 void reset ();
53 void convert (text_t &output, status_t &status);
54
55protected:
56 // buffer to hold unconverted characters in a stream
57 unsigned char utf8buf[MAXUTF8CHARLEN];
58 size_t utf8buflen;
59
60 // returns the length that the current contents of the
61 // utf8buf should be
62 size_t getutf8charlen ();
63};
64
65
66// This class provides the option of removing zero width
67// spaces (U+200B) during the output. By default this
68// option is turned off. The functionality is actually
69// implemented by the sub-classes, this class just provides
70// the framework for these classes.
71//
72// Note: by convention reset() should not reset the rzws flag.
73class rzwsoutconvertclass : public outconvertclass {
74public:
75 rzwsoutconvertclass () {rzws = 0;};
76 void set_rzws (int new_rzws) {rzws = new_rzws;};
77
78protected:
79 int rzws;
80};
81
82
83// Convert from a text_t class to a utf-8 char stream
84class utf8outconvertclass : public rzwsoutconvertclass {
85public:
86 utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
87 void reset ();
88 // note that convert does not null-terminate the
89 // output array of characters
90 void convert (char *output, size_t maxlen,
91 size_t &len, status_t &status);
92
93protected:
94 unsigned char utf8buf[MAXUTF8CHARLEN];
95 size_t utf8buflen;
96 size_t utf8bufhere;
97};
98
99
// Container for the map file data used by mapconvert.
class mapdata_t {
public:
  mapdata_t();

  // Whether map file data is currently loaded.
  bool loaded;

  // Table of 256 pointers to unsigned short data.
  // NOTE(review): presumably indexed by the high byte of the
  // character being mapped -- confirm against the implementation.
  unsigned short *ptrs[256];
};
107
108// mapconvert is used in situations where conversion is best
109// done using a map file. The mapfile should reside in
110// gsdlhome/unicode.
111class mapconvert {
112public:
113 mapconvert ();
114 ~mapconvert () {unloadmapfile();};
115
116 // setmapfile will cause loadmapfile to be called when conversion is
117 // needed
118 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
119 unsigned short theabsentc);
120
121 // loadmapfile should be called before any conversion is done
122 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
123 unsigned short theabsentc);
124 void unloadmapfile ();
125
126 unsigned short convert (unsigned short c);
127
128 // note that this version of convert has different semantics to
129 // the convertclass version.
130 text_t convert (const text_t &instr);
131
132protected:
133 text_t gsdlhome;
134 text_t encoding;
135 unsigned short absentc;
136 mapdata_t mapdata;
137};
138
139
140
141#define MAXMAPCHARLEN 2
142
143// convert from a gb char stream to the unicode text_t class
144class mapinconvertclass : public inconvertclass {
145public:
146 mapinconvertclass();
147 virtual ~mapinconvertclass() {};
148
149 // setmapfile will cause loadmapfile to be called when conversion is needed
150 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
151 unsigned short theabsentc) {
152 return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
153 };
154
155 // loadmapfile should be called before any conversion takes
156 // place
157 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
158 unsigned short theabsentc) {
159 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
160 };
161
162 void reset ();
163 void convert (text_t &output, status_t &status);
164
165protected:
166 // buffer to hold unconverted characters in a stream
167 unsigned char mapbuf[MAXMAPCHARLEN];
168 size_t mapbuflen;
169
170 // note: multiple instances of mapinconvert class are expensive
171 // as each will have its own copy of the map file data. This
172 // could be reduced by making map2unimap static, but then it
173 // wouldn't be thread safe.
174 mapconvert converter;
175
176 // returns the length that the current contents of the
177 // mapbuf should be
178 inline size_t getmapcharlen () {
179 if (mapbuflen == 0) return 0;
180 if (mapbuf[0] < 0x80) return 1;
181 return 2;
182 }
183};
184
185
186// Convert from a text_t class to a map char stream
187class mapoutconvertclass : public rzwsoutconvertclass {
188public:
189 mapoutconvertclass ();
190 virtual ~mapoutconvertclass() {};
191
192 // setmapfile will cause loadmapfile to be called when conversion is needed
193 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
194 unsigned short theabsentc) {
195 return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
196 };
197
198 // loadmapfile should be called before any conversion takes
199 // place
200 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
201 unsigned short theabsentc) {
202 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
203 };
204
205 void reset ();
206 void convert (char *output, size_t maxlen,
207 size_t &len, status_t &status);
208
209protected:
210 unsigned char mapbuf[MAXMAPCHARLEN];
211 size_t mapbuflen;
212 size_t mapbufhere;
213
214 mapconvert converter;
215};
216
217
218// Simple input and output converter classes for use with 8 bit encodings
219// using simple textual map files. Map files should contain (at least) two
220// tab-separated fields. The first field is the mapped value and the second
221// field is the unicode value.
222
// Strict "less than" ordering functor for unsigned short values.
struct ltus_t
{
  // True exactly when t1 orders before t2.
  bool operator()(const unsigned short &t1, const unsigned short &t2) const
  {
    return t2 > t1;
  }
};
228
229
230class simplemapconvert {
231public:
232 simplemapconvert () {absentc=0; loaded=false;}
233 unsigned short convert (unsigned short c, bool in);
234 void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
235
236protected:
237 bool loadmapfile (bool in);
238
239 map <unsigned short, unsigned short, ltus_t> mapping;
240 bool loaded;
241 text_t mapfile;
242 unsigned short absentc;
243};
244
245
246class simplemapinconvertclass : public inconvertclass {
247public:
248 virtual ~simplemapinconvertclass () {}
249
250 void convert (text_t &output, status_t &status);
251
252 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
253
254protected:
255 simplemapconvert converter;
256};
257
258class simplemapoutconvertclass : public rzwsoutconvertclass {
259public:
260 virtual ~simplemapoutconvertclass () {}
261
262 void convert (char *output, size_t maxlen,
263 size_t &len, status_t &status);
264
265 void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
266
267protected:
268 simplemapconvert converter;
269};
270
271
272
273
274#endif
Note: See TracBrowser for help on using the repository browser.