root/trunk/gsdl/lib/gsdlunicode.h @ 1870

Revision 1870, 7.2 KB (checked in by sjboddie, 19 years ago)

Tidied up language support stuff.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#ifndef GSDLUNICODE_H
28#define GSDLUNICODE_H
29
30#include "text_t.h"
31
32
33// converts a unicode encode text_t string to a utf-8
34// encoded text_t string
35text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
36inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
37
38// converts a utf-8 encoded text_t string to a unicode
39// encoded text_t string
40text_t to_uni (const text_t &in);
41
42
43
44#define MAXUTF8CHARLEN 3
45
46// convert from a utf-8 char stream to the text_t class
47class utf8inconvertclass : public inconvertclass {
48public:
49  utf8inconvertclass();
50  void reset ();
51  void convert (text_t &output, status_t &status);
52
53protected:
54  // buffer to hold unconverted characters in a stream
55  unsigned char utf8buf[MAXUTF8CHARLEN];
56  size_t utf8buflen;
57
58  // returns the length that the current contents of the
59  // utf8buf should be
60  size_t getutf8charlen ();
61};
62
63
64// This class provides the option of removing zero width
65// spaces (U+200B) during the output. By default this
66// option is turned off. The functionality is actually
67// implemented by the sub-classes, this class just provides
68// the framework for these classes.
69//
70// Note: by convention reset() should not reset the rzws flag.
71class rzwsoutconvertclass : public outconvertclass {
72public:
73  rzwsoutconvertclass () {rzws = 0;};
74  void set_rzws (int new_rzws) {rzws = new_rzws;};
75
76protected:
77  int rzws;
78};
79
80
81// Convert from a text_t class to a utf-8 char stream
82class utf8outconvertclass : public rzwsoutconvertclass {
83public:
84  utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
85  void reset ();
86  // note that convert does not null-terminate the
87  // output array of characters
88  void convert (char *output, size_t maxlen,
89        size_t &len, status_t &status);
90
91protected:
92  unsigned char utf8buf[MAXUTF8CHARLEN];
93  size_t utf8buflen;
94  size_t utf8bufhere;
95};
96
97
// mapdata_t holds the map file data on behalf of mapconvert.
class mapdata_t {
public:
  mapdata_t ();

  // Presumably true once map file data has been loaded -- confirm
  // against the implementation.
  bool loaded;

  // Table of 256 pointers to unsigned short data.
  unsigned short *ptrs[256];
};
105
106// mapconvert is used in situations where conversion is best
107// done using a map file. The mapfile should reside in
108// gsdlhome/unicode.
109class mapconvert {
110public:
111  mapconvert ();
112  ~mapconvert () {unloadmapfile();};
113
114  // setmapfile will cause loadmapfile to be called when conversion is
115  // needed
116  bool setmapfile (const text_t &themapfile, unsigned short theabsentc);
117
118  // loadmapfile should be called before any conversion is done
119  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc);
120  void unloadmapfile ();
121
122  unsigned short convert (unsigned short c);
123
124  // note that this version of convert has different semantics to
125  // the convertclass version.
126  text_t convert (const text_t &instr);
127
128protected:
129  text_t mapfile;
130  unsigned short absentc;
131  mapdata_t mapdata;
132};
133
134
135
136#define MAXMAPCHARLEN 2
137
138// convert from a gb char stream to the unicode text_t class
139class mapinconvertclass : public inconvertclass {
140public:
141  mapinconvertclass();
142  virtual ~mapinconvertclass() {};
143
144  // setmapfile will cause loadmapfile to be called when conversion is needed
145  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
146    return converter.setmapfile (themapfile, theabsentc);
147  };
148
149  // loadmapfile should be called before any conversion takes
150  // place
151  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
152    return converter.loadmapfile (themapfile, theabsentc);
153  };
154
155  void reset ();
156  void convert (text_t &output, status_t &status);
157
158protected:
159  // buffer to hold unconverted characters in a stream
160  unsigned char mapbuf[MAXMAPCHARLEN];
161  size_t mapbuflen;
162
163  // note: multiple instances of mapinconvert class are expensive
164  // as each will have its own copy of the map file data. This
165  // could be reduced by making map2unimap static, but then it
166  // wouldn't be thread safe.
167  mapconvert converter;
168
169  // returns the length that the current contents of the
170  // mapbuf should be
171  inline size_t getmapcharlen () {
172    if (mapbuflen == 0) return 0;
173    if (mapbuf[0] < 0x80) return 1;
174    return 2;
175  }
176};
177
178
179// Convert from a text_t class to a map char stream
180class mapoutconvertclass : public rzwsoutconvertclass {
181public:
182  mapoutconvertclass ();
183  virtual ~mapoutconvertclass() {};
184
185  // setmapfile will cause loadmapfile to be called when conversion is needed
186  bool setmapfile (const text_t &themapfile, unsigned short theabsentc) {
187    return converter.setmapfile (themapfile, theabsentc);
188  };
189
190  // loadmapfile should be called before any conversion takes
191  // place
192  bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) {
193    return converter.loadmapfile (themapfile, theabsentc);
194  };
195
196  void reset ();
197  void convert (char *output, size_t maxlen,
198        size_t &len, status_t &status);
199
200protected:
201  unsigned char mapbuf[MAXMAPCHARLEN];
202  size_t mapbuflen;
203  size_t mapbufhere;
204
205  mapconvert converter;
206};
207
208
209// Simple input and output converter classes for use with 8 bit encodings
210// using simple textual map files. Map files should contain (at least) two
211// tab-separated fields. The first field is the mapped value and the second
212// field is the unicode value.
213
// Comparison functor giving a strict "less than" ordering on
// unsigned shorts. It is the ordering used by the map held in
// simplemapconvert.
struct ltus_t
{
  bool operator()(const unsigned short &lhs, const unsigned short &rhs) const
  {
    return lhs < rhs;
  }
};
219
220
221class simplemapconvert {
222public:
223  simplemapconvert () {absentc=0; loaded=false;}
224  unsigned short convert (unsigned short c, bool in);
225  void setmapfile (const text_t &themapfile) {mapfile = themapfile;}
226
227protected:
228  bool loadmapfile (bool in);
229
230  map <unsigned short, unsigned short, ltus_t> mapping;
231  bool loaded;
232  text_t mapfile;
233  unsigned short absentc;
234};
235
236
237class simplemapinconvertclass : public inconvertclass {
238public:
239  virtual ~simplemapinconvertclass () {}
240
241  void convert (text_t &output, status_t &status);
242
243  void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
244 
245protected:
246  simplemapconvert converter;
247};
248
249class simplemapoutconvertclass : public rzwsoutconvertclass {
250public:
251  virtual ~simplemapoutconvertclass () {}
252
253  void convert (char *output, size_t maxlen,
254        size_t &len, status_t &status);
255
256  void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);}
257 
258protected:
259  simplemapconvert converter;
260};
261
262
263
264
265#endif
Note: See TracBrowser for help on using the browser.