source: trunk/gsdl/lib/gsdlunicode.h@ 1076

Last change on this file since 1076 was 1076, checked in by cs025, 24 years ago

Correcting a correction - reinstated all lib files due to silly
CVS confusion.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.h --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: gsdlunicode.h 1076 2000-04-06 19:58:04Z cs025 $
25 *
26 *********************************************************************/
27
28
29#ifndef GSDLUNICODE_H
30#define GSDLUNICODE_H
31
32#include "text_t.h"
33
34
35// converts a unicode encode text_t string to a utf-8
36// encoded text_t string
37text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end);
38inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());}
39
40// converts a utf-8 encoded text_t string to a unicode
41// encoded text_t string
42text_t to_uni (const text_t &in);
43
44
45
46#define MAXUTF8CHARLEN 3
47
48// convert from a utf-8 char stream to the text_t class
49class utf8inconvertclass : public inconvertclass {
50public:
51 utf8inconvertclass();
52 void reset ();
53 void convert (text_t &output, status_t &status);
54
55protected:
56 // buffer to hold unconverted characters in a stream
57 unsigned char utf8buf[MAXUTF8CHARLEN];
58 size_t utf8buflen;
59
60 // returns the length that the current contents of the
61 // utf8buf should be
62 size_t getutf8charlen ();
63};
64
65
66// This class provides the option of removing zero width
67// spaces (U+200B) during the output. By default this
68// option is turned off. The functionality is actually
69// implemented by the sub-classes, this class just provides
70// the framework for these classes.
71//
72// Note: by convention reset() should not reset the rzws flag.
73class rzwsoutconvertclass : public outconvertclass {
74public:
75 rzwsoutconvertclass () {rzws = 0;};
76 void set_rzws (int new_rzws) {rzws = new_rzws;};
77
78protected:
79 int rzws;
80};
81
82
83// Convert from a text_t class to a utf-8 char stream
84class utf8outconvertclass : public rzwsoutconvertclass {
85public:
86 utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;};
87 void reset ();
88 // note that convert does not null-terminate the
89 // output array of characters
90 void convert (char *output, size_t maxlen,
91 size_t &len, status_t &status);
92
93protected:
94 unsigned char utf8buf[MAXUTF8CHARLEN];
95 size_t utf8buflen;
96 size_t utf8bufhere;
97};
98
99
// Storage for the contents of a map file; mapconvert keeps one of
// these as its map data.
class mapdata_t {
public:
  mapdata_t();

  // NOTE(review): presumably true once a map file has been read
  // into ptrs -- confirm against the implementation.
  bool loaded;

  // Table of 256 pointers to unsigned short data.
  // NOTE(review): presumably one entry per high byte of the
  // character being mapped -- confirm against the implementation.
  unsigned short *ptrs[256];
};
107
108// mapconvert is used in situations where conversion is best
109// done using a map file. The mapfile should reside in
110// gsdlhome/unicode.
111class mapconvert {
112public:
113 mapconvert ();
114 ~mapconvert () {unloadmapfile();};
115
116 // setmapfile will cause loadmapfile to be called when conversion is
117 // needed
118 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
119 unsigned short theabsentc);
120
121 // loadmapfile should be called before any conversion is done
122 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
123 unsigned short theabsentc);
124 void unloadmapfile ();
125
126 unsigned short convert (unsigned short c);
127
128 // note that this version of convert has different semantics to
129 // the convertclass version.
130 text_t convert (const text_t &instr);
131
132protected:
133 text_t gsdlhome;
134 text_t encoding;
135 unsigned short absentc;
136 mapdata_t mapdata;
137};
138
139
140
141#define MAXMAPCHARLEN 2
142
143// convert from a gb char stream to the unicode text_t class
144class mapinconvertclass : public inconvertclass {
145public:
146 mapinconvertclass();
147 virtual ~mapinconvertclass() {};
148
149 // setmapfile will cause loadmapfile to be called when conversion is needed
150 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
151 unsigned short theabsentc) {
152 return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
153 };
154
155 // loadmapfile should be called before any conversion takes
156 // place
157 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
158 unsigned short theabsentc) {
159 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
160 };
161
162 void reset ();
163 void convert (text_t &output, status_t &status);
164
165protected:
166 // buffer to hold unconverted characters in a stream
167 unsigned char mapbuf[MAXMAPCHARLEN];
168 size_t mapbuflen;
169
170 // note: multiple instances of mapinconvert class are expensive
171 // as each will have its own copy of the map file data. This
172 // could be reduced by making map2unimap static, but then it
173 // wouldn't be thread safe.
174 mapconvert converter;
175
176 // returns the length that the current contents of the
177 // mapbuf should be
178 inline size_t getmapcharlen () {
179 if (mapbuflen == 0) return 0;
180 if (mapbuf[0] < 0x80) return 1;
181 return 2;
182 }
183};
184
185
186// Convert from a text_t class to a map char stream
187class mapoutconvertclass : public rzwsoutconvertclass {
188public:
189 mapoutconvertclass ();
190 virtual ~mapoutconvertclass() {};
191
192 // setmapfile will cause loadmapfile to be called when conversion is needed
193 bool setmapfile (const text_t &thegsdlhome, const text_t &theencoding,
194 unsigned short theabsentc) {
195 return converter.setmapfile (thegsdlhome, theencoding, theabsentc);
196 };
197
198 // loadmapfile should be called before any conversion takes
199 // place
200 bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
201 unsigned short theabsentc) {
202 return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
203 };
204
205 void reset ();
206 void convert (char *output, size_t maxlen,
207 size_t &len, status_t &status);
208
209protected:
210 unsigned char mapbuf[MAXMAPCHARLEN];
211 size_t mapbuflen;
212 size_t mapbufhere;
213
214 mapconvert converter;
215};
216
217#endif
Note: See TracBrowser for help on using the repository browser.