source: gsdl/trunk/common-src/src/lib/text_t.h@ 18700

Last change on this file since 18700 was 18700, checked in by kjdon, 15 years ago

findlastchar was dereferencing the last iterator (end()), so now it decrements that before dereferencing. parameter name changed to last_plus_one to hopefully inform user of this.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.4 KB
Line 
/**********************************************************************
 *
 * text_t.h -- a simple 16-bit character string class
 * Copyright (C) 1999 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: text_t.h 18700 2009-03-17 01:31:58Z kjdon $
 *
 *********************************************************************/
27
28
29#ifndef TEXT_T_H
30#define TEXT_T_H
31
#include "gsdlconf.h"

#if defined(GSDL_USE_OBJECTSPACE)
# include <ospace\std\vector>
# include <ospace\std\list>
# include <ospace\std\set>
# include <ospace\std\map>
#elif defined(GSDL_USE_STL_H)
# include <vector.h>
# include <list.h>
# include <set.h>
# include <map.h>
#else
# include <vector>
# include <list>
# include <set>
# include <map>
#endif

// ostream is named by the stream operators declared in this header,
// so it has to be visible before the first of them
#if defined(GSDL_USE_IOS_H)
# include <iostream.h> // darwin doesn't have ostream.h...
#else
# include <ostream>
#endif
50
// use the standard namespace
// NOTE(review): a using-directive in a header leaks into every file that
// includes it; it is kept because the declarations in this header name
// vector/list/set/map/ostream unqualified.
#if !defined (GSDL_NAMESPACE_BROKEN)
#if defined(GSDL_USE_OBJECTSPACE)
using namespace ospace::std;
#else
using namespace std;
#endif
#endif

// class prototypes
class text_t;
class inconvertclass;
class outconvertclass;


// for those stupid compilers which need it: do-nothing destroy()
// overloads for the built-in element types
#if defined(GSDL_NEED_DESTROY_USHORT)
inline void destroy(unsigned short *) {}
inline void destroy(int *) {}
#endif

// storage type used by text_t: a vector of unsigned short character values
typedef vector<unsigned short> usvector;

// declared here only; the object itself is defined outside this header
extern const text_t g_EmptyText;
74
75// The class text_t can handle long strings which may contain
76// null characters. It uses unsigned shorts to represent up to
77// 64K character values.
78class text_t {
79 public:
80 //type support for ucvector
81 typedef usvector::iterator iterator;
82 typedef usvector::const_iterator const_iterator;
83 typedef usvector::reference reference;
84 typedef usvector::const_reference const_reference;
85 typedef usvector::size_type size_type;
86 typedef usvector::difference_type difference_type;
87 typedef usvector::const_reverse_iterator const_reverse_iterator;
88 typedef usvector::reverse_iterator reverse_iterator;
89
90protected:
91 usvector text;
92 unsigned short encoding; // 0 = unicode, 1 = other
93
94public:
95 // constructors
96 text_t ();
97 text_t (int i);
98 text_t (const char *s); // assumed to be a normal c string
99 text_t (const char *s, size_type nLength); // support for arrays of chars
100 void setencoding (unsigned short theencoding) {encoding=theencoding;};
101 unsigned short getencoding () {return encoding;};
102
103 usvector& text_as_usvector() { return text ; };
104 const usvector& text_as_usvector() const { return text ; };
105
106 // basic container support
107 iterator begin () {return text.begin();}
108 const_iterator begin () const {return text.begin();}
109 iterator end () {return text.end();}
110 const_iterator end () const {return text.end();}
111
112 void erase(iterator pos) {text.erase(pos);}
113 void erase(iterator first, iterator last) {text.erase(first, last);}
114 void push_back(unsigned short c) {text.push_back(c);}
115 void pop_back() {text.pop_back();}
116 text_t &operator=(const text_t &x) {text=x.text; encoding=x.encoding; return *this;}
117 reference operator[](size_type n) {return text[n];};
118 const_reference operator[](size_type n) const {return text[n];};
119
120 void reserve (size_type n) {text.reserve(n);}
121 size_type capacity() const { return text.capacity(); }
122
123 bool empty () const {return text.empty();}
124 size_type size() const {return text.size();}
125 friend inline bool operator!=(const text_t& x, const text_t& y)
126 {return (x.text != y.text);}
127 friend inline bool operator==(const text_t& x, const text_t& y)
128 {return (x.text == y.text);}
129 friend inline bool operator<(const text_t& x, const text_t& y)
130 {return (x.text < y.text);}
131 friend inline bool operator>(const text_t& x, const text_t& y)
132 {return (x.text > y.text);}
133 friend inline bool operator>=(const text_t& x, const text_t& y)
134 {return (x.text >= y.text);}
135 friend inline bool operator<=(const text_t& x, const text_t& y)
136 {return (x.text <= y.text);}
137
138 // added functionality
139 void clear () {text.erase(text.begin(),text.end());}
140 void append (const text_t &t);
141 void appendrange (iterator first, iterator last);
142 void appendrange (const_iterator first, const_iterator last);
143 text_t &operator+= (const text_t &t) {append(t);return *this;}
144
145 // support for integers
146 void appendint (int i);
147 void setint (int i) {clear();appendint(i);}
148 text_t &operator=(int i) {setint (i);return *this;}
149 text_t &operator+= (int i) {appendint(i);return *this;}
150 int getint () const;
151
152 // same as getint but returns an unsigned long
153 unsigned long getulong () const;
154
155 // support for arrays of chars
156 void appendcarr (const char *s, size_type len);
157 void setcarr (char *s, size_type len) {clear();appendcarr(s,len);}
158
159 // support for const null-terminated C strings
160 void appendcstr (const char *s);
161 void setcstr (const char *s) {clear();appendcstr(s);}
162 text_t &operator= (const char *s) {setcstr(s);return *this;} // c string
163 text_t &operator+= (const char *s) {appendcstr(s);return *this;} // c string
164
165 // strings returned from getcarr and getcstr become the callers
166 // responsibility and should be deallocated with "delete []"
167 char *getcarr(size_type &len) const;
168 char *getcstr() const;
169
170 int replace(text_t toreplace, text_t replacement);
171};
172
173// new stream converter ...
174ostream& operator<< (ostream &o, const text_t &text);
175
176inline text_t operator+(const text_t &t1, const text_t &t2)
177{
178 text_t tnew = t1;
179 tnew.append(t2);
180 return tnew;
181}
182
183inline text_t operator+(const text_t &t1, int i1)
184{
185 text_t tnew = t1;
186 tnew.appendint(i1);
187 return tnew;
188}
189
190inline text_t operator+(const text_t &t1, char *s1)
191{
192 text_t tnew = t1;
193 tnew.appendcstr(s1);
194 return tnew;
195}
196
197
198
199
200struct eqtext_t
201{
202 bool operator()(const text_t &t1, const text_t &t2) const
203 { return t1 == t2; }
204};
205
206struct lttext_t
207{
208 bool operator()(const text_t &t1, const text_t &t2) const
209 { return t1 < t2; }
210};
211
212
213// frequently used derived types
214typedef set<text_t,lttext_t> text_tset;
215typedef list<text_t> text_tlist; // more efficient for insertions/deletions
216typedef vector<text_t> text_tarray; // more space efficient than text_tlist
217typedef map<text_t, text_t, lttext_t> text_tmap;
218
219
220// general functions which work on text_ts
221
222// find a character within a range
223text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
224 unsigned short c);
225text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
226 unsigned short c);
227// Find the last occurrence of c between first and last_plus_one -1. Returns last_plus_one if not found.
228text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one,
229 unsigned short c);
230text_t::iterator findword (text_t::iterator first, text_t::iterator last,
231 const text_t &word);
232text_t::const_iterator findword (text_t::const_iterator first, text_t::const_iterator last,
233 const text_t& word);
234
235// get a string up to the next delimiter (which is skipped)
236text_t::const_iterator getdelimitstr (text_t::const_iterator first,
237 text_t::const_iterator last,
238 unsigned short c, text_t &outstr);
239text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
240 unsigned short c, text_t &outstr);
241
242text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
243 text_t w, text_t &outstr);
244
245// split a string with a character
246void splitchar (text_t::const_iterator first, text_t::const_iterator last,
247 unsigned short c, text_tset &outlist);
248void splitchar (text_t::const_iterator first, text_t::const_iterator last,
249 unsigned short c, text_tlist &outlist);
250void splitchar (text_t::const_iterator first, text_t::const_iterator last,
251 unsigned short c, text_tarray &outlist);
252
253void splitword (text_t::const_iterator first, text_t::const_iterator last,
254 text_t w, text_tlist &outlist);
255
256// join a string using a character
257void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext);
258void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext);
259void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext);
260void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext);
261void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext);
262void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext);
263
264// count the occurances of a character within a range
265int countchar (text_t::const_iterator first, text_t::const_iterator last,
266 unsigned short c);
267
268// return a substring of string from first up to but not including last
269text_t substr (text_t::const_iterator first, text_t::const_iterator last);
270
271// convert to lowercase
272void lc (text_t::iterator first, text_t::iterator last);
273inline void lc (text_t &t) {lc (t.begin(), t.end());}
274
275// convert to uppercase
276void uc (text_t::iterator first, text_t::iterator last);
277inline void uc (text_t &t) {uc (t.begin(), t.end());}
278
279// checks to see if it is a number (i.e. contains only 0-9)
280bool is_number (const text_t &text);
281
282// checks to see if the text has any letters or digits
283bool has_unicode_letdig (const text_t &text);
284
285// checks to see if a text_t starts with the specified prefix
286bool starts_with(const text_t& text, const text_t& prefix);
287// checks to see if a text_t ends with the specified suffix
288bool ends_with(const text_t& text, const text_t& suffix);
289
// conversion classes used for getting information in to and out of
// the text_t class.

// Common base of the input and output converters. It supplies the
// status_t codes reported by the convert methods and a virtual reset().
class convertclass
{
public:
  enum status_t {finished, stopped, unfinished};

  convertclass ();
  // The class is a polymorphic base (reset is virtual), so the
  // destructor is virtual too: deleting a derived converter through a
  // convertclass pointer must run the derived destructor.
  virtual ~convertclass () {}
  virtual void reset ();
};
301
302
303
304// convert from a char stream to the text_t class
305// the default version assumes the input is a ascii
306// character array
307class inconvertclass : public convertclass
308{
309public:
310 inconvertclass ();
311 void reset ();
312 void setinput (char *thestart, size_t thelen);
313
314 // output will be cleared before the conversion
315 virtual void convert (text_t &output, status_t &status);
316
317 // will treat the text_t as a 8-bit string and convert
318 // it to a 16-bit string using the about convert method.
319 text_t convert (const text_t &t);
320
321protected:
322 char *start;
323 size_t len;
324};
325
326// to get something which will do the conversion
327// to ascii declare a (non global!) instance like
328// this
329// inconvertclass ascii2text_t;
330
331#if defined(GSDL_USE_IOS_H)
332#include <iostream.h> // darwin doesn't have ostream.h...
333#else
334#include <ostream>
335#endif
336
337// Convert from a text_t class to a char stream
338// This default version assumes the output is a ascii
339// character array. If you set the output stream you
340// can use this class to output to a stream using the
341// << operator. The << operator can also be conveniently
342// used to set the output stream by doing something like
343//
344// cout << text_t2ascii << textstr << anothertextstr;
345//
346// this class assumes that the input text doesn't change
347// while the conversion takes place
348class outconvertclass : public convertclass
349{
350public:
351 outconvertclass ();
352 virtual void reset ();
353 virtual void setinput (text_t *theinput);
354 virtual void setdata(text_t *input, text_t::iterator texthere);
355 // note that convert does not null-terminate the
356 // output array of characters
357 virtual void convert (char *output, size_t maxlen,
358 size_t &len, status_t &status);
359
360 // will convert the 16-bit string to a 8-bit stream
361 // and place the result in a text_t. This method uses
362 // the above convert function.
363 text_t convert (const text_t &t);
364
365 virtual void setostream (ostream *theouts);
366 ostream *getostream ();
367
368protected:
369 text_t *input;
370 text_t::iterator texthere; // only valid if input is valid
371
372 ostream *outs;
373};
374
375// to get something which will do the conversion
376// to text_t declare a (non global!) instance like
377// this
378// outconvertclass text_t2ascii;
379
380
381// stream operators for the output class
382outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter);
383outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t);
384
385
386#endif
Note: See TracBrowser for help on using the repository browser.