source: main/trunk/greenstone2/common-src/src/lib/text_t.h@ 24110

Last change on this file since 24110 was 24110, checked in by sjm84, 13 years ago

Added a trim function to text_t that removes leading and trailing whitespace

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
Line 
1/**********************************************************************
2 *
3 * text_t.h -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.h 24110 2011-06-02 21:19:39Z sjm84 $
25 *
26 *********************************************************************/
27
28
29#ifndef TEXT_T_H
30#define TEXT_T_H
31
32#include "gsdlconf.h"
33
34#if defined(GSDL_USE_OBJECTSPACE)
35# include <ospace\std\vector>
36# include <ospace\std\list>
37# include <ospace\std\set>
38# include <ospace\std\map>
39#elif defined(GSDL_USE_STL_H)
40# include <vector.h>
41# include <list.h>
42# include <set.h>
43# include <map.h>
44#else
45# include <vector>
46# include <list>
47# include <set>
48# include <map>
49#endif
50#include <string>
51
52// use the standard namespace
53#if !defined (GSDL_NAMESPACE_BROKEN)
54#if defined(GSDL_USE_OBJECTSPACE)
55using namespace ospace::std;
56#else
57using namespace std;
58#endif
59#endif
60
// Forward declarations of the string class and the two converter
// classes; all three are defined further down in this header.
class text_t;
class inconvertclass;
class outconvertclass;
65
66
67// for those stupid compilers which need it
68#if defined(GSDL_NEED_DESTROY_USHORT)
// Empty destroy() overloads for unsigned short and int pointers.
// Both bodies are deliberately empty: nothing is done to the pointee.
inline void destroy(unsigned short *) {}
inline void destroy(int *) {}
71#endif
72
// Storage type used by text_t: a vector of unsigned short character values.
typedef vector<unsigned short> usvector;
// Global text_t constant, declared here and defined elsewhere.
// NOTE(review): presumably an empty string, going by the name -- confirm.
extern const text_t g_EmptyText;
75
76// The class text_t can handle long strings which may contain
77// null characters. It uses unsigned shorts to represent up to
78// 64K character values.
79class text_t {
80 public:
81 //type support for ucvector
82 typedef usvector::iterator iterator;
83 typedef usvector::const_iterator const_iterator;
84 typedef usvector::reference reference;
85 typedef usvector::const_reference const_reference;
86 typedef usvector::size_type size_type;
87 typedef usvector::difference_type difference_type;
88 typedef usvector::const_reverse_iterator const_reverse_iterator;
89 typedef usvector::reverse_iterator reverse_iterator;
90
91protected:
92 usvector text;
93 unsigned short encoding; // 0 = unicode, 1 = other
94
95public:
96 // constructors
97 text_t ();
98 text_t (int i);
99 text_t (const char *s); // assumed to be a normal c string
100 text_t (const char *s, size_type nLength); // support for arrays of chars
101 void setencoding (unsigned short theencoding) {encoding=theencoding;};
102 unsigned short getencoding () {return encoding;};
103
104 usvector& text_as_usvector() { return text ; };
105 const usvector& text_as_usvector() const { return text ; };
106
107 // basic container support
108 iterator begin () {return text.begin();}
109 const_iterator begin () const {return text.begin();}
110 iterator end () {return text.end();}
111 const_iterator end () const {return text.end();}
112
113 void erase(iterator pos) {text.erase(pos);}
114 void erase(iterator first, iterator last) {text.erase(first, last);}
115 void push_back(unsigned short c) {text.push_back(c);}
116 void pop_back() {text.pop_back();}
117 text_t &operator=(const text_t &x) {text=x.text; encoding=x.encoding; return *this;}
118 reference operator[](size_type n) {return text[n];};
119 const_reference operator[](size_type n) const {return text[n];};
120
121 void reserve (size_type n) {text.reserve(n);}
122 size_type capacity() const { return text.capacity(); }
123
124 bool empty () const {return text.empty();}
125 size_type size() const {return text.size();}
126 friend inline bool operator!=(const text_t& x, const text_t& y)
127 {return (x.text != y.text);}
128 friend inline bool operator==(const text_t& x, const text_t& y)
129 {return (x.text == y.text);}
130 friend inline bool operator<(const text_t& x, const text_t& y)
131 {return (x.text < y.text);}
132 friend inline bool operator>(const text_t& x, const text_t& y)
133 {return (x.text > y.text);}
134 friend inline bool operator>=(const text_t& x, const text_t& y)
135 {return (x.text >= y.text);}
136 friend inline bool operator<=(const text_t& x, const text_t& y)
137 {return (x.text <= y.text);}
138
139 // added functionality
140 void clear () {text.erase(text.begin(),text.end());}
141 void append (const text_t &t);
142 void appendrange (iterator first, iterator last);
143 void appendrange (const_iterator first, const_iterator last);
144 text_t &operator+= (const text_t &t) {append(t);return *this;}
145
146 // support for integers
147 void appendint (int i);
148 void setint (int i) {clear();appendint(i);}
149 text_t &operator=(int i) {setint (i);return *this;}
150 text_t &operator+= (int i) {appendint(i);return *this;}
151 int getint () const;
152
153 // same as getint but returns an unsigned long
154 unsigned long getulong () const;
155
156 // support for arrays of chars
157 void appendcarr (const char *s, size_type len);
158 void setcarr (char *s, size_type len) {clear();appendcarr(s,len);}
159
160 // support for const null-terminated C strings
161 void appendcstr (const char *s);
162 void setcstr (const char *s) {clear();appendcstr(s);}
163 text_t &operator= (const char *s) {setcstr(s);return *this;} // c string
164 text_t &operator+= (const char *s) {appendcstr(s);return *this;} // c string
165
166 // strings returned from getcarr and getcstr become the callers
167 // responsibility and should be deallocated with "delete []"
168 char *getcarr(size_type &len) const;
169 char *getcstr() const;
170
171 int replace(text_t toreplace, text_t replacement);
172};
173
// Stream insertion operator for text_t.
// NOTE(review): how the character values are encoded on the stream is
// decided by the definition, which is not part of this header.
ostream& operator<< (ostream &o, const text_t &text);
176
177inline text_t operator+(const text_t &t1, const text_t &t2)
178{
179 text_t tnew = t1;
180 tnew.append(t2);
181 return tnew;
182}
183
184inline text_t operator+(const text_t &t1, int i1)
185{
186 text_t tnew = t1;
187 tnew.appendint(i1);
188 return tnew;
189}
190
191inline text_t operator+(const text_t &t1, char *s1)
192{
193 text_t tnew = t1;
194 tnew.appendcstr(s1);
195 return tnew;
196}
197
198
199
200
201struct eqtext_t
202{
203 bool operator()(const text_t &t1, const text_t &t2) const
204 { return t1 == t2; }
205};
206
207struct lttext_t
208{
209 bool operator()(const text_t &t1, const text_t &t2) const
210 { return t1 < t2; }
211};
212
213
// Frequently used container types built on text_t. The set and the map
// are ordered with lttext_t.
typedef set<text_t,lttext_t> text_tset;
typedef list<text_t> text_tlist; // more efficient for insertions/deletions
typedef vector<text_t> text_tarray; // more space efficient than text_tlist
typedef map<text_t, text_t, lttext_t> text_tmap;
219
220
221// general functions which work on text_ts
222
// Find a character within a range (const and non-const iterator versions).
// NOTE(review): the value returned when c is absent is not stated in this
// header (presumably last) -- confirm against the definition.
text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
                                 unsigned short c);
text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
                           unsigned short c);
// Find the last occurrence of c between first and last_plus_one - 1.
// Returns last_plus_one if not found.
text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one,
                               unsigned short c);
// Find a word (given as a text_t) within a range; const and non-const
// iterator versions.
text_t::iterator findword (text_t::iterator first, text_t::iterator last,
                           const text_t &word);
text_t::const_iterator findword (text_t::const_iterator first, text_t::const_iterator last,
                                 const text_t& word);
235
// Get a string up to the next delimiter (which is skipped).
// The delimiter is a single character c in the first two overloads and
// a text_t w in the third.
// NOTE(review): presumably the extracted string is stored in outstr and
// the returned iterator is where a following call should resume --
// confirm against the definitions.
text_t::const_iterator getdelimitstr (text_t::const_iterator first,
                                      text_t::const_iterator last,
                                      unsigned short c, text_t &outstr);
text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
                                unsigned short c, text_t &outstr);

text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
                                      text_t w, text_t &outstr);
245
// Split a string with a character. Overloads exist for each kind of
// output container (set, list or array of text_t).
// NOTE(review): whether outlist is cleared before the pieces are added
// is not visible in this header -- confirm.
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
                unsigned short c, text_tset &outlist);
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
                unsigned short c, text_tlist &outlist);
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
                unsigned short c, text_tarray &outlist);

// As splitchar, but the separator is a text_t w rather than a single
// character; only a list output is provided.
void splitword (text_t::const_iterator first, text_t::const_iterator last,
                text_t w, text_tlist &outlist);
256
// Join the strings of a container using a separator. The separator is a
// single character in the first three overloads and a text_t in the last
// three; the input may be a set, a list or an array of text_t.
// NOTE(review): presumably the joined result is placed in outtext --
// confirm against the definitions.
void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext);
void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext);
void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext);
void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext);
void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext);
void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext);
264
// Count the occurrences of a character within a range.
int countchar (text_t::const_iterator first, text_t::const_iterator last,
               unsigned short c);
268
// Return a substring of a string, running from first up to but not
// including last.
text_t substr (text_t::const_iterator first, text_t::const_iterator last);
271
272// convert to lowercase
273void lc (text_t::iterator first, text_t::iterator last);
274inline void lc (text_t &t) {lc (t.begin(), t.end());}
275
276// convert to uppercase
277void uc (text_t::iterator first, text_t::iterator last);
278inline void uc (text_t &t) {uc (t.begin(), t.end());}
279
// Checks to see if the text is a number (i.e. contains only 0-9).
// NOTE(review): the result for an empty text is not stated here -- confirm.
bool is_number (const text_t &text);

// Checks to see if the text has any letters or digits.
bool has_unicode_letdig (const text_t &text);

// Checks to see if a text_t starts with the specified prefix.
bool starts_with(const text_t& text, const text_t& prefix);
// Checks to see if a text_t ends with the specified suffix.
bool ends_with(const text_t& text, const text_t& suffix);

// Trims whitespace off the front and end of the string. The argument is
// taken by const reference and the trimmed text is returned as a new text_t.
text_t trim(const text_t& text);
293
// Conversion classes used for getting information into and out of
// the text_t class.

// Base class shared by inconvertclass and outconvertclass.
class convertclass
{
public:
  // status values passed back by the convert methods of the derived classes
  enum status_t {finished, stopped, unfinished};

  convertclass ();
  virtual ~convertclass ();
  // virtual: both derived converter classes declare their own reset
  virtual void reset ();
};
306
307
308
// Convert from a char stream to the text_t class.
// The default version assumes the input is an ascii
// character array.
class inconvertclass : public convertclass
{
public:
  inconvertclass ();
  ~inconvertclass ();

  virtual void reset ();
  // supplies the character array (and its length) to be converted
  void setinput (char *thestart, size_t thelen);

  // output will be cleared before the conversion
  virtual void convert (text_t &output, status_t &status);

  // will treat the text_t as an 8-bit string and convert
  // it to a 16-bit string using the above convert method.
  text_t convert (const text_t &t);

protected:
  // NOTE(review): presumably the values handed to setinput -- confirm
  char *start;
  size_t len;
};

// To get something which will do the conversion from ascii
// to text_t, declare a (non global!) instance like
// this
// inconvertclass ascii2text_t;
337
338#if defined(GSDL_USE_IOS_H)
339#include <iostream.h> // darwin doesn't have ostream.h...
340#else
341#include <ostream>
342#endif
343
// Convert from a text_t class to a char stream.
// This default version assumes the output is an ascii
// character array. If you set the output stream you
// can use this class to output to a stream using the
// << operator. The << operator can also be conveniently
// used to set the output stream by doing something like
//
// cout << text_t2ascii << textstr << anothertextstr;
//
// This class assumes that the input text doesn't change
// while the conversion takes place.
class outconvertclass : public convertclass
{
public:
  outconvertclass ();
  ~outconvertclass ();

  virtual void reset ();
  // the converter keeps a pointer to the text_t rather than a copy (see
  // the "input" member below), hence the no-change assumption above
  virtual void setinput (text_t *theinput);
  virtual void setdata(text_t *input, text_t::iterator texthere);
  // note that convert does not null-terminate the
  // output array of characters
  virtual void convert (char *output, size_t maxlen,
                        size_t &len, status_t &status);

  // will convert the 16-bit string to an 8-bit stream
  // and place the result in a text_t. This method uses
  // the above convert function.
  text_t convert (const text_t &t);

  // set / query the stream used when outputting with the << operator
  virtual void setostream (ostream *theouts);
  ostream *getostream ();

protected:
  text_t *input;
  text_t::iterator texthere; // only valid if input is valid

  ostream *outs;
};

// To get something which will do the conversion from text_t
// to ascii, declare a (non global!) instance like
// this
// outconvertclass text_t2ascii;
388
389
// Stream operators for the output class. They support the chained form
// described at outconvertclass:
//   cout << text_t2ascii << textstr << anothertextstr;
// The first takes the stream and the converter, the second takes the
// converter and a text_t; both return the converter so that further
// text_t values can be chained on.
outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter);
outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t);
393
394
395#endif
Note: See TracBrowser for help on using the repository browser.