/********************************************************************** * * text_t.h -- a simple 16-bit character string class * Copyright (C) 1999 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #ifndef TEXT_T_H #define TEXT_T_H #include "gsdlconf.h" #if defined(GSDL_USE_OBJECTSPACE) # include # include # include # include #elif defined(GSDL_USE_STL_H) # include # include # include # include #else # include # include # include # include #endif // use the standard namespace #if !defined (GSDL_NAMESPACE_BROKEN) #if defined(GSDL_USE_OBJECTSPACE) using namespace ospace::std; #else using namespace std; #endif #endif // class prototypes class inconvertclass; class outconvertclass; // for those stupid compilers which need it #if defined(GSDL_NEED_DESTROY_USHORT) inline void destroy(unsigned short *) {}; inline void destroy(int *) {}; #endif typedef vector usvector; // The class text_t can handle long strings which may contain // null characters. It uses unsigned shorts to represent up to // 64K character values. class text_t { public: //type support for ucvector typedef usvector::iterator iterator; typedef usvector::const_iterator const_iterator; typedef usvector::reference reference; typedef usvector::const_reference const_reference; typedef usvector::size_type size_type; typedef usvector::difference_type difference_type; typedef usvector::const_reverse_iterator const_reverse_iterator; typedef usvector::reverse_iterator reverse_iterator; protected: usvector text; unsigned short encoding; // 0 = unicode, 1 = other public: // constructors text_t (); text_t (int i); text_t (char *s); // assumed to be a normal c string void setencoding (unsigned short theencoding) {encoding=theencoding;}; unsigned short getencoding () {return encoding;}; // basic container support iterator begin () {return text.begin();} const_iterator begin () const {return text.begin();} iterator end () {return text.end();} const_iterator end () const {return text.end();} void erase(iterator pos) {text.erase(pos);} void erase(iterator first, iterator last) {text.erase(first, last);} void push_back(unsigned short c) {text.push_back(c);} void pop_back() {text.pop_back();} text_t &operator=(const text_t &x) {text=x.text; encoding=x.encoding; return *this;} reference operator[](size_type n) {return text[n];}; const_reference operator[](size_type n) const {return text[n];}; void reserve (size_type n) {text.reserve(n);} bool empty () const {return text.empty();} size_type size() const {return text.size();} friend inline bool operator!=(const text_t& x, const text_t& y) {return (x.text != y.text);} friend inline bool operator==(const text_t& x, const text_t& y) {return (x.text == y.text);} friend inline bool operator<(const text_t& x, const text_t& y) {return (x.text < y.text);} friend inline bool operator>(const text_t& x, const text_t& y) {return (x.text > y.text);} friend inline bool operator>=(const text_t& x, const text_t& y) {return (x.text >= y.text);} friend inline bool operator<=(const text_t& x, const text_t& y) {return (x.text <= y.text);} // added functionality void clear () {text.erase(text.begin(),text.end());} void append (const text_t &t); void appendrange (iterator first, iterator last); void appendrange (const_iterator first, const_iterator last); text_t &operator+= (const text_t &t) {append(t);return *this;} // support for integers void appendint (int i); void setint (int i) {clear();appendint(i);} text_t &operator=(int i) {setint (i);return *this;} text_t &operator+= (int i) {appendint(i);return *this;} int getint () const; // support for arrays of chars void appendcarr (char *s, size_type len); void setcarr (char *s, size_type len) {clear();appendcarr(s,len);} // support for null-terminated C strings void appendcstr (char *s); void setcstr (char *s) {clear();appendcstr(s);} text_t &operator= (char *s) {setcstr(s);return *this;} // c string text_t &operator+= (char *s) {appendcstr(s);return *this;} // c string // support for const null-terminated C string void appendcstr (const char *s) {appendcstr((char *)s);} void setcstr (const char *s) {clear();appendcstr((char *) s);} text_t &operator= (const char *s) {setcstr((char *) s);return *this;} // c string text_t &operator+= (const char *s) {appendcstr((char *) s);return *this;} // c string // strings returned from getcarr and getcstr become the callers // responsibility and should be deallocated with "delete" char *getcarr(size_type &len) const; char *getcstr() const; }; inline text_t operator+(const text_t &t1, const text_t &t2) { text_t tnew = t1; tnew.append(t2); return tnew; } inline text_t operator+(const text_t &t1, int i1) { text_t tnew = t1; tnew.appendint(i1); return tnew; } inline text_t operator+(const text_t &t1, char *s1) { text_t tnew = t1; tnew.appendcstr(s1); return tnew; } struct eqtext_t { bool operator()(const text_t &t1, const text_t &t2) const { return t1 == t2; } }; struct lttext_t { bool operator()(const text_t &t1, const text_t &t2) const { return t1 < t2; } }; // frequently used derived types typedef set text_tset; typedef list text_tlist; // more efficient for insertions/deletions typedef vector text_tarray; // more space efficient than text_tlist typedef map text_tmap; // general functions which work on text_ts // find a character within a range text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last, unsigned short c); text_t::iterator findchar (text_t::iterator first, text_t::iterator last, unsigned short c); // get a string up to the next delimiter (which is skipped) text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last, unsigned short c, text_t &outstr); text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last, unsigned short c, text_t &outstr); // split a string with a character void splitchar (text_t::const_iterator first, text_t::const_iterator last, unsigned short c, text_tset &outlist); void splitchar (text_t::const_iterator first, text_t::const_iterator last, unsigned short c, text_tlist &outlist); void splitchar (text_t::const_iterator first, text_t::const_iterator last, unsigned short c, text_tarray &outlist); // join a string using a character void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext); void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext); void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext); void joinchar (const text_tset &inlist, text_t c, text_t &outtext); void joinchar (const text_tlist &inlist, text_t c, text_t &outtext); void joinchar (const text_tarray &inlist, text_t c, text_t &outtext); // count the occurances of a character within a range int countchar (text_t::const_iterator first, text_t::const_iterator last, unsigned short c); // return a substring of string from first up to but not including last text_t substr (text_t::const_iterator first, text_t::const_iterator last); // convert to lowercase void lc (text_t::iterator first, text_t::iterator last); inline void lc (text_t &t) {lc (t.begin(), t.end());} // convert to uppercase void uc (text_t::iterator first, text_t::iterator last); inline void uc (text_t &t) {uc (t.begin(), t.end());} // checks to see if it is a number (i.e. contains only 0-9) bool is_number (const text_t &text); // checks to see if the text has any letters or digits bool has_unicode_letdig (const text_t &text); // conversion classes used for getting information in to and out of // the text_t class. class convertclass { public: enum status_t {finished, stopped, unfinished}; convertclass (); virtual void reset (); }; // convert from a char stream to the text_t class // the default version assumes the input is a ascii // character array class inconvertclass : public convertclass { public: inconvertclass (); void reset (); void setinput (char *thestart, size_t thelen); // output will be cleared before the conversion virtual void convert (text_t &output, status_t &status); // will treat the text_t as a 8-bit string and convert // it to a 16-bit string using the about convert method. text_t convert (const text_t &t); protected: char *start; size_t len; }; // to get something which will do the conversion // to ascii declare a (non global!) instance like // this // inconvertclass ascii2text_t; #if defined(GSDL_USE_IOS_H) #include #else #include #endif // Convert from a text_t class to a char stream // This default version assumes the output is a ascii // character array. If you set the output stream you // can use this class to output to a stream using the // << operator. The << operator can also be conveniently // used to set the output stream by doing something like // // cout << text_t2ascii << textstr << anothertextstr; // // this class assumes that the input text doesn't change // while the conversion takes place class outconvertclass : public convertclass { public: outconvertclass (); void reset (); void setinput (text_t *theinput); // note that convert does not null-terminate the // output array of characters virtual void convert (char *output, size_t maxlen, size_t &len, status_t &status); // will convert the 16-bit string to a 8-bit stream // and place the result in a text_t. This method uses // the above convert function. text_t convert (const text_t &t); void setostream (ostream *theouts); ostream *getostream (); protected: text_t *input; text_t::iterator texthere; // only valid if input is valid ostream *outs; }; // to get something which will do the conversion // to text_t declare a (non global!) instance like // this // outconvertclass text_t2ascii; // stream operators for the output class outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter); outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t); #endif