/**********************************************************************
 *
 * text_t.cpp -- a simple 16-bit character string class
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: text_t.cpp 18821 2009-03-26 23:16:45Z mdewsnip $
 *
 *********************************************************************/

#include "text_t.h"

// NOTE(review): in the copy of this file that was reviewed, the four
// directives below had lost their header names (each read just
// "# include").  The names have been restored on the assumption that each
// branch pulls in that STL flavour's <algorithm> header -- confirm the
// ObjectSpace and ".h" spellings against the build configuration.
#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace/std/algorithm>
#elif defined(GSDL_USE_STL_H)
#  if defined(GSDL_USE_ALGO_H)
#    include <algo.h>
#  else
#    include <algorithm.h>
#  endif
#else
#  include <algorithm>
#endif

#ifdef HAVE_CONFIG_H
# ifdef __WIN32__
#  include "win32cfg.h"
# else
#  include "config.h"
# endif
#endif

#include "unitool.h"


const text_t g_EmptyText("");


////////////////////////////////////
// file-local helpers
////////////////////////////////////

// Narrow one 16-bit character to 8 bits.  Values below 256 are passed
// through unchanged.  Anything else becomes a space if it is a unicode
// space and a question mark otherwise; question marks tell the user that
// they are missing some information.
static unsigned char narrow_char (unsigned short c) {
  if (c < 256) return (unsigned char)c;
  return is_unicode_space (c) ? ' ' : '?';
}

// Shared body of the two public findword overloads.  Returns the position
// of the first occurrence of word in [first, last), or last if there is
// none.  The comparison never reads at or beyond last.  An empty word
// matches at first when the range is non-empty.
template <class Iter>
static Iter findword_impl (Iter first, Iter last, const text_t &word) {
  const text_t::const_iterator word_begin = word.begin();
  const text_t::const_iterator word_end = word.end();

  while (first != last) {
    Iter char_match = first;
    text_t::const_iterator word_here = word_begin;
    // stop at a mismatch, at the end of the word, or at the end of the
    // searched range (a partial match running off the end is not a match)
    while (word_here != word_end && char_match != last &&
	   *char_match == *word_here) {
      ++char_match;
      ++word_here;
    }
    if (word_here == word_end) return first;
    ++first;
  }

  return last; // get to here only if there is no match
}

// Separator appenders used by join_impl: one for a single character,
// one for a whole text_t.
static void append_separator (text_t &t, unsigned short c) {
  t.push_back (c);
}

static void append_separator (text_t &t, const text_t &sep) {
  t += sep;
}

// Shared body of every joinchar overload: clears outtext, then appends
// the elements of inlist in iteration order with sep between neighbours.
template <class Container, class Separator>
static void join_impl (const Container &inlist, const Separator &sep,
		       text_t &outtext) {
  outtext.clear ();

  typename Container::const_iterator here = inlist.begin ();
  typename Container::const_iterator end = inlist.end ();
  if (here == end) return;

  outtext += *here;
  for (++here; here != end; ++here) {
    append_separator (outtext, sep);
    outtext += *here;
  }
}


////////////////////////////////////
// text_t methods
////////////////////////////////////

// new stream converter ...
// Writes text to o one byte per character, narrowing every 16-bit
// character with narrow_char.
ostream& operator<< (ostream &o, const text_t &text) {
  text_t::const_iterator ithere = text.begin();
  text_t::const_iterator itend = text.end();

  for (; ithere != itend; ++ithere) {
    o << narrow_char (*ithere);
  }

  return o;
}

// All constructors set the encoding to 0 and start from an empty string
// before appending their argument.
text_t::text_t () {
  setencoding(0);
  clear ();
}

text_t::text_t (int i) {
  setencoding(0);
  clear ();
  appendint (i);
}

text_t::text_t (const char *s) {
  setencoding(0);
  clear ();
  appendcstr (s);
}

text_t::text_t (const char *s, size_type nLength) {
  setencoding(0);
  clear ();
  appendcarr(s, nLength);
}

// append the whole of t
void text_t::append (const text_t &t) {
  text.insert(text.end(), t.begin(), t.end());
}

// append the characters in [first, last)
void text_t::appendrange (iterator first, iterator last) {
  text.insert(text.end(), first, last);
}

void text_t::appendrange (const_iterator first, const_iterator last) {
  text.insert(text.end(), first, last);
}

// append the decimal representation of i (with a leading '-' if negative)
void text_t::appendint (int i) {
  if (i == 0) {
    text.push_back('0');
    return;
  }

  // Work on the magnitude as an unsigned value: negating the most negative
  // int is signed overflow, whereas unsigned negation is well defined.
  unsigned int magnitude = (unsigned int)i;
  if (i < 0) {
    text.push_back('-');
    magnitude = 0u - magnitude;
  }

  // three characters per byte is always enough room for the decimal digits
  char buf[sizeof(int)*3];
  int len = 0;

  // get the number in reverse
  while (magnitude > 0) {
    buf[len++] = (char)('0' + (magnitude % 10));
    magnitude /= 10;
  }

  // reverse the number
  while (len > 0) {
    text.push_back(buf[--len]);
  }
}

// Parse an optionally signed run of decimal digits from the start of the
// string; parsing stops at the first non-digit.  Returns 0 if there are no
// digits.  Overflow is not detected.
int text_t::getint () const {
  int i = 0;
  int mult = 1; // becomes -1 for negative numbers

  const_iterator here = text.begin();
  const_iterator end = text.end();

  // do plus and minus signs
  if (here != end) {
    if (*here == '-') {
      mult = -1;
      ++here;
    } else if (*here == '+') {
      ++here;
    }
  }

  // deal with the number
  while ((here != end) && (*here >= '0') && (*here <= '9')) {
    i = 10*i + (*here - '0');
    ++here;
  }

  return i * mult;
}

// Parse a run of decimal digits from the start of the string (no sign is
// accepted).  Returns 0 if there are no digits.
unsigned long text_t::getulong () const {
  unsigned long i = 0;

  const_iterator here = text.begin();
  const_iterator end = text.end();

  while ((here != end) && (*here >= '0') && (*here <= '9')) {
    i = 10*i + (*here - '0');
    ++here;
  }

  return i;
}

// Append len bytes starting at s, each treated as an unsigned 8-bit
// character.  A NULL s appends nothing.
void text_t::appendcarr (const char *s, size_type len) {
  if (s == NULL) return;

  // go through unsigned char so bytes >= 0x80 are not sign-extended
  const unsigned char *us = reinterpret_cast<const unsigned char *>(s);

  if (text.capacity() < (text.size() + len + 2)) {
    text.reserve(text.size() + len + 2);
  }

  for (; len > 0; --len, ++us) {
    text.push_back (*us); // append this character
  }
}

// Append the NUL-terminated string s, each byte treated as an unsigned
// 8-bit character.  A NULL s appends nothing.
void text_t::appendcstr (const char *s) {
  if (s == NULL) return;

  size_t len = strlen(s);
  if (text.capacity() < (text.size() + len + 2)) {
    text.reserve(text.size() + len + 2);
  }

  const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
  for (; *us != '\0'; ++us) {
    text.push_back (*us); // append this character
  }
}

// strings returned from getcarr and getcstr become the callers
// responsibility and should be deallocated with "delete []"

// Returns a new array of size() bytes (not NUL-terminated) holding the
// narrowed characters; len receives the number of bytes written.
char *text_t::getcarr(size_type &len) const {
  unsigned char *cstr = new unsigned char[size()];
  len = 0;

  const_iterator ithere = begin();
  const_iterator itend = end();
  for (; ithere != itend; ++ithere) {
    cstr[len++] = narrow_char (*ithere);
  }

  return (char *)cstr;
}

// Returns a new NUL-terminated array holding the narrowed characters.
char *text_t::getcstr() const {
  unsigned char *cstr = new unsigned char[size() + 1];
  size_type len = 0;

  const_iterator ithere = begin();
  const_iterator itend = end();
  for (; ithere != itend; ++ithere) {
    cstr[len++] = narrow_char (*ithere);
  }

  cstr[len] = '\0';

  return (char *)cstr;
}

// Replace every non-overlapping occurrence of toreplace with replacement,
// scanning left to right.  Returns the number of replacements made.
int text_t::replace(text_t toreplace, text_t replacement) {
  // An empty search string matches at every position without consuming
  // any input, which would make the loop below spin forever; treat it as
  // "nothing to replace".
  if (toreplace.empty()) return 0;

  // Get the beginning and end of the current text
  text_t::iterator text_begin = text.begin();
  text_t::iterator text_end = text.end();
  int count = 0;
  text_t new_text;

  // Copy the text across piece by piece, swapping in the replacement
  while (text_begin < text_end) {
    // Find where the next toreplace is
    text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);

    if (next_toreplace != text_end) {
      // We've found a match
      new_text.append(substr(text_begin, next_toreplace));
      new_text.append(replacement);
      ++count;
      text_begin = next_toreplace + toreplace.size();
    } else {
      // We haven't found a match: keep the remainder and stop
      new_text.append(substr(text_begin, text_end));
      text_begin = text_end;
    }
  }

  text.clear();
  text = new_text.text_as_usvector();
  return count;
}


////////////////////////////////////
// general functions which work on text_ts
////////////////////////////////////

// find a character within a range; returns last if c does not occur
text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
				 unsigned short c) {
  while (first != last && *first != c) ++first;
  return first;
}

text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
			   unsigned short c) {
  while (first != last && *first != c) ++first;
  return first;
}

// find the last occurrence of a character within [first, last_plus_one);
// returns last_plus_one if c does not occur (including for an empty range)
text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one,
			       unsigned short c) {
  text_t::iterator current = last_plus_one;
  // step back before dereferencing so that nothing at or beyond
  // last_plus_one is ever read
  while (current != first) {
    --current;
    if (*current == c) return current;
  }
  return last_plus_one;
}

// find the first occurrence of word within [first, last); returns last if
// there is no match
text_t::const_iterator findword (text_t::const_iterator first,
				 text_t::const_iterator last,
				 const text_t& word) {
  return findword_impl (first, last, word);
}

text_t::iterator findword (text_t::iterator first,
			   text_t::iterator last,
			   const text_t& word) {
  return findword_impl (first, last, word);
}

// get a string up to the next delimiter (which is skipped)
// outstr receives the characters before the delimiter; the return value
// is the position from which to continue (last if no delimiter was found)
text_t::const_iterator getdelimitstr (text_t::const_iterator first,
				      text_t::const_iterator last,
				      unsigned short c, text_t &outstr) {
  text_t::const_iterator here = findchar (first, last, c);
  outstr.clear();
  outstr.appendrange (first, here);
  if (here != last) ++here; // skip c
  return here;
}

text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
				unsigned short c, text_t &outstr) {
  text_t::iterator here = findchar (first, last, c);
  outstr.clear();
  outstr.appendrange (first, here);
  if (here != last) ++here; // skip c
  return here;
}

// as above, but the delimiter is a whole word
text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
				      text_t w, text_t &outstr) {
  text_t::const_iterator here = findword (first, last, w);
  outstr.clear();
  outstr.appendrange (first, here);
  if (here != last) here += w.size(); // skip w
  return here;
}

// split a string with a character
// (outlist is emptied first; a delimiter at the very end of the range does
// not produce a trailing empty string)
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tset &outlist) {
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;
  while (first != last) {
    first = getdelimitstr (first, last, c, t);
    outlist.insert (t);
  }
}

void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tlist &outlist) {
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;
  while (first != last) {
    first = getdelimitstr (first, last, c, t);
    outlist.push_back (t);
  }
}

void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tarray &outlist) {
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;
  while (first != last) {
    first = getdelimitstr (first, last, c, t);
    outlist.push_back (t);
  }
}

// split a string with a word
void splitword (text_t::const_iterator first, text_t::const_iterator last,
		text_t w, text_tlist &outlist) {
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;
  while (first != last) {
    first = getdelimitstr (first, last, w, t);
    outlist.push_back (t);
  }
}

// join a string using a character
void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext) {
  join_impl (inlist, c, outtext);
}

void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext) {
  join_impl (inlist, c, outtext);
}

void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext) {
  join_impl (inlist, c, outtext);
}

// join a string using a text_t as the separator
void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext) {
  join_impl (inlist, c, outtext);
}

void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext) {
  join_impl (inlist, c, outtext);
}

void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext) {
  join_impl (inlist, c, outtext);
}

// count the occurrences of a character within a range
int countchar (text_t::const_iterator first, text_t::const_iterator last,
	       unsigned short c) {
  int count = 0;
  for (; first != last; ++first) {
    if (*first == c) ++count;
  }
  return count;
}

// return a substring of string from first up to but not including last
text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
  text_t result;
  result.reserve(last - first + 2);
  for (; first != last; ++first) {
    result.push_back(*first);
  }
  return result;
}

// convert to lowercase
void lc (text_t::iterator first, text_t::iterator last) {
  for (; first != last; ++first) {
    *first = unicode_tolower(*first);
  }
}

// convert to uppercase
// (this definition runs on past the end of the reviewed span; its opening
// is reproduced unchanged and the rest of the statement follows directly)
void uc (text_t::iterator first, text_t::iterator last) {
  while (first != last) {
    *first =
unicode_toupper(*first); ++first; } } // checks to see if it is a number (i.e. contains only 0-9) bool is_number (const text_t &text) { text_t::const_iterator here = text.begin(); text_t::const_iterator end = text.end(); while (here != end) { if ((*here!='0') && (*here!='1') && (*here!='2') && (*here!='3') && (*here!='4') && (*here!='5') && (*here!='6') && (*here!='7') && (*here!='8') && (*here!='9')) return false; ++here; } return true; } // checks to see if the text has any letters or digits bool has_unicode_letdig (const text_t &text) { if (text.empty()) return false; text_t::const_iterator here = text.begin(); text_t::const_iterator end = text.end(); while (here != end) { if (is_unicode_letdig (*here)) return true; ++here; } return false; } // checks to see if a text_t starts with the specified prefix bool starts_with(const text_t& text, const text_t& prefix) { if (prefix.empty()) return true; if (text.empty() || text.size() 0) { output.push_back (*here); // append this character ++here; --len; } start = (char *)here; // save current position status = finished; } // will treat the text_t as a 8-bit string and convert // it to a 16-bit string using the about convert method. text_t inconvertclass::convert (const text_t &t) { text_t out; text_t tmpout; status_t status; text_t::const_iterator here = t.begin(); text_t::const_iterator end = t.end(); unsigned char cbuf[256]; size_t cbuflen = 0; out.clear(); if (out.capacity() < t.size() + 2) out.reserve(t.size() + 2); while (here != end) { while (here != end && cbuflen < 256) { cbuf[cbuflen++] = (unsigned char)(*here & 0xff); ++here; } if (cbuflen > 0) { setinput ((char *)cbuf, cbuflen); status = unfinished; while (status == unfinished) { convert (tmpout, status); out += tmpout; } cbuflen = 0; } } out.setencoding (0); // unicode return out; } // an instance of the default inconvertclass to do simple // conversions. Note that any functions that use this are // not reentrant. 
If a function needs to be reentrant it // should declare its own instance. inconvertclass ascii2text_t; //////////////////////////////////// // outconvertclass methods //////////////////////////////////// // Convert from a text_t class to a char stream // This default version assumes the output is a ascii // character array. If you set the output stream you // can use this class to output to a stream using the // << operator. The << operator can also be conveniently // used to set the output stream by doing something like // // cout << text_t2ascii << text_tstr << anothertext_tstr; // outconvertclass::outconvertclass () { input = NULL; outs = NULL; } void outconvertclass::reset () { input = NULL; outs = NULL; } void outconvertclass::setinput (text_t *theinput) { input = theinput; if (input != NULL) texthere = input->begin(); } void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere) { input = theinput; texthere = thetexthere; } void outconvertclass::convert (char *output, size_t maxlen, size_t &len, status_t &status) { if (input == NULL || output == NULL) { status = finished; return; } // don't want any funny sign conversions happening unsigned char *uoutput = (unsigned char *)output; text_t::iterator textend = input->end(); len = 0; while ((len < maxlen) && (texthere != textend)) { if (*texthere < 256) *uoutput = (unsigned char)(*texthere); else { // put a space or a question mark depending on what // the character is. Question marks tell the user that // they are missing some information. if (is_unicode_space (*texthere)) *uoutput = ' '; else *uoutput = '?'; } ++uoutput; ++len; ++texthere; } if (texthere == textend) status = finished; else status = unfinished; } // will convert the 16-bit string to a 8-bit stream // and place the result in a text_t. This method uses // the above convert function. 
text_t outconvertclass::convert (const text_t &t) { text_t out; unsigned char cbuf[256]; size_t cbuflen = 0; status_t status = unfinished; out.clear(); if (out.capacity() < t.size() + 2) out.reserve(t.size() + 2); setinput ((text_t *)&t); // discard constant while (status == unfinished) { convert ((char *)cbuf, 256, cbuflen, status); out.appendcarr ((char *)cbuf, cbuflen); } out.setencoding (1); // other encoding return out; } void outconvertclass::setostream (ostream *theouts) { outs = theouts; } ostream *outconvertclass::getostream () { return outs; } // an instance of the default outconvertclass to do simple // conversions outconvertclass text_t2ascii; // stream operators for the output class outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter) { outconverter.setostream(&theouts); return outconverter; } #define STREAMBUFSIZE 256 outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t) { ostream *outstream = outconverter.getostream(); if (outstream == NULL) return outconverter; char outbuf[STREAMBUFSIZE]; size_t len; outconvertclass::status_t status = outconvertclass::unfinished; // assume that there is no data needing converting // left in the converter outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion while (status == outconvertclass::unfinished) { outconverter.convert (outbuf, STREAMBUFSIZE, len, status); if (len > 0) outstream->write(outbuf, len); } return outconverter; }