/**********************************************************************
 *
 * text_t.cpp -- a simple 16-bit character string class
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: text_t.cpp 665 1999-10-14 22:52:39Z sjboddie $
 *
 *********************************************************************/

/*
   $Log$
   Revision 1.15  1999/10/14 22:52:39  sjboddie
   joinchar can join using text_t string now too

   Revision 1.14  1999/09/24 02:30:03  rjmcnab
   added function has_unicode_letdig

   Revision 1.13  1999/09/07 04:57:43  sjboddie
   added gpl notice

   Revision 1.12  1999/08/31 08:04:41  rjmcnab
   Fixed a small but hard to find bug in getcarr

   Revision 1.11  1999/07/01 04:05:09  rjmcnab
   Optimised append functions slightly and added a reserve function.

   Revision 1.10  1999/04/26 03:58:03  sjboddie
   added is_number function

   Revision 1.9  1999/04/06 22:17:24  rjmcnab
   Added splits and joins using text_tset.

   Revision 1.8  1999/02/28 23:14:41  rjmcnab
   Added uc and lc to convert to uppercase and lowercase.

   Revision 1.7  1999/02/21 22:26:39  rjmcnab
   Made getint() a constant function.

   Revision 1.6  1999/02/03 01:13:26  sjboddie
   Got interface to handle subcollections and language subcollections -
   committed changes made to some of the collections

   Revision 1.5  1999/01/19 01:38:14  rjmcnab
   Made the source more portable.

   Revision 1.4  1999/01/12 01:51:00  rjmcnab
   Standard header.

   Revision 1.3  1999/01/08 02:33:16  rjmcnab
   Added standard header to source files.

 */

#include "text_t.h"

// NOTE(review): the header names below are reconstructed from the guard
// macros (the names were not legible in the reviewed copy) -- confirm
// them against the build configuration before merging.
#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\algorithm>
#elif defined(GSDL_USE_STL_H)
#  if defined(GSDL_USE_ALGO_H)
#    include <algo.h>
#  else
#    include <algorithm.h>
#  endif
#else
#  include <algorithm>
#endif

#include "unitool.h"


// Narrow one 16-bit character to 8 bits for getcarr/getcstr.
// Values below 256 are copied unchanged. Anything else becomes a
// space (if is_unicode_space says so) or a question mark; question
// marks tell the user that they are missing some information.
static unsigned char text_t_narrow_char (unsigned short c) {
  if (c < 256) return (unsigned char)c;
  if (is_unicode_space (c)) return ' ';
  return '?';
}


////////////////////////////////////
// text_t methods
////////////////////////////////////

// creates an empty string with encoding 0
text_t::text_t () {
  setencoding(0);
  clear ();
}

// creates a string holding the decimal representation of i
text_t::text_t (int i) {
  setencoding(0);
  clear ();
  appendint (i);
}

// creates a string from a null-terminated C string; each byte is
// treated as an unsigned value. A NULL pointer gives an empty string.
text_t::text_t (char *s) {
  setencoding(0);
  clear ();
  appendcstr (s);
}

// appends the contents of t to this string
void text_t::append (const text_t &t) {
  if (&t == this) {
    // a vector may not be asked to insert a range taken from itself
    // (the insertion can reallocate and invalidate the source
    // iterators), so self-appends go through a temporary copy
    text_t copy (t);
    text.insert(text.end(), copy.begin(), copy.end());
    return;
  }

  text.insert(text.end(), t.begin(), t.end());
}

// appends the characters in [first, last). The range must not refer
// to this string's own storage.
void text_t::appendrange (iterator first, iterator last) {
  text.insert(text.end(), first, last);
}

// appends the characters in [first, last). The range must not refer
// to this string's own storage.
void text_t::appendrange (const_iterator first, const_iterator last) {
  text.insert(text.end(), first, last);
}

// appends the decimal representation of i (with a leading '-' for
// negative values)
void text_t::appendint (int i) {
  // deal with zero
  if (i == 0) {
    text.push_back('0');
    return;
  }

  // work with the magnitude as an unsigned value: negating the most
  // negative int as an int would overflow
  unsigned int magnitude = (unsigned int)i;
  if (i < 0) {
    text.push_back('-');
    magnitude = 0u - magnitude;
  }

  // three characters per byte is always enough room for the decimal
  // digits; a local array also avoids a heap allocation (the previous
  // code released a new[] buffer with a non-array delete)
  char buf[sizeof(int)*3];
  int len = 0;

  // get the number in reverse
  while (magnitude > 0) {
    buf[len++] = (char)('0' + (magnitude % 10));
    magnitude /= 10;
  }

  // reverse the number
  while (len > 0) {
    text.push_back(buf[--len]);
  }
}

// parses an optional '+' or '-' followed by decimal digits from the
// start of the string and returns the value. Parsing stops at the
// first non-digit; a string with no leading digits gives 0. No
// overflow checking is done.
int text_t::getint () const {
  int i = 0;
  int mult = 1; // becomes -1 for negative numbers

  const_iterator here = text.begin();
  const_iterator end = text.end();

  // do plus and minus signs
  if (here != end) {
    if (*here == '-') {
      mult = -1;
      ++here;
    } else if (*here == '+') {
      mult = 1;
      ++here;
    }
  }

  // deal with the number
  while ((here != end) && (*here >= '0') && (*here <= '9')) {
    i = 10*i + (*here - '0');
    ++here;
  }

  i *= mult;
  return i;
}

// appends len bytes starting at s, treating each byte as an unsigned
// value. A NULL pointer appends nothing.
void text_t::appendcarr (char *s, size_type len) {
  if (s == NULL) return;

  // don't want any funny sign conversions happening
  unsigned char *us = (unsigned char *)s;
  while (len > 0) {
    text.push_back (*us); // append this character
    ++us;
    --len;
  }
}

// appends a null-terminated C string, treating each byte as an
// unsigned value. A NULL pointer appends nothing.
void text_t::appendcstr (char *s) {
  if (s == NULL) return;

  // don't want any funny sign conversions happening
  unsigned char *us = (unsigned char *)s;
  while (*us != '\0') {
    text.push_back (*us); // append this character
    ++us;
  }
}


// strings returned from getcarr and getcstr become the callers
// responsibility and should be deallocated with "delete []"
// (both are allocated with new [])

// returns an 8-bit copy of the string which is NOT null-terminated;
// len is set to the number of bytes in the returned array
char *text_t::getcarr(size_type &len) const {
  unsigned char *cstr = new unsigned char[size()];
  len = 0;

  const_iterator ithere = begin();
  const_iterator itend = end();
  while (ithere != itend) {
    cstr[len] = text_t_narrow_char (*ithere);
    ++len;
    ++ithere;
  }

  return (char *)cstr;
}

// returns a null-terminated 8-bit copy of the string
char *text_t::getcstr() const {
  unsigned char *cstr = new unsigned char[size() + 1];
  size_type len = 0;

  const_iterator ithere = begin();
  const_iterator itend = end();
  while (ithere != itend) {
    cstr[len] = text_t_narrow_char (*ithere);
    ++len;
    ++ithere;
  }

  cstr[len] = '\0';

  return (char *)cstr;
}


// general functions which work on text_ts

// find a character within a range; returns the position of the first
// occurrence of c, or last if c does not occur
text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
				 unsigned short c) {
  while (first != last) {
    if (*first == c) break;
    ++first;
  }
  return first;
}

text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
			   unsigned short c) {
  while (first != last) {
    if (*first == c) break;
    ++first;
  }
  return first;
}

// get a string up to the next delimiter (which is skipped).
// outstr is replaced by the characters before the delimiter and the
// returned position is just past the delimiter (or last if there was
// no delimiter).
text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
				      unsigned short c, text_t &outstr) {
  text_t::const_iterator here = findchar (first, last, c);
  outstr.clear();
  outstr.appendrange (first, here);
  if (here != last) ++here; // skip c
  return here;
}

text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
				unsigned short c, text_t &outstr) {
  text_t::iterator here = findchar (first, last, c);
  outstr.clear();
  outstr.appendrange (first, here);
  if (here != last) ++here; // skip c
  return here;
}

// split a string with a character.
// outlist is emptied first. Empty fields between two delimiters are
// kept, but a delimiter at the very end of the range does not
// produce a trailing empty field (the loop stops once the range is
// used up).
void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tset &outlist) {
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;

  while (first != last) {
    first = getdelimitstr (first, last, c, t);
    outlist.insert (t);
  }
}

void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tlist &outlist) {
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;

  while (first != last) {
    first = getdelimitstr (first, last, c, t);
    outlist.push_back (t);
  }
}

void splitchar (text_t::const_iterator first, text_t::const_iterator last,
		unsigned short c, text_tarray &outlist) {
  outlist.erase(outlist.begin(), outlist.end());

  text_t t;

  while (first != last) {
    first = getdelimitstr (first, last, c, t);
    outlist.push_back (t);
  }
}

// join a string using a character.
// outtext is replaced by the elements of inlist with c placed
// between each pair of neighbouring elements.
void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext) {
  outtext.clear ();

  text_tset::const_iterator here = inlist.begin ();
  text_tset::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end) {
    if (!first) outtext.push_back (c);
    first = false;
    outtext += *here;
    ++here;
  }
}

void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext) {
  outtext.clear ();

  text_tlist::const_iterator here = inlist.begin ();
  text_tlist::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end) {
    if (!first) outtext.push_back (c);
    first = false;
    outtext += *here;
    ++here;
  }
}

void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext) {
  outtext.clear ();

  text_tarray::const_iterator here = inlist.begin ();
  text_tarray::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end) {
    if (!first) outtext.push_back (c);
    first = false;
    outtext += *here;
    ++here;
  }
}

// as above, but the separator is a whole text_t string
void joinchar (const text_tarray &inlist, text_t c, text_t &outtext) {
  outtext.clear ();

  text_tarray::const_iterator here = inlist.begin ();
  text_tarray::const_iterator end = inlist.end ();
  bool first = true;
  while (here != end) {
    if (!first) outtext += c;
    first = false;
    outtext += *here;
    ++here;
  }
}

// count the occurrences of a character within a range
int countchar (text_t::const_iterator first, text_t::const_iterator last,
	       unsigned short c) {
  int count = 0;
  while (first != last) {
    if (*first == c) ++count;
    ++first;
  }
  return count;
}

// return a substring of string from first up to but not including last
text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
  text_t result;
  result.appendrange (first, last);
  return result;
}

// convert to lowercase (in place)
void lc (text_t::iterator first, text_t::iterator last) {
  while (first != last) {
    *first = unicode_tolower(*first);
    ++first;
  }
}

// convert to uppercase (in place)
void uc (text_t::iterator first, text_t::iterator last) {
  while (first != last) {
    *first = unicode_toupper(*first);
    ++first;
  }
}

// checks to see if it is a number (i.e. contains only 0-9).
// Signs are not accepted, and an empty string is reported as a
// number because it contains no non-digit character.
bool is_number (const text_t &text) {
  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();
  while (here != end) {
    if ((*here < '0') || (*here > '9')) return false;
    ++here;
  }
  return true;
}

// checks to see if the text has any letters or digits
// (false for an empty string)
bool has_unicode_letdig (const text_t &text) {
  text_t::const_iterator here = text.begin();
  text_t::const_iterator end = text.end();
  while (here != end) {
    if (is_unicode_letdig (*here)) return true;
    ++here;
  }

  return false;
}



////////////////////////////////////
// convertclass methods
////////////////////////////////////

// conversion classes used for getting information in to and out of
// the text_t class.

convertclass::convertclass () { // nothing to do } void convertclass::reset () { // nothing to do } //////////////////////////////////// // inconvertclass methods //////////////////////////////////// // convert from a char stream to the text_t class // the default version assumes the input is a ascii // character array inconvertclass::inconvertclass () { start = NULL; len = 0; } void inconvertclass::reset () { start = NULL; len = 0; } void inconvertclass::setinput (char *thestart, size_t thelen) { start = thestart; len = thelen; } void inconvertclass::convert (text_t &output, status_t &status) { output.clear(); if (start == NULL || len == 0) { status = finished; return; } // don't want any funny sign conversions happening unsigned char *here = (unsigned char *)start; while (len > 0) { output.push_back (*here); // append this character ++here; --len; } start = (char *)here; // save current position status = finished; } // will treat the text_t as a 8-bit string and convert // it to a 16-bit string using the about convert method. text_t inconvertclass::convert (const text_t &t) { text_t out; text_t tmpout; status_t status; text_t::const_iterator here = t.begin(); text_t::const_iterator end = t.end(); unsigned char cbuf[256]; size_t cbuflen = 0; while (here != end) { while (here != end && cbuflen < 256) { cbuf[cbuflen++] = (unsigned char)(*here & 0xff); here++; } if (cbuflen > 0) { setinput ((char *)cbuf, cbuflen); status = unfinished; while (status == unfinished) { convert (tmpout, status); out += tmpout; } cbuflen = 0; } } out.setencoding (0); // unicode return out; } // an instance of the default inconvertclass to do simple // conversions. Note that any functions that use this are // not reentrant. If a function needs to be reentrant it // should declare its own instance. 
inconvertclass ascii2text_t; //////////////////////////////////// // outconvertclass methods //////////////////////////////////// // Convert from a text_t class to a char stream // This default version assumes the output is a ascii // character array. If you set the output stream you // can use this class to output to a stream using the // << operator. The << operator can also be conveniently // used to set the output stream by doing something like // // cout << text_t2ascii << text_tstr << anothertext_tstr; // outconvertclass::outconvertclass () { input = NULL; outs = NULL; } void outconvertclass::reset () { input = NULL; outs = NULL; } void outconvertclass::setinput (text_t *theinput) { input = theinput; if (input != NULL) texthere = input->begin(); } void outconvertclass::convert (char *output, size_t maxlen, size_t &len, status_t &status) { if (input == NULL || output == NULL) { status = finished; return; } // don't want any funny sign conversions happening unsigned char *uoutput = (unsigned char *)output; text_t::iterator textend = input->end(); len = 0; while ((len < maxlen) && (texthere != textend)) { if (*texthere < 256) *uoutput = (unsigned char)(*texthere); else { // put a space or a question mark depending on what // the character is. Question marks tell the user that // they are missing some information. if (is_unicode_space (*texthere)) *uoutput = ' '; else *uoutput = '?'; } ++uoutput; ++len; ++texthere; } if (texthere == textend) status = finished; else status = unfinished; } // will convert the 16-bit string to a 8-bit stream // and place the result in a text_t. This method uses // the above convert function. 
text_t outconvertclass::convert (const text_t &t) { text_t out; unsigned char cbuf[256]; size_t cbuflen = 0; status_t status = unfinished; setinput ((text_t *)&t); // discard constant while (status == unfinished) { convert ((char *)cbuf, 256, cbuflen, status); out.appendcarr ((char *)cbuf, cbuflen); } out.setencoding (1); // other encoding return out; } void outconvertclass::setostream (ostream *theouts) { outs = theouts; } ostream *outconvertclass::getostream () { return outs; } // an instance of the default outconvertclass to do simple // conversions outconvertclass text_t2ascii; // stream operators for the output class outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter) { outconverter.setostream(&theouts); return outconverter; } #define STREAMBUFSIZE 256 outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t) { ostream *outstream = outconverter.getostream(); if (outstream == NULL) return outconverter; char outbuf[STREAMBUFSIZE]; size_t len; outconvertclass::status_t status = outconvertclass::unfinished; // assume that there is no data needing converting // left in the converter outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion while (status == outconvertclass::unfinished) { outconverter.convert (outbuf, STREAMBUFSIZE, len, status); if (len > 0) outstream->write(outbuf, len); } return outconverter; }