Context Navigation

source: branches/New_Config_Format-branch/gsdl/lib/text_t.cpp@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago
merged changes to trunk into New_Config_Format branch
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 17.6 KB

Rev	Line
[1076]	1	/**********************************************************************
	2	*
	3	* text_t.cpp -- a simple 16-bit character string class
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
	9	*
	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
	24	* $Id: text_t.cpp 1279 2000-07-12 22:21:53Z sjboddie $
	25	*
	26	*********************************************************************/
	27
	28	/*
	29	$Log$
[1279]	30	Revision 1.17.2.1 2000/07/12 22:20:56 sjboddie
	31	merged changes to trunk into New_Config_Format branch
	32
	33	Revision 1.18 2000/04/14 02:50:12 sjboddie
	34	added text_t versions of joinchar to work with sets and lists
	35
[1076]	36	Revision 1.17 2000/04/06 19:58:03 cs025
	37	Correcting a correction - reinstated all lib files due to silly
	38	CVS confusion.
	39
	40	Revision 1.15 1999/10/14 22:52:39 sjboddie
	41	joinchar can join using text_t string now too
	42
	43	Revision 1.14 1999/09/24 02:30:03 rjmcnab
	44	added function has_unicode_letdig
	45
	46	Revision 1.13 1999/09/07 04:57:43 sjboddie
	47	added gpl notice
	48
	49	Revision 1.12 1999/08/31 08:04:41 rjmcnab
	50	Fixed a small but hard to find bug in getcarr
	51
	52	Revision 1.11 1999/07/01 04:05:09 rjmcnab
	53	Optimised append functions slightly and added a reserve function.
	54
	55	Revision 1.10 1999/04/26 03:58:03 sjboddie
	56	added is_number function
	57
	58	Revision 1.9 1999/04/06 22:17:24 rjmcnab
	59	Added splits and joins using text_tset.
	60
	61	Revision 1.8 1999/02/28 23:14:41 rjmcnab
	62
	63	Added uc and lc to convert to uppercase and lowercase.
	64
	65	Revision 1.7 1999/02/21 22:26:39 rjmcnab
	66
	67	Made getint() a constant function.
	68
	69	Revision 1.6 1999/02/03 01:13:26 sjboddie
	70
	71	Got interface to handle subcollections and language subcollections -
	72	committed changes made to some of the collections
	73
	74	Revision 1.5 1999/01/19 01:38:14 rjmcnab
	75
	76	Made the source more portable.
	77
	78	Revision 1.4 1999/01/12 01:51:00 rjmcnab
	79
	80	Standard header.
	81
	82	Revision 1.3 1999/01/08 02:33:16 rjmcnab
	83
	84	Added standard header to source files.
	85
	86	*/
	87
	88
	89	#include "text_t.h"
	90
	91	#if defined(GSDL_USE_OBJECTSPACE)
	92	# include <ospace\std\algorithm>
	93	#elif defined(GSDL_USE_STL_H)
	94	# if defined(GSDL_USE_ALGO_H)
	95	# include <algo.h>
	96	# else
	97	# include <algorithm.h>
	98	# endif
	99	#else
	100	# include <algorithm>
	101	#endif
	102
	103
	104	#include "unitool.h"
	105
	106	////////////////////////////////////
	107	// text_t methods
	108	////////////////////////////////////
	109
	110	text_t::text_t ()
	111	{
	112	setencoding(0);
	113	clear ();
	114	}
	115
	116	text_t::text_t (int i)
	117	{
	118	setencoding(0);
	119	clear ();
	120	appendint (i);
	121	}
	122
	123	text_t::text_t (char *s)
	124	{
	125	setencoding(0);
	126	clear ();
	127	appendcstr (s);
	128	}
	129
	130	void text_t::append (const text_t &t)
	131	{
	132	text.insert(text.end(), t.begin(), t.end());
	133	// const_iterator here, end=t.end();
	134	// for (here=t.begin(); here!=end;here++)
	135	// {
	136	// text.push_back(*here);
	137	// }
	138	}
	139
	140	void text_t::appendrange (iterator first, iterator last)
	141	{
	142	text.insert(text.end(), first, last);
	143	// while (first != last)
	144	// {
	145	// text.push_back (*first);
	146	// first++;
	147	// }
	148	}
	149
	150	void text_t::appendrange (const_iterator first, const_iterator last)
	151	{
	152	text.insert(text.end(), first, last);
	153	// while (first != last)
	154	// {
	155	// text.push_back (*first);
	156	// first++;
	157	// }
	158	}
	159
	160	void text_t::appendint (int i)
	161	{
	162	// deal with zeros and negatives
	163	if (i == 0)
	164	{
	165	text.push_back('0');
	166	return;
	167	}
	168	else if (i < 0)
	169	{
	170	text.push_back('-');
	171	i *= -1;
	172	}
	173
	174	// get a buffer for the conversion
	175	int maxbuflen = sizeof(int)*3;
	176	char *buf = new char[maxbuflen];
	177	int len = 0;
	178
	179	// get the number in reverse
	180	while (i > 0)
	181	{
	182	buf[len++] = '0'+ (i%10);
	183	i = i/10;
	184	}
	185
	186	// reverse the number
	187	while (len > 0)
	188	{
	189	text.push_back(buf[--len]);
	190	}
	191
	192	delete buf;
	193	}
	194
	195	int text_t::getint () const
	196	{
	197	int i = 0;
	198	int mult = 1; // become -1 for negative numbers
	199
	200	const_iterator here = text.begin();
	201	const_iterator end = text.end();
	202
	203	// do plus and minus signs
	204	if (here != end)
	205	{
	206	if (*here == '-')
	207	{
	208	mult = -1;
	209	here++;
	210	}
	211	else if (*here == '+')
	212	{
	213	mult = 1;
	214	here++;
	215	}
	216	}
	217
	218	// deal with the number
	219	while ((here != end) && (here >= '0') && (here <= '9'))
	220	{
	221	i = 10i + (here - '0');
	222	here++;
	223	}
	224
	225	i *= mult;
	226	return i;
	227	}
	228
	229
	230
	231	void text_t::appendcarr (char *s, size_type len)
	232	{
	233	unsigned char us = (unsigned char )s;
	234	while (len > 0)
	235	{
	236	text.push_back (*us); // append this character
	237	us++;
	238	len--;
	239	}
	240	}
	241
	242	void text_t::appendcstr (char *s)
	243	{
	244	unsigned char us = (unsigned char )s;
	245	while (*us != '\0')
	246	{
	247	text.push_back (*us); // append this character
	248	us++;
	249	}
	250	}
	251
	252
	253	// strings returned from getcarr and getcstr become the callers
	254	// responsibility and should be deallocated with "delete"
	255
	256	char *text_t::getcarr(size_type &len) const
	257	{
	258	unsigned char *cstr = new unsigned char[size()];
	259	len = 0;
	260
	261	const_iterator ithere = begin();
	262	const_iterator itend = end();
	263	while (ithere != itend)
	264	{
	265	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
	266	else {
	267	// put a space or a question mark depending on what
	268	// the character is. Question marks tell the user that
	269	// they are missing some information.
	270	if (is_unicode_space (*ithere)) cstr[len] = ' ';
	271	else cstr[len] = '?';
	272	}
	273	len++;
	274	ithere++;
	275	}
	276
	277	return (char *)cstr;
	278	}
	279
	280	char *text_t::getcstr() const
	281	{
	282	unsigned char *cstr = new unsigned char[size() + 1];
	283	const_iterator ithere = begin();
	284	const_iterator itend = end();
	285	int len = 0;
	286
	287	while (ithere != itend)
	288	{
	289	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
	290	else {
	291	// put a space or a question mark depending on what
	292	// the character is. Question marks tell the user that
	293	// they are missing some information.
	294	if (is_unicode_space (*ithere)) cstr[len] = ' ';
	295	else cstr[len] = '?';
	296	}
	297	len++;
	298	ithere++;
	299	}
	300
	301	cstr[len] = '\0';
	302
	303	return (char *)cstr;
	304	}
	305
	306
	307	// general functions which work on text_ts
	308
	309	// find a character within a range
	310	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
	311	unsigned short c)
	312	{
	313	while (first != last)
	314	{
	315	if (*first == c) break;
	316	first++;
	317	}
	318	return first;
	319	}
	320
	321	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
	322	unsigned short c)
	323	{
	324	while (first != last)
	325	{
	326	if (*first == c) break;
	327	first++;
	328	}
	329	return first;
	330	}
	331
	332	// get a string up to the next delimiter (which is skipped)
	333	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
	334	text_t::const_iterator last,
	335	unsigned short c, text_t &outstr)
	336	{
	337	text_t::const_iterator here = first;
	338	here = findchar (first, last, c);
	339	outstr.clear();
	340	outstr.appendrange (first, here);
	341	if (here != last) here++; // skip c
	342	return here;
	343	}
	344
	345	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
	346	unsigned short c, text_t &outstr)
	347	{
	348	text_t::iterator here = first;
	349	here = findchar (first, last, c);
	350	outstr.clear();
	351	outstr.appendrange (first, here);
	352	if (here != last) here++; // skip c
	353	return here;
	354	}
	355
	356	// split a string with a character
	357	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
	358	unsigned short c, text_tset &outlist)
	359	{
	360	outlist.erase(outlist.begin(), outlist.end());
	361
	362	text_t t;
	363
	364	while (first != last)
	365	{
	366	first = getdelimitstr (first, last, c, t);
	367	outlist.insert (t);
	368	}
	369	}
	370
	371	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
	372	unsigned short c, text_tlist &outlist)
	373	{
	374	outlist.erase(outlist.begin(), outlist.end());
	375
	376	text_t t;
	377
	378	while (first != last)
	379	{
	380	first = getdelimitstr (first, last, c, t);
	381	outlist.push_back (t);
	382	}
	383	}
	384
	385	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
	386	unsigned short c, text_tarray &outlist)
	387	{
	388	outlist.erase(outlist.begin(), outlist.end());
	389
	390	text_t t;
	391
	392	while (first != last)
	393	{
	394	first = getdelimitstr (first, last, c, t);
	395	outlist.push_back (t);
	396	}
	397	}
	398
	399	// join a string using a character
	400	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
	401	{
	402	outtext.clear ();
	403
	404	text_tset::const_iterator here = inlist.begin ();
	405	text_tset::const_iterator end = inlist.end ();
	406	bool first = true;
	407	while (here != end)
	408	{
	409	if (!first) outtext.push_back (c);
	410	first = false;
	411	outtext += *here;
	412	here++;
	413	}
	414	}
	415
	416	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
	417	{
	418	outtext.clear ();
	419
	420	text_tlist::const_iterator here = inlist.begin ();
	421	text_tlist::const_iterator end = inlist.end ();
	422	bool first = true;
	423	while (here != end)
	424	{
	425	if (!first) outtext.push_back (c);
	426	first = false;
	427	outtext += *here;
	428	here++;
	429	}
	430	}
	431
	432	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
	433	{
	434	outtext.clear ();
	435
	436	text_tarray::const_iterator here = inlist.begin ();
	437	text_tarray::const_iterator end = inlist.end ();
	438	bool first = true;
	439	while (here != end)
	440	{
	441	if (!first) outtext.push_back (c);
	442	first = false;
	443	outtext += *here;
	444	here++;
	445	}
	446	}
	447
[1279]	448	void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
	449	{
	450	outtext.clear ();
	451
	452	text_tlist::const_iterator here = inlist.begin ();
	453	text_tlist::const_iterator end = inlist.end ();
	454	bool first = true;
	455	while (here != end)
	456	{
	457	if (!first) outtext += c;
	458	first = false;
	459	outtext += *here;
	460	here++;
	461	}
	462	}
	463
	464	void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
	465	{
	466	outtext.clear ();
	467
	468	text_tset::const_iterator here = inlist.begin ();
	469	text_tset::const_iterator end = inlist.end ();
	470	bool first = true;
	471	while (here != end)
	472	{
	473	if (!first) outtext += c;
	474	first = false;
	475	outtext += *here;
	476	here++;
	477	}
	478	}
	479
[1076]	480	void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
	481	{
	482	outtext.clear ();
	483
	484	text_tarray::const_iterator here = inlist.begin ();
	485	text_tarray::const_iterator end = inlist.end ();
	486	bool first = true;
	487	while (here != end)
	488	{
	489	if (!first) outtext += c;
	490	first = false;
	491	outtext += *here;
	492	here++;
	493	}
	494	}
	495
	496	// count the occurances of a character within a range
	497	int countchar (text_t::const_iterator first, text_t::const_iterator last,
	498	unsigned short c)
	499	{
	500	int count = 0;
	501	while (first != last) {
	502	if (*first == c) count ++;
	503	first ++;
	504	}
	505	return count;
	506	}
	507
	508	// return a substring of string from first up to but not including last
	509	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
	510
	511	text_t substr;
	512	while (first != last) {
	513	substr.push_back(*first);
	514	first ++;
	515	}
	516	return substr;
	517	}
	518
	519
	520	// convert to lowercase
	521	void lc (text_t::iterator first, text_t::iterator last) {
	522	while (first != last) {
	523	first = unicode_tolower(first);
	524	first++;
	525	}
	526	}
	527
	528	// convert to uppercase
	529	void uc (text_t::iterator first, text_t::iterator last) {
	530	while (first != last) {
	531	first = unicode_toupper(first);
	532	first++;
	533	}
	534	}
	535
	536
	537	// checks to see if it is a number (i.e. contains only 0-9)
	538	bool is_number (const text_t &text) {
	539
	540	text_t::const_iterator here = text.begin();
	541	text_t::const_iterator end = text.end();
	542
	543	while (here != end) {
	544	if ((here!='0') && (here!='1') && (*here!='2') &&
	545	(here!='3') && (here!='4') && (*here!='5') &&
	546	(here!='6') && (here!='7') && (*here!='8') &&
	547	(*here!='9')) return false;
	548	here ++;
	549	}
	550	return true;
	551	}
	552
	553
	554	// checks to see if the text has any letters or digits
	555	bool has_unicode_letdig (const text_t &text) {
	556	if (text.empty()) return false;
	557
	558	text_t::const_iterator here = text.begin();
	559	text_t::const_iterator end = text.end();
	560	while (here != end) {
	561	if (is_unicode_letdig (*here)) return true;
	562	here++;
	563	}
	564
	565	return false;
	566	}
	567
	568
	569
	570	////////////////////////////////////
	571	// convertclass methods
	572	////////////////////////////////////
	573
	574	// conversion classes used for getting information in to and out of
	575	// the text_t class.
	576
	577	convertclass::convertclass ()
	578	{
	579	// nothing to do
	580	}
	581
	582	void convertclass::reset ()
	583	{
	584	// nothing to do
	585	}
	586
	587
	588	////////////////////////////////////
	589	// inconvertclass methods
	590	////////////////////////////////////
	591
	592	// convert from a char stream to the text_t class
	593	// the default version assumes the input is an ASCII
	594	// character array
	595
	596	inconvertclass::inconvertclass ()
	597	{
	598	start = NULL;
	599	len = 0;
	600	}
	601
	602
	603	void inconvertclass::reset ()
	604	{
	605	start = NULL;
	606	len = 0;
	607	}
	608
	609	void inconvertclass::setinput (char *thestart, size_t thelen)
	610	{
	611	start = thestart;
	612	len = thelen;
	613	}
	614
	615	void inconvertclass::convert (text_t &output, status_t &status)
	616	{
	617	output.clear();
	618
	619	if (start == NULL \|\| len == 0)
	620	{
	621	status = finished;
	622	return;
	623	}
	624
	625	// don't want any funny sign conversions happening
	626	unsigned char here = (unsigned char )start;
	627	while (len > 0)
	628	{
	629	output.push_back (*here); // append this character
	630	++here;
	631	--len;
	632	}
	633
	634	start = (char *)here; // save current position
	635	status = finished;
	636	}
	637
	638	// will treat the text_t as a 8-bit string and convert
	639	// it to a 16-bit string using the about convert method.
	640	text_t inconvertclass::convert (const text_t &t) {
	641	text_t out;
	642	text_t tmpout;
	643	status_t status;
	644	text_t::const_iterator here = t.begin();
	645	text_t::const_iterator end = t.end();
	646	unsigned char cbuf[256];
	647	size_t cbuflen = 0;
	648
	649	while (here != end) {
	650	while (here != end && cbuflen < 256) {
	651	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
	652	here++;
	653	}
	654
	655	if (cbuflen > 0) {
	656	setinput ((char *)cbuf, cbuflen);
	657	status = unfinished;
	658	while (status == unfinished) {
	659	convert (tmpout, status);
	660	out += tmpout;
	661	}
	662	cbuflen = 0;
	663	}
	664	}
	665
	666	out.setencoding (0); // unicode
	667
	668	return out;
	669	}
	670
	671	// an instance of the default inconvertclass to do simple
	672	// conversions. Note that any functions that use this are
	673	// not reentrant. If a function needs to be reentrant it
	674	// should declare its own instance.
	675	inconvertclass ascii2text_t;
	676
	677
	678	////////////////////////////////////
	679	// outconvertclass methods
	680	////////////////////////////////////
	681
	682	// Convert from a text_t class to a char stream
	683	// This default version assumes the output is an ASCII
	684	// character array. If you set the output stream you
	685	// can use this class to output to a stream using the
	686	// << operator. The << operator can also be conveniently
	687	// used to set the output stream by doing something like
	688	//
	689	// cout << text_t2ascii << text_tstr << anothertext_tstr;
	690	//
	691	outconvertclass::outconvertclass ()
	692	{
	693	input = NULL;
	694	outs = NULL;
	695	}
	696
	697	void outconvertclass::reset ()
	698	{
	699	input = NULL;
	700	outs = NULL;
	701	}
	702
	703	void outconvertclass::setinput (text_t *theinput)
	704	{
	705	input = theinput;
	706	if (input != NULL) texthere = input->begin();
	707	}
	708
	709	void outconvertclass::convert (char *output, size_t maxlen,
	710	size_t &len, status_t &status)
	711	{
	712	if (input == NULL \|\| output == NULL)
	713	{
	714	status = finished;
	715	return;
	716	}
	717
	718	// don't want any funny sign conversions happening
	719	unsigned char uoutput = (unsigned char )output;
	720	text_t::iterator textend = input->end();
	721	len = 0;
	722	while ((len < maxlen) && (texthere != textend))
	723	{
	724	if (texthere < 256) uoutput = (unsigned char)(*texthere);
	725	else {
	726	// put a space or a question mark depending on what
	727	// the character is. Question marks tell the user that
	728	// they are missing some information.
	729	if (is_unicode_space (texthere)) uoutput = ' ';
	730	else *uoutput = '?';
	731	}
	732	++uoutput;
	733	++len;
	734	++texthere;
	735	}
	736
	737	if (texthere == textend) status = finished;
	738	else status = unfinished;
	739	}
	740
	741	// will convert the 16-bit string to a 8-bit stream
	742	// and place the result in a text_t. This method uses
	743	// the above convert function.
	744	text_t outconvertclass::convert (const text_t &t) {
	745	text_t out;
	746	unsigned char cbuf[256];
	747	size_t cbuflen = 0;
	748	status_t status = unfinished;
	749
	750	setinput ((text_t *)&t); // discard constant
	751	while (status == unfinished) {
	752	convert ((char *)cbuf, 256, cbuflen, status);
	753	out.appendcarr ((char *)cbuf, cbuflen);
	754	}
	755
	756	out.setencoding (1); // other encoding
	757
	758	return out;
	759	}
	760
	761
	762	void outconvertclass::setostream (ostream *theouts)
	763	{
	764	outs = theouts;
	765	}
	766
	767	ostream *outconvertclass::getostream ()
	768	{
	769	return outs;
	770	}
	771
	772
	773
	774
	775	// an instance of the default outconvertclass to do simple
	776	// conversions
	777	outconvertclass text_t2ascii;
	778
	779
	780
	781	// stream operators for the output class
	782
	783	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
	784	{
	785	outconverter.setostream(&theouts);
	786	return outconverter;
	787	}
	788
	789
	790	#define STREAMBUFSIZE 256
	791	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
	792	{
	793	ostream *outstream = outconverter.getostream();
	794
	795	if (outstream == NULL) return outconverter;
	796
	797	char outbuf[STREAMBUFSIZE];
	798	size_t len;
	799	outconvertclass::status_t status = outconvertclass::unfinished;
	800
	801	// assume that there is no data needing converting
	802	// left in the converter
	803	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
	804
	805	while (status == outconvertclass::unfinished)
	806	{
	807	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
	808	if (len > 0) outstream->write(outbuf, len);
	809	}
	810
	811	return outconverter;
	812	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: