Context Navigation

source: trunk/gsdl/lib/text_t.cpp@ 2298

Last change on this file since 2298 was 1860, checked in by cs025, 23 years ago
Included CORBA branch for first time
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 18.8 KB

Rev	Line
[1076]	1	/**********************************************************************
	2	*
	3	* text_t.cpp -- a simple 16-bit character string class
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
	9	*
	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
[1860]	24	* $Id: text_t.cpp 1860 2001-01-25 18:26:45Z cs025 $
	25	*
[1076]	26	*********************************************************************/
	27
[1860]	28	/*
	29	$Log$
	30	Revision 1.20 2001/01/25 18:26:44 cs025
	31	Included CORBA branch for first time
	32
	33	Revision 1.15.2.2 2000/04/05 10:19:38 syeates
	34	added automatic conversion to allow text_t's to be <<'ed to ostreams
	35
	36	Revision 1.15.2.1 2000/04/04 15:02:29 cs025
	37	Corba first commit
	38
	39	Revision 1.15 1999/10/14 22:52:39 sjboddie
	40	joinchar can join using text_t string now too
	41
	42	Revision 1.14 1999/09/24 02:30:03 rjmcnab
	43	added function has_unicode_letdig
	44
	45	Revision 1.13 1999/09/07 04:57:43 sjboddie
	46	added gpl notice
	47
	48	Revision 1.12 1999/08/31 08:04:41 rjmcnab
	49	Fixed a small but hard to find bug in getcarr
	50
	51	Revision 1.11 1999/07/01 04:05:09 rjmcnab
	52	Optimised append functions slightly and added a reserve function.
	53
	54	Revision 1.10 1999/04/26 03:58:03 sjboddie
	55	added is_number function
	56
	57	Revision 1.9 1999/04/06 22:17:24 rjmcnab
	58	Added splits and joins using text_tset.
	59
	60	Revision 1.8 1999/02/28 23:14:41 rjmcnab
	61
	62	Added uc and lc to convert to uppercase and lowercase.
	63
	64	Revision 1.7 1999/02/21 22:26:39 rjmcnab
	65
	66	Made getint() a constant function.
	67
	68	Revision 1.6 1999/02/03 01:13:26 sjboddie
	69
	70	Got interface to handle subcollections and language subcollections -
	71	committed changes made to some of the collections
	72
	73	Revision 1.5 1999/01/19 01:38:14 rjmcnab
	74
	75	Made the source more portable.
	76
	77	Revision 1.4 1999/01/12 01:51:00 rjmcnab
	78
	79	Standard header.
	80
	81	Revision 1.3 1999/01/08 02:33:16 rjmcnab
	82
	83	Added standard header to source files.
	84
	85	*/
	86
[1076]	87	#include "text_t.h"
	88
	89	#if defined(GSDL_USE_OBJECTSPACE)
	90	# include <ospace\std\algorithm>
	91	#elif defined(GSDL_USE_STL_H)
	92	# if defined(GSDL_USE_ALGO_H)
	93	# include <algo.h>
	94	# else
	95	# include <algorithm.h>
	96	# endif
	97	#else
	98	# include <algorithm>
	99	#endif
	100
[1860]	101	#ifdef HAVE_CONFIG_H
	102	# ifdef __WIN32__
	103	# include "WIN32cfg.h"
	104	# else
	105	# include "config.h"
	106	# endif
	107	#endif
[1076]	108
[1860]	109
[1076]	110	#include "unitool.h"
	111
	112	////////////////////////////////////
	113	// text_t methods
	114	////////////////////////////////////
	115
[1860]	116	// new stream converter ...
	117	ostream& operator<< (ostream &o, const text_t text)
	118	{
	119	text_t::const_iterator ithere = text.begin();
	120	text_t::const_iterator itend = text.end();
	121
	122	while (ithere != itend)
	123	{
	124	if (*ithere < 256)
	125	{
	126	o << (unsigned char)(*ithere);
	127	}
	128	else
	129	{
	130	// put a space or a question mark depending on what
	131	// the character is. Question marks tell the user that
	132	// they are missing some information.
	133	if (is_unicode_space (*ithere))
	134	o << ' ';
	135	else
	136	o << '?';
	137	}
	138	ithere++;
	139	}
	140
	141	return o;
	142	}
	143
[1076]	144	text_t::text_t ()
	145	{
	146	setencoding(0);
	147	clear ();
	148	}
	149
	150	text_t::text_t (int i)
	151	{
	152	setencoding(0);
	153	clear ();
	154	appendint (i);
	155	}
	156
	157	text_t::text_t (char *s)
	158	{
	159	setencoding(0);
	160	clear ();
	161	appendcstr (s);
	162	}
	163
[1860]	164
[1076]	165	void text_t::append (const text_t &t)
	166	{
	167	text.insert(text.end(), t.begin(), t.end());
	168	// const_iterator here, end=t.end();
	169	// for (here=t.begin(); here!=end;here++)
	170	// {
	171	// text.push_back(*here);
	172	// }
	173	}
	174
	175	void text_t::appendrange (iterator first, iterator last)
	176	{
	177	text.insert(text.end(), first, last);
	178	// while (first != last)
	179	// {
	180	// text.push_back (*first);
	181	// first++;
	182	// }
	183	}
	184
	185	void text_t::appendrange (const_iterator first, const_iterator last)
	186	{
	187	text.insert(text.end(), first, last);
	188	// while (first != last)
	189	// {
	190	// text.push_back (*first);
	191	// first++;
	192	// }
	193	}
	194
	195	void text_t::appendint (int i)
	196	{
	197	// deal with zeros and negatives
	198	if (i == 0)
	199	{
	200	text.push_back('0');
	201	return;
	202	}
	203	else if (i < 0)
	204	{
	205	text.push_back('-');
	206	i *= -1;
	207	}
	208
	209	// get a buffer for the conversion
	210	int maxbuflen = sizeof(int)*3;
	211	char *buf = new char[maxbuflen];
	212	int len = 0;
	213
	214	// get the number in reverse
	215	while (i > 0)
	216	{
	217	buf[len++] = '0'+ (i%10);
	218	i = i/10;
	219	}
	220
	221	// reverse the number
	222	while (len > 0)
	223	{
	224	text.push_back(buf[--len]);
	225	}
	226
	227	delete buf;
	228	}
	229
	230	int text_t::getint () const
	231	{
	232	int i = 0;
	233	int mult = 1; // become -1 for negative numbers
	234
	235	const_iterator here = text.begin();
	236	const_iterator end = text.end();
	237
	238	// do plus and minus signs
	239	if (here != end)
	240	{
	241	if (*here == '-')
	242	{
	243	mult = -1;
	244	here++;
	245	}
	246	else if (*here == '+')
	247	{
	248	mult = 1;
	249	here++;
	250	}
	251	}
	252
	253	// deal with the number
	254	while ((here != end) && (here >= '0') && (here <= '9'))
	255	{
	256	i = 10i + (here - '0');
	257	here++;
	258	}
	259
	260	i *= mult;
	261	return i;
	262	}
	263
	264
	265
	266	void text_t::appendcarr (char *s, size_type len)
	267	{
	268	unsigned char us = (unsigned char )s;
	269	while (len > 0)
	270	{
	271	text.push_back (*us); // append this character
	272	us++;
	273	len--;
	274	}
	275	}
	276
	277	void text_t::appendcstr (char *s)
	278	{
	279	unsigned char us = (unsigned char )s;
	280	while (*us != '\0')
	281	{
	282	text.push_back (*us); // append this character
	283	us++;
	284	}
	285	}
	286
	287
	288	// strings returned from getcarr and getcstr become the caller's
	289	// responsibility and should be deallocated with "delete []"
	290
	291	char *text_t::getcarr(size_type &len) const
	292	{
	293	unsigned char *cstr = new unsigned char[size()];
	294	len = 0;
	295
	296	const_iterator ithere = begin();
	297	const_iterator itend = end();
	298	while (ithere != itend)
	299	{
	300	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
	301	else {
	302	// put a space or a question mark depending on what
	303	// the character is. Question marks tell the user that
	304	// they are missing some information.
	305	if (is_unicode_space (*ithere)) cstr[len] = ' ';
	306	else cstr[len] = '?';
	307	}
	308	len++;
	309	ithere++;
	310	}
	311
	312	return (char *)cstr;
	313	}
	314
	315	char *text_t::getcstr() const
	316	{
	317	unsigned char *cstr = new unsigned char[size() + 1];
	318	const_iterator ithere = begin();
	319	const_iterator itend = end();
	320	int len = 0;
	321
	322	while (ithere != itend)
	323	{
	324	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
	325	else {
	326	// put a space or a question mark depending on what
	327	// the character is. Question marks tell the user that
	328	// they are missing some information.
	329	if (is_unicode_space (*ithere)) cstr[len] = ' ';
	330	else cstr[len] = '?';
	331	}
	332	len++;
	333	ithere++;
	334	}
	335
	336	cstr[len] = '\0';
	337
	338	return (char *)cstr;
	339	}
	340
	341
	342	// general functions which work on text_ts
	343
	344	// find a character within a range
	345	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
	346	unsigned short c)
	347	{
	348	while (first != last)
	349	{
	350	if (*first == c) break;
	351	first++;
	352	}
	353	return first;
	354	}
	355
	356	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
	357	unsigned short c)
	358	{
	359	while (first != last)
	360	{
	361	if (*first == c) break;
	362	first++;
	363	}
	364	return first;
	365	}
	366
[1860]	367	text_t::iterator findword (text_t::iterator first, text_t::iterator last,
	368	const text_t& word)
	369	{
	370	text_t::const_iterator word_begin = word.begin();
	371	text_t::const_iterator word_end = word.end();
	372
	373	while (first != last)
	374	{
	375	text_t::iterator char_match = first;
	376	text_t::const_iterator word_here = word_begin;
	377	while (word_here!=word_end)
	378	{
	379	if (char_match != word_here)
	380	{
	381	break;
	382	}
	383	char_match++;
	384	word_here++;
	385	}
	386	if (word_here==word_end)
	387	{
	388	return first;
	389	}
	390	first++;
	391	}
	392	return last; // get to here only if there is no match
	393	}
	394
[1076]	395	// get a string up to the next delimiter (which is skipped)
	396	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
	397	text_t::const_iterator last,
	398	unsigned short c, text_t &outstr)
	399	{
	400	text_t::const_iterator here = first;
	401	here = findchar (first, last, c);
	402	outstr.clear();
	403	outstr.appendrange (first, here);
	404	if (here != last) here++; // skip c
	405	return here;
	406	}
	407
	408	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
	409	unsigned short c, text_t &outstr)
	410	{
	411	text_t::iterator here = first;
	412	here = findchar (first, last, c);
	413	outstr.clear();
	414	outstr.appendrange (first, here);
	415	if (here != last) here++; // skip c
	416	return here;
	417	}
	418
	419	// split a string with a character
	420	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
	421	unsigned short c, text_tset &outlist)
	422	{
	423	outlist.erase(outlist.begin(), outlist.end());
	424
	425	text_t t;
	426
	427	while (first != last)
	428	{
	429	first = getdelimitstr (first, last, c, t);
	430	outlist.insert (t);
	431	}
	432	}
	433
	434	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
	435	unsigned short c, text_tlist &outlist)
	436	{
	437	outlist.erase(outlist.begin(), outlist.end());
	438
	439	text_t t;
	440
	441	while (first != last)
	442	{
	443	first = getdelimitstr (first, last, c, t);
	444	outlist.push_back (t);
	445	}
	446	}
	447
	448	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
	449	unsigned short c, text_tarray &outlist)
	450	{
	451	outlist.erase(outlist.begin(), outlist.end());
	452
	453	text_t t;
	454
	455	while (first != last)
	456	{
	457	first = getdelimitstr (first, last, c, t);
	458	outlist.push_back (t);
	459	}
	460	}
	461
	462	// join a string using a character
	463	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
	464	{
	465	outtext.clear ();
	466
	467	text_tset::const_iterator here = inlist.begin ();
	468	text_tset::const_iterator end = inlist.end ();
	469	bool first = true;
	470	while (here != end)
	471	{
	472	if (!first) outtext.push_back (c);
	473	first = false;
	474	outtext += *here;
	475	here++;
	476	}
	477	}
	478
	479	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
	480	{
	481	outtext.clear ();
	482
	483	text_tlist::const_iterator here = inlist.begin ();
	484	text_tlist::const_iterator end = inlist.end ();
	485	bool first = true;
	486	while (here != end)
	487	{
	488	if (!first) outtext.push_back (c);
	489	first = false;
	490	outtext += *here;
	491	here++;
	492	}
	493	}
	494
	495	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
	496	{
	497	outtext.clear ();
	498
	499	text_tarray::const_iterator here = inlist.begin ();
	500	text_tarray::const_iterator end = inlist.end ();
	501	bool first = true;
	502	while (here != end)
	503	{
	504	if (!first) outtext.push_back (c);
	505	first = false;
	506	outtext += *here;
	507	here++;
	508	}
	509	}
	510
[1088]	511	void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
	512	{
	513	outtext.clear ();
	514
	515	text_tlist::const_iterator here = inlist.begin ();
	516	text_tlist::const_iterator end = inlist.end ();
	517	bool first = true;
	518	while (here != end)
	519	{
	520	if (!first) outtext += c;
	521	first = false;
	522	outtext += *here;
	523	here++;
	524	}
	525	}
	526
	527	void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
	528	{
	529	outtext.clear ();
	530
	531	text_tset::const_iterator here = inlist.begin ();
	532	text_tset::const_iterator end = inlist.end ();
	533	bool first = true;
	534	while (here != end)
	535	{
	536	if (!first) outtext += c;
	537	first = false;
	538	outtext += *here;
	539	here++;
	540	}
	541	}
	542
[1076]	543	void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
	544	{
	545	outtext.clear ();
	546
	547	text_tarray::const_iterator here = inlist.begin ();
	548	text_tarray::const_iterator end = inlist.end ();
	549	bool first = true;
	550	while (here != end)
	551	{
	552	if (!first) outtext += c;
	553	first = false;
	554	outtext += *here;
	555	here++;
	556	}
	557	}
	558
	559	// count the occurances of a character within a range
	560	int countchar (text_t::const_iterator first, text_t::const_iterator last,
	561	unsigned short c)
	562	{
	563	int count = 0;
	564	while (first != last) {
	565	if (*first == c) count ++;
	566	first ++;
	567	}
	568	return count;
	569	}
	570
	571	// return a substring of string from first up to but not including last
	572	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
	573
	574	text_t substr;
	575	while (first != last) {
	576	substr.push_back(*first);
	577	first ++;
	578	}
	579	return substr;
	580	}
	581
	582
	583	// convert to lowercase
	584	void lc (text_t::iterator first, text_t::iterator last) {
	585	while (first != last) {
	586	first = unicode_tolower(first);
	587	first++;
	588	}
	589	}
	590
	591	// convert to uppercase
	592	void uc (text_t::iterator first, text_t::iterator last) {
	593	while (first != last) {
	594	first = unicode_toupper(first);
	595	first++;
	596	}
	597	}
	598
	599
	600	// checks to see if it is a number (i.e. contains only 0-9)
	601	bool is_number (const text_t &text) {
	602
	603	text_t::const_iterator here = text.begin();
	604	text_t::const_iterator end = text.end();
	605
	606	while (here != end) {
	607	if ((here!='0') && (here!='1') && (*here!='2') &&
	608	(here!='3') && (here!='4') && (*here!='5') &&
	609	(here!='6') && (here!='7') && (*here!='8') &&
	610	(*here!='9')) return false;
	611	here ++;
	612	}
	613	return true;
	614	}
	615
	616
	617	// checks to see if the text has any letters or digits
	618	bool has_unicode_letdig (const text_t &text) {
	619	if (text.empty()) return false;
	620
	621	text_t::const_iterator here = text.begin();
	622	text_t::const_iterator end = text.end();
	623	while (here != end) {
	624	if (is_unicode_letdig (*here)) return true;
	625	here++;
	626	}
	627
	628	return false;
	629	}
	630
	631
	632
	633	////////////////////////////////////
	634	// convertclass methods
	635	////////////////////////////////////
	636
	637	// conversion classes used for getting information in to and out of
	638	// the text_t class.
	639
	640	convertclass::convertclass ()
	641	{
	642	// nothing to do
	643	}
	644
	645	void convertclass::reset ()
	646	{
	647	// nothing to do
	648	}
	649
	650
	651	////////////////////////////////////
	652	// inconvertclass methods
	653	////////////////////////////////////
	654
	655	// convert from a char stream to the text_t class
	656	// the default version assumes the input is a ascii
	657	// character array
	658
	659	inconvertclass::inconvertclass ()
	660	{
	661	start = NULL;
	662	len = 0;
	663	}
	664
	665
	666	void inconvertclass::reset ()
	667	{
	668	start = NULL;
	669	len = 0;
	670	}
	671
	672	void inconvertclass::setinput (char *thestart, size_t thelen)
	673	{
	674	start = thestart;
	675	len = thelen;
	676	}
	677
	678	void inconvertclass::convert (text_t &output, status_t &status)
	679	{
	680	output.clear();
	681
	682	if (start == NULL \|\| len == 0)
	683	{
	684	status = finished;
	685	return;
	686	}
	687
	688	// don't want any funny sign conversions happening
	689	unsigned char here = (unsigned char )start;
	690	while (len > 0)
	691	{
	692	output.push_back (*here); // append this character
	693	++here;
	694	--len;
	695	}
	696
	697	start = (char *)here; // save current position
	698	status = finished;
	699	}
	700
	701	// will treat the text_t as a 8-bit string and convert
	702	// it to a 16-bit string using the about convert method.
	703	text_t inconvertclass::convert (const text_t &t) {
	704	text_t out;
	705	text_t tmpout;
	706	status_t status;
	707	text_t::const_iterator here = t.begin();
	708	text_t::const_iterator end = t.end();
	709	unsigned char cbuf[256];
	710	size_t cbuflen = 0;
	711
	712	while (here != end) {
	713	while (here != end && cbuflen < 256) {
	714	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
	715	here++;
	716	}
	717
	718	if (cbuflen > 0) {
	719	setinput ((char *)cbuf, cbuflen);
	720	status = unfinished;
	721	while (status == unfinished) {
	722	convert (tmpout, status);
	723	out += tmpout;
	724	}
	725	cbuflen = 0;
	726	}
	727	}
	728
	729	out.setencoding (0); // unicode
	730
	731	return out;
	732	}
	733
	734	// an instance of the default inconvertclass to do simple
	735	// conversions. Note that any functions that use this are
	736	// not reentrant. If a function needs to be reentrant it
	737	// should declare its own instance.
	738	inconvertclass ascii2text_t;
	739
	740
	741	////////////////////////////////////
	742	// outconvertclass methods
	743	////////////////////////////////////
	744
	745	// Convert from a text_t class to a char stream
	746	// This default version assumes the output is a ascii
	747	// character array. If you set the output stream you
	748	// can use this class to output to a stream using the
	749	// << operator. The << operator can also be conveniently
	750	// used to set the output stream by doing something like
	751	//
	752	// cout << text_t2ascii << text_tstr << anothertext_tstr;
	753	//
	754	outconvertclass::outconvertclass ()
	755	{
	756	input = NULL;
	757	outs = NULL;
	758	}
	759
	760	void outconvertclass::reset ()
	761	{
	762	input = NULL;
	763	outs = NULL;
	764	}
	765
	766	void outconvertclass::setinput (text_t *theinput)
	767	{
	768	input = theinput;
	769	if (input != NULL) texthere = input->begin();
	770	}
	771
	772	void outconvertclass::convert (char *output, size_t maxlen,
	773	size_t &len, status_t &status)
	774	{
	775	if (input == NULL \|\| output == NULL)
	776	{
	777	status = finished;
	778	return;
	779	}
	780
	781	// don't want any funny sign conversions happening
	782	unsigned char uoutput = (unsigned char )output;
	783	text_t::iterator textend = input->end();
	784	len = 0;
	785	while ((len < maxlen) && (texthere != textend))
	786	{
	787	if (texthere < 256) uoutput = (unsigned char)(*texthere);
	788	else {
	789	// put a space or a question mark depending on what
	790	// the character is. Question marks tell the user that
	791	// they are missing some information.
	792	if (is_unicode_space (texthere)) uoutput = ' ';
	793	else *uoutput = '?';
	794	}
	795	++uoutput;
	796	++len;
	797	++texthere;
	798	}
	799
	800	if (texthere == textend) status = finished;
	801	else status = unfinished;
	802	}
	803
	804	// will convert the 16-bit string to a 8-bit stream
	805	// and place the result in a text_t. This method uses
	806	// the above convert function.
	807	text_t outconvertclass::convert (const text_t &t) {
	808	text_t out;
	809	unsigned char cbuf[256];
	810	size_t cbuflen = 0;
	811	status_t status = unfinished;
	812
	813	setinput ((text_t *)&t); // discard constant
	814	while (status == unfinished) {
	815	convert ((char *)cbuf, 256, cbuflen, status);
	816	out.appendcarr ((char *)cbuf, cbuflen);
	817	}
	818
	819	out.setencoding (1); // other encoding
	820
	821	return out;
	822	}
	823
	824
	825	void outconvertclass::setostream (ostream *theouts)
	826	{
	827	outs = theouts;
	828	}
	829
	830	ostream *outconvertclass::getostream ()
	831	{
	832	return outs;
	833	}
	834
	835
	836
	837
	838	// an instance of the default outconvertclass to do simple
	839	// conversions
	840	outconvertclass text_t2ascii;
	841
	842
	843
	844	// stream operators for the output class
	845
	846	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
	847	{
	848	outconverter.setostream(&theouts);
	849	return outconverter;
	850	}
	851
	852
	853	#define STREAMBUFSIZE 256
	854	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
	855	{
	856	ostream *outstream = outconverter.getostream();
	857
	858	if (outstream == NULL) return outconverter;
	859
	860	char outbuf[STREAMBUFSIZE];
	861	size_t len;
	862	outconvertclass::status_t status = outconvertclass::unfinished;
	863
	864	// assume that there is no data needing converting
	865	// left in the converter
	866	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
	867
	868	while (status == outconvertclass::unfinished)
	869	{
	870	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
	871	if (len > 0) outstream->write(outbuf, len);
	872	}
	873
	874	return outconverter;
	875	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: