Context Navigation

source: main/tags/2.23/gsdl/lib/text_t.cpp@ 33158

Last change on this file since 33158 was 1088, checked in by sjboddie, 24 years ago
added text_t versions of joinchar to work with sets and lists
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 17.5 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 1088 2000-04-14 02:50:12Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.18 2000/04/14 02:50:12 sjboddie
31	added text_t versions of joinchar to work with sets and lists
32
33	Revision 1.17 2000/04/06 19:58:03 cs025
34	Correcting a correction - reinstated all lib files due to silly
35	CVS confusion.
36
37	Revision 1.15 1999/10/14 22:52:39 sjboddie
38	joinchar can join using text_t string now too
39
40	Revision 1.14 1999/09/24 02:30:03 rjmcnab
41	added function has_unicode_letdig
42
43	Revision 1.13 1999/09/07 04:57:43 sjboddie
44	added gpl notice
45
46	Revision 1.12 1999/08/31 08:04:41 rjmcnab
47	Fixed a small but hard to find bug in getcarr
48
49	Revision 1.11 1999/07/01 04:05:09 rjmcnab
50	Optimised append functions slightly and added a reserve function.
51
52	Revision 1.10 1999/04/26 03:58:03 sjboddie
53	added is_number function
54
55	Revision 1.9 1999/04/06 22:17:24 rjmcnab
56	Added splits and joins using text_tset.
57
58	Revision 1.8 1999/02/28 23:14:41 rjmcnab
59
60	Added uc and lc to convert to uppercase and lowercase.
61
62	Revision 1.7 1999/02/21 22:26:39 rjmcnab
63
64	Made getint() a constant function.
65
66	Revision 1.6 1999/02/03 01:13:26 sjboddie
67
68	Got interface to handle subcollections and language subcollections -
69	committed changes made to some of the collections
70
71	Revision 1.5 1999/01/19 01:38:14 rjmcnab
72
73	Made the source more portable.
74
75	Revision 1.4 1999/01/12 01:51:00 rjmcnab
76
77	Standard header.
78
79	Revision 1.3 1999/01/08 02:33:16 rjmcnab
80
81	Added standard header to source files.
82
83	*/
84
85
86	#include "text_t.h"
87
88	#if defined(GSDL_USE_OBJECTSPACE)
89	# include <ospace\std\algorithm>
90	#elif defined(GSDL_USE_STL_H)
91	# if defined(GSDL_USE_ALGO_H)
92	# include <algo.h>
93	# else
94	# include <algorithm.h>
95	# endif
96	#else
97	# include <algorithm>
98	#endif
99
100
101	#include "unitool.h"
102
103	////////////////////////////////////
104	// text_t methods
105	////////////////////////////////////
106
107	text_t::text_t ()
108	{
109	setencoding(0);
110	clear ();
111	}
112
113	text_t::text_t (int i)
114	{
115	setencoding(0);
116	clear ();
117	appendint (i);
118	}
119
120	text_t::text_t (char *s)
121	{
122	setencoding(0);
123	clear ();
124	appendcstr (s);
125	}
126
127	void text_t::append (const text_t &t)
128	{
129	text.insert(text.end(), t.begin(), t.end());
130	// const_iterator here, end=t.end();
131	// for (here=t.begin(); here!=end;here++)
132	// {
133	// text.push_back(*here);
134	// }
135	}
136
137	void text_t::appendrange (iterator first, iterator last)
138	{
139	text.insert(text.end(), first, last);
140	// while (first != last)
141	// {
142	// text.push_back (*first);
143	// first++;
144	// }
145	}
146
147	void text_t::appendrange (const_iterator first, const_iterator last)
148	{
149	text.insert(text.end(), first, last);
150	// while (first != last)
151	// {
152	// text.push_back (*first);
153	// first++;
154	// }
155	}
156
157	void text_t::appendint (int i)
158	{
159	// deal with zeros and negatives
160	if (i == 0)
161	{
162	text.push_back('0');
163	return;
164	}
165	else if (i < 0)
166	{
167	text.push_back('-');
168	i *= -1;
169	}
170
171	// get a buffer for the conversion
172	int maxbuflen = sizeof(int)*3;
173	char *buf = new char[maxbuflen];
174	int len = 0;
175
176	// get the number in reverse
177	while (i > 0)
178	{
179	buf[len++] = '0'+ (i%10);
180	i = i/10;
181	}
182
183	// reverse the number
184	while (len > 0)
185	{
186	text.push_back(buf[--len]);
187	}
188
189	delete buf;
190	}
191
192	int text_t::getint () const
193	{
194	int i = 0;
195	int mult = 1; // become -1 for negative numbers
196
197	const_iterator here = text.begin();
198	const_iterator end = text.end();
199
200	// do plus and minus signs
201	if (here != end)
202	{
203	if (*here == '-')
204	{
205	mult = -1;
206	here++;
207	}
208	else if (*here == '+')
209	{
210	mult = 1;
211	here++;
212	}
213	}
214
215	// deal with the number
216	while ((here != end) && (here >= '0') && (here <= '9'))
217	{
218	i = 10i + (here - '0');
219	here++;
220	}
221
222	i *= mult;
223	return i;
224	}
225
226
227
228	void text_t::appendcarr (char *s, size_type len)
229	{
230	unsigned char us = (unsigned char )s;
231	while (len > 0)
232	{
233	text.push_back (*us); // append this character
234	us++;
235	len--;
236	}
237	}
238
239	void text_t::appendcstr (char *s)
240	{
241	unsigned char us = (unsigned char )s;
242	while (*us != '\0')
243	{
244	text.push_back (*us); // append this character
245	us++;
246	}
247	}
248
249
250	// strings returned from getcarr and getcstr become the callers
251	// responsibility and should be deallocated with "delete"
252
253	char *text_t::getcarr(size_type &len) const
254	{
255	unsigned char *cstr = new unsigned char[size()];
256	len = 0;
257
258	const_iterator ithere = begin();
259	const_iterator itend = end();
260	while (ithere != itend)
261	{
262	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
263	else {
264	// put a space or a question mark depending on what
265	// the character is. Question marks tell the user that
266	// they are missing some information.
267	if (is_unicode_space (*ithere)) cstr[len] = ' ';
268	else cstr[len] = '?';
269	}
270	len++;
271	ithere++;
272	}
273
274	return (char *)cstr;
275	}
276
277	char *text_t::getcstr() const
278	{
279	unsigned char *cstr = new unsigned char[size() + 1];
280	const_iterator ithere = begin();
281	const_iterator itend = end();
282	int len = 0;
283
284	while (ithere != itend)
285	{
286	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
287	else {
288	// put a space or a question mark depending on what
289	// the character is. Question marks tell the user that
290	// they are missing some information.
291	if (is_unicode_space (*ithere)) cstr[len] = ' ';
292	else cstr[len] = '?';
293	}
294	len++;
295	ithere++;
296	}
297
298	cstr[len] = '\0';
299
300	return (char *)cstr;
301	}
302
303
304	// general functions which work on text_ts
305
306	// find a character within a range
307	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
308	unsigned short c)
309	{
310	while (first != last)
311	{
312	if (*first == c) break;
313	first++;
314	}
315	return first;
316	}
317
318	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
319	unsigned short c)
320	{
321	while (first != last)
322	{
323	if (*first == c) break;
324	first++;
325	}
326	return first;
327	}
328
329	// get a string up to the next delimiter (which is skipped)
330	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
331	text_t::const_iterator last,
332	unsigned short c, text_t &outstr)
333	{
334	text_t::const_iterator here = first;
335	here = findchar (first, last, c);
336	outstr.clear();
337	outstr.appendrange (first, here);
338	if (here != last) here++; // skip c
339	return here;
340	}
341
342	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
343	unsigned short c, text_t &outstr)
344	{
345	text_t::iterator here = first;
346	here = findchar (first, last, c);
347	outstr.clear();
348	outstr.appendrange (first, here);
349	if (here != last) here++; // skip c
350	return here;
351	}
352
353	// split a string with a character
354	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
355	unsigned short c, text_tset &outlist)
356	{
357	outlist.erase(outlist.begin(), outlist.end());
358
359	text_t t;
360
361	while (first != last)
362	{
363	first = getdelimitstr (first, last, c, t);
364	outlist.insert (t);
365	}
366	}
367
368	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
369	unsigned short c, text_tlist &outlist)
370	{
371	outlist.erase(outlist.begin(), outlist.end());
372
373	text_t t;
374
375	while (first != last)
376	{
377	first = getdelimitstr (first, last, c, t);
378	outlist.push_back (t);
379	}
380	}
381
382	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
383	unsigned short c, text_tarray &outlist)
384	{
385	outlist.erase(outlist.begin(), outlist.end());
386
387	text_t t;
388
389	while (first != last)
390	{
391	first = getdelimitstr (first, last, c, t);
392	outlist.push_back (t);
393	}
394	}
395
396	// join a string using a character
397	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
398	{
399	outtext.clear ();
400
401	text_tset::const_iterator here = inlist.begin ();
402	text_tset::const_iterator end = inlist.end ();
403	bool first = true;
404	while (here != end)
405	{
406	if (!first) outtext.push_back (c);
407	first = false;
408	outtext += *here;
409	here++;
410	}
411	}
412
413	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
414	{
415	outtext.clear ();
416
417	text_tlist::const_iterator here = inlist.begin ();
418	text_tlist::const_iterator end = inlist.end ();
419	bool first = true;
420	while (here != end)
421	{
422	if (!first) outtext.push_back (c);
423	first = false;
424	outtext += *here;
425	here++;
426	}
427	}
428
429	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
430	{
431	outtext.clear ();
432
433	text_tarray::const_iterator here = inlist.begin ();
434	text_tarray::const_iterator end = inlist.end ();
435	bool first = true;
436	while (here != end)
437	{
438	if (!first) outtext.push_back (c);
439	first = false;
440	outtext += *here;
441	here++;
442	}
443	}
444
445	void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
446	{
447	outtext.clear ();
448
449	text_tlist::const_iterator here = inlist.begin ();
450	text_tlist::const_iterator end = inlist.end ();
451	bool first = true;
452	while (here != end)
453	{
454	if (!first) outtext += c;
455	first = false;
456	outtext += *here;
457	here++;
458	}
459	}
460
461	void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
462	{
463	outtext.clear ();
464
465	text_tset::const_iterator here = inlist.begin ();
466	text_tset::const_iterator end = inlist.end ();
467	bool first = true;
468	while (here != end)
469	{
470	if (!first) outtext += c;
471	first = false;
472	outtext += *here;
473	here++;
474	}
475	}
476
477	void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
478	{
479	outtext.clear ();
480
481	text_tarray::const_iterator here = inlist.begin ();
482	text_tarray::const_iterator end = inlist.end ();
483	bool first = true;
484	while (here != end)
485	{
486	if (!first) outtext += c;
487	first = false;
488	outtext += *here;
489	here++;
490	}
491	}
492
493	// count the occurances of a character within a range
494	int countchar (text_t::const_iterator first, text_t::const_iterator last,
495	unsigned short c)
496	{
497	int count = 0;
498	while (first != last) {
499	if (*first == c) count ++;
500	first ++;
501	}
502	return count;
503	}
504
505	// return a substring of string from first up to but not including last
506	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
507
508	text_t substr;
509	while (first != last) {
510	substr.push_back(*first);
511	first ++;
512	}
513	return substr;
514	}
515
516
517	// convert to lowercase
518	void lc (text_t::iterator first, text_t::iterator last) {
519	while (first != last) {
520	first = unicode_tolower(first);
521	first++;
522	}
523	}
524
525	// convert to uppercase
526	void uc (text_t::iterator first, text_t::iterator last) {
527	while (first != last) {
528	first = unicode_toupper(first);
529	first++;
530	}
531	}
532
533
534	// checks to see if it is a number (i.e. contains only 0-9)
535	bool is_number (const text_t &text) {
536
537	text_t::const_iterator here = text.begin();
538	text_t::const_iterator end = text.end();
539
540	while (here != end) {
541	if ((here!='0') && (here!='1') && (*here!='2') &&
542	(here!='3') && (here!='4') && (*here!='5') &&
543	(here!='6') && (here!='7') && (*here!='8') &&
544	(*here!='9')) return false;
545	here ++;
546	}
547	return true;
548	}
549
550
551	// checks to see if the text has any letters or digits
552	bool has_unicode_letdig (const text_t &text) {
553	if (text.empty()) return false;
554
555	text_t::const_iterator here = text.begin();
556	text_t::const_iterator end = text.end();
557	while (here != end) {
558	if (is_unicode_letdig (*here)) return true;
559	here++;
560	}
561
562	return false;
563	}
564
565
566
567	////////////////////////////////////
568	// convertclass methods
569	////////////////////////////////////
570
571	// conversion classes used for getting information in to and out of
572	// the text_t class.
573
574	convertclass::convertclass ()
575	{
576	// nothing to do
577	}
578
579	void convertclass::reset ()
580	{
581	// nothing to do
582	}
583
584
585	////////////////////////////////////
586	// inconvertclass methods
587	////////////////////////////////////
588
589	// Convert from a char stream to the text_t class.
590	// The default version assumes the input is an ASCII
591	// character array.
592
593	inconvertclass::inconvertclass ()
594	{
595	start = NULL;
596	len = 0;
597	}
598
599
600	void inconvertclass::reset ()
601	{
602	start = NULL;
603	len = 0;
604	}
605
606	void inconvertclass::setinput (char *thestart, size_t thelen)
607	{
608	start = thestart;
609	len = thelen;
610	}
611
612	void inconvertclass::convert (text_t &output, status_t &status)
613	{
614	output.clear();
615
616	if (start == NULL \|\| len == 0)
617	{
618	status = finished;
619	return;
620	}
621
622	// don't want any funny sign conversions happening
623	unsigned char here = (unsigned char )start;
624	while (len > 0)
625	{
626	output.push_back (*here); // append this character
627	++here;
628	--len;
629	}
630
631	start = (char *)here; // save current position
632	status = finished;
633	}
634
635	// will treat the text_t as a 8-bit string and convert
636	// it to a 16-bit string using the about convert method.
637	text_t inconvertclass::convert (const text_t &t) {
638	text_t out;
639	text_t tmpout;
640	status_t status;
641	text_t::const_iterator here = t.begin();
642	text_t::const_iterator end = t.end();
643	unsigned char cbuf[256];
644	size_t cbuflen = 0;
645
646	while (here != end) {
647	while (here != end && cbuflen < 256) {
648	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
649	here++;
650	}
651
652	if (cbuflen > 0) {
653	setinput ((char *)cbuf, cbuflen);
654	status = unfinished;
655	while (status == unfinished) {
656	convert (tmpout, status);
657	out += tmpout;
658	}
659	cbuflen = 0;
660	}
661	}
662
663	out.setencoding (0); // unicode
664
665	return out;
666	}
667
668	// A shared instance of the default inconvertclass for doing simple
669	// conversions. Note that any function that uses this instance is
670	// not reentrant. If a function needs to be reentrant it
671	// should declare its own instance instead.
672	inconvertclass ascii2text_t;
673
674
675	////////////////////////////////////
676	// outconvertclass methods
677	////////////////////////////////////
678
679	// Convert from a text_t class to a char stream
680	// This default version assumes the output is an ASCII
681	// character array. If you set the output stream you
682	// can use this class to output to a stream using the
683	// << operator. The << operator can also be conveniently
684	// used to set the output stream by doing something like
685	//
686	// cout << text_t2ascii << text_tstr << anothertext_tstr;
687	//
688	outconvertclass::outconvertclass ()
689	{
690	input = NULL;
691	outs = NULL;
692	}
693
694	void outconvertclass::reset ()
695	{
696	input = NULL;
697	outs = NULL;
698	}
699
700	void outconvertclass::setinput (text_t *theinput)
701	{
702	input = theinput;
703	if (input != NULL) texthere = input->begin();
704	}
705
706	void outconvertclass::convert (char *output, size_t maxlen,
707	size_t &len, status_t &status)
708	{
709	if (input == NULL \|\| output == NULL)
710	{
711	status = finished;
712	return;
713	}
714
715	// don't want any funny sign conversions happening
716	unsigned char uoutput = (unsigned char )output;
717	text_t::iterator textend = input->end();
718	len = 0;
719	while ((len < maxlen) && (texthere != textend))
720	{
721	if (texthere < 256) uoutput = (unsigned char)(*texthere);
722	else {
723	// put a space or a question mark depending on what
724	// the character is. Question marks tell the user that
725	// they are missing some information.
726	if (is_unicode_space (texthere)) uoutput = ' ';
727	else *uoutput = '?';
728	}
729	++uoutput;
730	++len;
731	++texthere;
732	}
733
734	if (texthere == textend) status = finished;
735	else status = unfinished;
736	}
737
738	// will convert the 16-bit string to a 8-bit stream
739	// and place the result in a text_t. This method uses
740	// the above convert function.
741	text_t outconvertclass::convert (const text_t &t) {
742	text_t out;
743	unsigned char cbuf[256];
744	size_t cbuflen = 0;
745	status_t status = unfinished;
746
747	setinput ((text_t *)&t); // discard constant
748	while (status == unfinished) {
749	convert ((char *)cbuf, 256, cbuflen, status);
750	out.appendcarr ((char *)cbuf, cbuflen);
751	}
752
753	out.setencoding (1); // other encoding
754
755	return out;
756	}
757
758
759	void outconvertclass::setostream (ostream *theouts)
760	{
761	outs = theouts;
762	}
763
764	ostream *outconvertclass::getostream ()
765	{
766	return outs;
767	}
768
769
770
771
772	// A shared instance of the default outconvertclass for doing simple
773	// conversions.
774	outconvertclass text_t2ascii;
775
776
777
778	// stream operators for the output class
779
780	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
781	{
782	outconverter.setostream(&theouts);
783	return outconverter;
784	}
785
786
787	#define STREAMBUFSIZE 256
788	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
789	{
790	ostream *outstream = outconverter.getostream();
791
792	if (outstream == NULL) return outconverter;
793
794	char outbuf[STREAMBUFSIZE];
795	size_t len;
796	outconvertclass::status_t status = outconvertclass::unfinished;
797
798	// assume that there is no data needing converting
799	// left in the converter
800	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
801
802	while (status == outconvertclass::unfinished)
803	{
804	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
805	if (len > 0) outstream->write(outbuf, len);
806	}
807
808	return outconverter;
809	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: