Context Navigation

source: main/tags/2.13/gsdl/lib/text_t.cpp@ 24552

Last change on this file since 24552 was 665, checked in by sjboddie, 25 years ago
joinchar can join using text_t string now too
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.6 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 665 1999-10-14 22:52:39Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.15 1999/10/14 22:52:39 sjboddie
31	joinchar can join using text_t string now too
32
33	Revision 1.14 1999/09/24 02:30:03 rjmcnab
34	added function has_unicode_letdig
35
36	Revision 1.13 1999/09/07 04:57:43 sjboddie
37	added gpl notice
38
39	Revision 1.12 1999/08/31 08:04:41 rjmcnab
40	Fixed a small but hard to find bug in getcarr
41
42	Revision 1.11 1999/07/01 04:05:09 rjmcnab
43	Optimised append functions slightly and added a reserve function.
44
45	Revision 1.10 1999/04/26 03:58:03 sjboddie
46	added is_number function
47
48	Revision 1.9 1999/04/06 22:17:24 rjmcnab
49	Added splits and joins using text_tset.
50
51	Revision 1.8 1999/02/28 23:14:41 rjmcnab
52
53	Added uc and lc to convert to uppercase and lowercase.
54
55	Revision 1.7 1999/02/21 22:26:39 rjmcnab
56
57	Made getint() a constant function.
58
59	Revision 1.6 1999/02/03 01:13:26 sjboddie
60
61	Got interface to handle subcollections and language subcollections -
62	committed changes made to some of the collections
63
64	Revision 1.5 1999/01/19 01:38:14 rjmcnab
65
66	Made the source more portable.
67
68	Revision 1.4 1999/01/12 01:51:00 rjmcnab
69
70	Standard header.
71
72	Revision 1.3 1999/01/08 02:33:16 rjmcnab
73
74	Added standard header to source files.
75
76	*/
77
78
79	#include "text_t.h"
80
81	#if defined(GSDL_USE_OBJECTSPACE)
82	# include <ospace\std\algorithm>
83	#elif defined(GSDL_USE_STL_H)
84	# if defined(GSDL_USE_ALGO_H)
85	# include <algo.h>
86	# else
87	# include <algorithm.h>
88	# endif
89	#else
90	# include <algorithm>
91	#endif
92
93
94	#include "unitool.h"
95
96	////////////////////////////////////
97	// text_t methods
98	////////////////////////////////////
99
100	text_t::text_t ()
101	{
102	setencoding(0);
103	clear ();
104	}
105
106	text_t::text_t (int i)
107	{
108	setencoding(0);
109	clear ();
110	appendint (i);
111	}
112
113	text_t::text_t (char *s)
114	{
115	setencoding(0);
116	clear ();
117	appendcstr (s);
118	}
119
120	void text_t::append (const text_t &t)
121	{
122	text.insert(text.end(), t.begin(), t.end());
123	// const_iterator here, end=t.end();
124	// for (here=t.begin(); here!=end;here++)
125	// {
126	// text.push_back(*here);
127	// }
128	}
129
130	void text_t::appendrange (iterator first, iterator last)
131	{
132	text.insert(text.end(), first, last);
133	// while (first != last)
134	// {
135	// text.push_back (*first);
136	// first++;
137	// }
138	}
139
140	void text_t::appendrange (const_iterator first, const_iterator last)
141	{
142	text.insert(text.end(), first, last);
143	// while (first != last)
144	// {
145	// text.push_back (*first);
146	// first++;
147	// }
148	}
149
150	void text_t::appendint (int i)
151	{
152	// deal with zeros and negatives
153	if (i == 0)
154	{
155	text.push_back('0');
156	return;
157	}
158	else if (i < 0)
159	{
160	text.push_back('-');
161	i *= -1;
162	}
163
164	// get a buffer for the conversion
165	int maxbuflen = sizeof(int)*3;
166	char *buf = new char[maxbuflen];
167	int len = 0;
168
169	// get the number in reverse
170	while (i > 0)
171	{
172	buf[len++] = '0'+ (i%10);
173	i = i/10;
174	}
175
176	// reverse the number
177	while (len > 0)
178	{
179	text.push_back(buf[--len]);
180	}
181
182	delete buf;
183	}
184
185	int text_t::getint () const
186	{
187	int i = 0;
188	int mult = 1; // become -1 for negative numbers
189
190	const_iterator here = text.begin();
191	const_iterator end = text.end();
192
193	// do plus and minus signs
194	if (here != end)
195	{
196	if (*here == '-')
197	{
198	mult = -1;
199	here++;
200	}
201	else if (*here == '+')
202	{
203	mult = 1;
204	here++;
205	}
206	}
207
208	// deal with the number
209	while ((here != end) && (here >= '0') && (here <= '9'))
210	{
211	i = 10i + (here - '0');
212	here++;
213	}
214
215	i *= mult;
216	return i;
217	}
218
219
220
221	void text_t::appendcarr (char *s, size_type len)
222	{
223	unsigned char us = (unsigned char )s;
224	while (len > 0)
225	{
226	text.push_back (*us); // append this character
227	us++;
228	len--;
229	}
230	}
231
232	void text_t::appendcstr (char *s)
233	{
234	unsigned char us = (unsigned char )s;
235	while (*us != '\0')
236	{
237	text.push_back (*us); // append this character
238	us++;
239	}
240	}
241
242
243	// strings returned from getcarr and getcstr become the callers
244	// responsibility and should be deallocated with "delete"
245
246	char *text_t::getcarr(size_type &len) const
247	{
248	unsigned char *cstr = new unsigned char[size()];
249	len = 0;
250
251	const_iterator ithere = begin();
252	const_iterator itend = end();
253	while (ithere != itend)
254	{
255	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
256	else {
257	// put a space or a question mark depending on what
258	// the character is. Question marks tell the user that
259	// they are missing some information.
260	if (is_unicode_space (*ithere)) cstr[len] = ' ';
261	else cstr[len] = '?';
262	}
263	len++;
264	ithere++;
265	}
266
267	return (char *)cstr;
268	}
269
270	char *text_t::getcstr() const
271	{
272	unsigned char *cstr = new unsigned char[size() + 1];
273	const_iterator ithere = begin();
274	const_iterator itend = end();
275	int len = 0;
276
277	while (ithere != itend)
278	{
279	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
280	else {
281	// put a space or a question mark depending on what
282	// the character is. Question marks tell the user that
283	// they are missing some information.
284	if (is_unicode_space (*ithere)) cstr[len] = ' ';
285	else cstr[len] = '?';
286	}
287	len++;
288	ithere++;
289	}
290
291	cstr[len] = '\0';
292
293	return (char *)cstr;
294	}
295
296
297	// general functions which work on text_ts
298
299	// find a character within a range
300	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
301	unsigned short c)
302	{
303	while (first != last)
304	{
305	if (*first == c) break;
306	first++;
307	}
308	return first;
309	}
310
311	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
312	unsigned short c)
313	{
314	while (first != last)
315	{
316	if (*first == c) break;
317	first++;
318	}
319	return first;
320	}
321
322	// get a string up to the next delimiter (which is skipped)
323	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
324	text_t::const_iterator last,
325	unsigned short c, text_t &outstr)
326	{
327	text_t::const_iterator here = first;
328	here = findchar (first, last, c);
329	outstr.clear();
330	outstr.appendrange (first, here);
331	if (here != last) here++; // skip c
332	return here;
333	}
334
335	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
336	unsigned short c, text_t &outstr)
337	{
338	text_t::iterator here = first;
339	here = findchar (first, last, c);
340	outstr.clear();
341	outstr.appendrange (first, here);
342	if (here != last) here++; // skip c
343	return here;
344	}
345
346	// split a string with a character
347	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
348	unsigned short c, text_tset &outlist)
349	{
350	outlist.erase(outlist.begin(), outlist.end());
351
352	text_t t;
353
354	while (first != last)
355	{
356	first = getdelimitstr (first, last, c, t);
357	outlist.insert (t);
358	}
359	}
360
361	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
362	unsigned short c, text_tlist &outlist)
363	{
364	outlist.erase(outlist.begin(), outlist.end());
365
366	text_t t;
367
368	while (first != last)
369	{
370	first = getdelimitstr (first, last, c, t);
371	outlist.push_back (t);
372	}
373	}
374
375	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
376	unsigned short c, text_tarray &outlist)
377	{
378	outlist.erase(outlist.begin(), outlist.end());
379
380	text_t t;
381
382	while (first != last)
383	{
384	first = getdelimitstr (first, last, c, t);
385	outlist.push_back (t);
386	}
387	}
388
389	// join a string using a character
390	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
391	{
392	outtext.clear ();
393
394	text_tset::const_iterator here = inlist.begin ();
395	text_tset::const_iterator end = inlist.end ();
396	bool first = true;
397	while (here != end)
398	{
399	if (!first) outtext.push_back (c);
400	first = false;
401	outtext += *here;
402	here++;
403	}
404	}
405
406	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
407	{
408	outtext.clear ();
409
410	text_tlist::const_iterator here = inlist.begin ();
411	text_tlist::const_iterator end = inlist.end ();
412	bool first = true;
413	while (here != end)
414	{
415	if (!first) outtext.push_back (c);
416	first = false;
417	outtext += *here;
418	here++;
419	}
420	}
421
422	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
423	{
424	outtext.clear ();
425
426	text_tarray::const_iterator here = inlist.begin ();
427	text_tarray::const_iterator end = inlist.end ();
428	bool first = true;
429	while (here != end)
430	{
431	if (!first) outtext.push_back (c);
432	first = false;
433	outtext += *here;
434	here++;
435	}
436	}
437
438	void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
439	{
440	outtext.clear ();
441
442	text_tarray::const_iterator here = inlist.begin ();
443	text_tarray::const_iterator end = inlist.end ();
444	bool first = true;
445	while (here != end)
446	{
447	if (!first) outtext += c;
448	first = false;
449	outtext += *here;
450	here++;
451	}
452	}
453
454	// count the occurances of a character within a range
455	int countchar (text_t::const_iterator first, text_t::const_iterator last,
456	unsigned short c)
457	{
458	int count = 0;
459	while (first != last) {
460	if (*first == c) count ++;
461	first ++;
462	}
463	return count;
464	}
465
466	// return a substring of string from first up to but not including last
467	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
468
469	text_t substr;
470	while (first != last) {
471	substr.push_back(*first);
472	first ++;
473	}
474	return substr;
475	}
476
477
478	// convert to lowercase
479	void lc (text_t::iterator first, text_t::iterator last) {
480	while (first != last) {
481	first = unicode_tolower(first);
482	first++;
483	}
484	}
485
486	// convert to uppercase
487	void uc (text_t::iterator first, text_t::iterator last) {
488	while (first != last) {
489	first = unicode_toupper(first);
490	first++;
491	}
492	}
493
494
495	// checks to see if it is a number (i.e. contains only 0-9)
496	bool is_number (const text_t &text) {
497
498	text_t::const_iterator here = text.begin();
499	text_t::const_iterator end = text.end();
500
501	while (here != end) {
502	if ((here!='0') && (here!='1') && (*here!='2') &&
503	(here!='3') && (here!='4') && (*here!='5') &&
504	(here!='6') && (here!='7') && (*here!='8') &&
505	(*here!='9')) return false;
506	here ++;
507	}
508	return true;
509	}
510
511
512	// checks to see if the text has any letters or digits
513	bool has_unicode_letdig (const text_t &text) {
514	if (text.empty()) return false;
515
516	text_t::const_iterator here = text.begin();
517	text_t::const_iterator end = text.end();
518	while (here != end) {
519	if (is_unicode_letdig (*here)) return true;
520	here++;
521	}
522
523	return false;
524	}
525
526
527
528	////////////////////////////////////
529	// convertclass methods
530	////////////////////////////////////
531
532	// conversion classes used for getting information in to and out of
533	// the text_t class.
534
535	convertclass::convertclass ()
536	{
537	// nothing to do
538	}
539
540	void convertclass::reset ()
541	{
542	// nothing to do
543	}
544
545
546	////////////////////////////////////
547	// inconvertclass methods
548	////////////////////////////////////
549
550	// convert from a char stream to the text_t class
551	// the default version assumes the input is a ascii
552	// character array
553
554	inconvertclass::inconvertclass ()
555	{
556	start = NULL;
557	len = 0;
558	}
559
560
561	void inconvertclass::reset ()
562	{
563	start = NULL;
564	len = 0;
565	}
566
567	void inconvertclass::setinput (char *thestart, size_t thelen)
568	{
569	start = thestart;
570	len = thelen;
571	}
572
573	void inconvertclass::convert (text_t &output, status_t &status)
574	{
575	output.clear();
576
577	if (start == NULL \|\| len == 0)
578	{
579	status = finished;
580	return;
581	}
582
583	// don't want any funny sign conversions happening
584	unsigned char here = (unsigned char )start;
585	while (len > 0)
586	{
587	output.push_back (*here); // append this character
588	++here;
589	--len;
590	}
591
592	start = (char *)here; // save current position
593	status = finished;
594	}
595
596	// will treat the text_t as a 8-bit string and convert
597	// it to a 16-bit string using the about convert method.
598	text_t inconvertclass::convert (const text_t &t) {
599	text_t out;
600	text_t tmpout;
601	status_t status;
602	text_t::const_iterator here = t.begin();
603	text_t::const_iterator end = t.end();
604	unsigned char cbuf[256];
605	size_t cbuflen = 0;
606
607	while (here != end) {
608	while (here != end && cbuflen < 256) {
609	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
610	here++;
611	}
612
613	if (cbuflen > 0) {
614	setinput ((char *)cbuf, cbuflen);
615	status = unfinished;
616	while (status == unfinished) {
617	convert (tmpout, status);
618	out += tmpout;
619	}
620	cbuflen = 0;
621	}
622	}
623
624	out.setencoding (0); // unicode
625
626	return out;
627	}
628
629	// an instance of the default inconvertclass to do simple
630	// conversions. Note that any functions that use this are
631	// not reentrant. If a function needs to be reentrant it
632	// should declare its own instance.
633	inconvertclass ascii2text_t;
634
635
636	////////////////////////////////////
637	// outconvertclass methods
638	////////////////////////////////////
639
640	// Convert from a text_t class to a char stream
641	// This default version assumes the output is a ascii
642	// character array. If you set the output stream you
643	// can use this class to output to a stream using the
644	// << operator. The << operator can also be conveniently
645	// used to set the output stream by doing something like
646	//
647	// cout << text_t2ascii << text_tstr << anothertext_tstr;
648	//
649	outconvertclass::outconvertclass ()
650	{
651	input = NULL;
652	outs = NULL;
653	}
654
655	void outconvertclass::reset ()
656	{
657	input = NULL;
658	outs = NULL;
659	}
660
661	void outconvertclass::setinput (text_t *theinput)
662	{
663	input = theinput;
664	if (input != NULL) texthere = input->begin();
665	}
666
667	void outconvertclass::convert (char *output, size_t maxlen,
668	size_t &len, status_t &status)
669	{
670	if (input == NULL \|\| output == NULL)
671	{
672	status = finished;
673	return;
674	}
675
676	// don't want any funny sign conversions happening
677	unsigned char uoutput = (unsigned char )output;
678	text_t::iterator textend = input->end();
679	len = 0;
680	while ((len < maxlen) && (texthere != textend))
681	{
682	if (texthere < 256) uoutput = (unsigned char)(*texthere);
683	else {
684	// put a space or a question mark depending on what
685	// the character is. Question marks tell the user that
686	// they are missing some information.
687	if (is_unicode_space (texthere)) uoutput = ' ';
688	else *uoutput = '?';
689	}
690	++uoutput;
691	++len;
692	++texthere;
693	}
694
695	if (texthere == textend) status = finished;
696	else status = unfinished;
697	}
698
699	// will convert the 16-bit string to a 8-bit stream
700	// and place the result in a text_t. This method uses
701	// the above convert function.
702	text_t outconvertclass::convert (const text_t &t) {
703	text_t out;
704	unsigned char cbuf[256];
705	size_t cbuflen = 0;
706	status_t status = unfinished;
707
708	setinput ((text_t *)&t); // discard constant
709	while (status == unfinished) {
710	convert ((char *)cbuf, 256, cbuflen, status);
711	out.appendcarr ((char *)cbuf, cbuflen);
712	}
713
714	out.setencoding (1); // other encoding
715
716	return out;
717	}
718
719
720	void outconvertclass::setostream (ostream *theouts)
721	{
722	outs = theouts;
723	}
724
725	ostream *outconvertclass::getostream ()
726	{
727	return outs;
728	}
729
730
731
732
733	// an instance of the default outconvertclass to do simple
734	// conversions
735	outconvertclass text_t2ascii;
736
737
738
739	// stream operators for the output class
740
741	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
742	{
743	outconverter.setostream(&theouts);
744	return outconverter;
745	}
746
747
748	#define STREAMBUFSIZE 256
749	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
750	{
751	ostream *outstream = outconverter.getostream();
752
753	if (outstream == NULL) return outconverter;
754
755	char outbuf[STREAMBUFSIZE];
756	size_t len;
757	outconvertclass::status_t status = outconvertclass::unfinished;
758
759	// assume that there is no data needing converting
760	// left in the converter
761	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
762
763	while (status == outconvertclass::unfinished)
764	{
765	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
766	if (len > 0) outstream->write(outbuf, len);
767	}
768
769	return outconverter;
770	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: