Context Navigation

source: main/tags/2.60/gsdl/lib/text_t.cpp@ 23840

Last change on this file since 23840 was 9593, checked in by kjdon, 19 years ago
added some x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 19.9 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 9593 2005-04-07 04:27:18Z kjdon $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.24 2005/04/07 04:27:18 kjdon
31	added some x++ -> ++x changes submitted by Emanuel Dejanu
32
33	Revision 1.23 2004/12/02 22:21:34 kjdon
34	added some changes made by Emanuel Dejanu (Simple Words)
35
36	Revision 1.22 2004/05/24 03:43:22 mdewsnip
37	(Human Info) Added const text_t g_EmptyText("") to use instead of plain ol' "".
38
39	Revision 1.21 2001/06/01 02:51:28 sjboddie
40	Changes to get phind working under windows
41
42	Revision 1.20 2001/01/25 18:26:44 cs025
43	Included CORBA branch for first time
44
45	Revision 1.15.2.2 2000/04/05 10:19:38 syeates
46	added automatic conversion to allow text_t's to be <<'ed to ostreams
47
48	Revision 1.15.2.1 2000/04/04 15:02:29 cs025
49	Corba first commit
50
51	Revision 1.15 1999/10/14 22:52:39 sjboddie
52	joinchar can join using text_t string now too
53
54	Revision 1.14 1999/09/24 02:30:03 rjmcnab
55	added function has_unicode_letdig
56
57	Revision 1.13 1999/09/07 04:57:43 sjboddie
58	added gpl notice
59
60	Revision 1.12 1999/08/31 08:04:41 rjmcnab
61	Fixed a small but hard to find bug in getcarr
62
63	Revision 1.11 1999/07/01 04:05:09 rjmcnab
64	Optimised append functions slightly and added a reserve function.
65
66	Revision 1.10 1999/04/26 03:58:03 sjboddie
67	added is_number function
68
69	Revision 1.9 1999/04/06 22:17:24 rjmcnab
70	Added splits and joins using text_tset.
71
72	Revision 1.8 1999/02/28 23:14:41 rjmcnab
73
74	Added uc and lc to convert to uppercase and lowercase.
75
76	Revision 1.7 1999/02/21 22:26:39 rjmcnab
77
78	Made getint() a constant function.
79
80	Revision 1.6 1999/02/03 01:13:26 sjboddie
81
82	Got interface to handle subcollections and language subcollections -
83	committed changes made to some of the collections
84
85	Revision 1.5 1999/01/19 01:38:14 rjmcnab
86
87	Made the source more portable.
88
89	Revision 1.4 1999/01/12 01:51:00 rjmcnab
90
91	Standard header.
92
93	Revision 1.3 1999/01/08 02:33:16 rjmcnab
94
95	Added standard header to source files.
96
97	*/
98
99	#include "text_t.h"
100
101	#if defined(GSDL_USE_OBJECTSPACE)
102	# include <ospace\std\algorithm>
103	#elif defined(GSDL_USE_STL_H)
104	# if defined(GSDL_USE_ALGO_H)
105	# include <algo.h>
106	# else
107	# include <algorithm.h>
108	# endif
109	#else
110	# include <algorithm>
111	#endif
112
113	#ifdef HAVE_CONFIG_H
114	# ifdef __WIN32__
115	# include "WIN32cfg.h"
116	# else
117	# include "config.h"
118	# endif
119	#endif
120
121
122	#include "unitool.h"
123
124	const text_t g_EmptyText("");
125
126	////////////////////////////////////
127	// text_t methods
128	////////////////////////////////////
129
130	// new stream converter ...
131	ostream& operator<< (ostream &o, const text_t &text)
132	{
133	text_t::const_iterator ithere = text.begin();
134	text_t::const_iterator itend = text.end();
135
136	while (ithere != itend)
137	{
138	if (*ithere < 256)
139	{
140	o << (unsigned char)(*ithere);
141	}
142	else
143	{
144	// put a space or a question mark depending on what
145	// the character is. Question marks tell the user that
146	// they are missing some information.
147	if (is_unicode_space (*ithere))
148	o << ' ';
149	else
150	o << '?';
151	}
152	++ithere;
153	}
154
155	return o;
156	}
157
158	text_t::text_t ()
159	{
160	setencoding(0);
161	clear ();
162	}
163
164	text_t::text_t (int i)
165	{
166	setencoding(0);
167	clear ();
168	appendint (i);
169	}
170
171	text_t::text_t (const char *s)
172	{
173	setencoding(0);
174	clear ();
175	appendcstr (s);
176	}
177
178	text_t::text_t (const char *s, size_type nLength)
179	{
180	setencoding(0);
181	clear ();
182	appendcarr(s, nLength);
183	}
184
185
186	void text_t::append (const text_t &t)
187	{
188	text.insert(text.end(), t.begin(), t.end());
189	}
190
191	void text_t::appendrange (iterator first, iterator last)
192	{
193	text.insert(text.end(), first, last);
194	}
195
196	void text_t::appendrange (const_iterator first, const_iterator last)
197	{
198	text.insert(text.end(), first, last);
199	}
200
201	void text_t::appendint (int i)
202	{
203	// deal with zeros and negatives
204	if (i == 0)
205	{
206	text.push_back('0');
207	return;
208	}
209	else if (i < 0)
210	{
211	text.push_back('-');
212	i *= -1;
213	}
214
215	// get a buffer for the conversion
216	int maxbuflen = sizeof(int)*3;
217	char *buf = new char[maxbuflen];
218	int len = 0;
219
220	// get the number in reverse
221	while (i > 0)
222	{
223	buf[len++] = '0'+ (i%10);
224	i = i/10;
225	}
226
227	// reverse the number
228	while (len > 0)
229	{
230	text.push_back(buf[--len]);
231	}
232
233	delete []buf;
234	}
235
236	int text_t::getint () const
237	{
238	int i = 0;
239	int mult = 1; // become -1 for negative numbers
240
241	const_iterator here = text.begin();
242	const_iterator end = text.end();
243
244	// do plus and minus signs
245	if (here != end)
246	{
247	if (*here == '-')
248	{
249	mult = -1;
250	++here;
251	}
252	else if (*here == '+')
253	{
254	mult = 1;
255	++here;
256	}
257	}
258
259	// deal with the number
260	while ((here != end) && (here >= '0') && (here <= '9'))
261	{
262	i = 10i + (here - '0');
263	++here;
264	}
265
266	i *= mult;
267	return i;
268	}
269
270	unsigned long text_t::getulong () const
271	{
272	unsigned long i = 0;
273
274	const_iterator here = text.begin();
275	const_iterator end = text.end();
276
277	while ((here != end) && (here >= '0') && (here <= '9'))
278	{
279	i = 10i + (here - '0');
280	++here;
281	}
282
283	return i;
284	}
285
286	void text_t::appendcarr (const char *s, size_type len)
287	{
288	unsigned char us = (unsigned char )s;
289	if (text.capacity() < (text.size() + len + 2)) {
290	text.reserve(text.size() + len + 2);
291	}
292
293	while (len > 0)
294	{
295	text.push_back (*us); // append this character
296	++us;
297	--len;
298	}
299	}
300
301	void text_t::appendcstr (const char *s)
302	{
303	size_t len = strlen(s);
304	if (text.capacity() < (text.size() + len + 2)) {
305	text.reserve(text.size() + len + 2);
306	}
307
308	unsigned char us = (unsigned char )s;
309	while (*us != '\0')
310	{
311	text.push_back (*us); // append this character
312	++us;
313	}
314	}
315
316
317	// strings returned from getcarr and getcstr become the callers
318	// responsibility and should be deallocated with "delete []"
319
320	char *text_t::getcarr(size_type &len) const
321	{
322	unsigned char *cstr = new unsigned char[size()];
323	len = 0;
324
325	const_iterator ithere = begin();
326	const_iterator itend = end();
327	while (ithere != itend)
328	{
329	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
330	else {
331	// put a space or a question mark depending on what
332	// the character is. Question marks tell the user that
333	// they are missing some information.
334	if (is_unicode_space (*ithere)) cstr[len] = ' ';
335	else cstr[len] = '?';
336	}
337	++len;
338	++ithere;
339	}
340
341	return (char *)cstr;
342	}
343
344	char *text_t::getcstr() const
345	{
346	unsigned char *cstr = new unsigned char[size() + 1];
347	const_iterator ithere = begin();
348	const_iterator itend = end();
349	int len = 0;
350
351	while (ithere != itend)
352	{
353	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
354	else {
355	// put a space or a question mark depending on what
356	// the character is. Question marks tell the user that
357	// they are missing some information.
358	if (is_unicode_space (*ithere)) cstr[len] = ' ';
359	else cstr[len] = '?';
360	}
361	++len;
362	++ithere;
363	}
364
365	cstr[len] = '\0';
366
367	return (char *)cstr;
368	}
369
370
371	// general functions which work on text_ts
372
373	// find a character within a range
374	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
375	unsigned short c)
376	{
377	while (first != last)
378	{
379	if (*first == c) break;
380	++first;
381	}
382	return first;
383	}
384
385	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
386	unsigned short c)
387	{
388	while (first != last)
389	{
390	if (*first == c) break;
391	++first;
392	}
393	return first;
394	}
395
396	text_t::iterator findword (text_t::iterator first,
397	text_t::iterator last,
398	const text_t& word)
399	{
400	text_t::const_iterator word_begin = word.begin();
401	text_t::const_iterator word_end = word.end();
402
403	while (first != last)
404	{
405	text_t::iterator char_match = first;
406	text_t::const_iterator word_here = word_begin;
407	while (word_here!=word_end)
408	{
409	if (char_match != word_here)
410	{
411	break;
412	}
413	++char_match;
414	++word_here;
415	}
416	if (word_here==word_end)
417	{
418	return first;
419	}
420	++first;
421	}
422	return last; // get to here only if there is no match
423	}
424
425	// get a string up to the next delimiter (which is skipped)
426	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
427	text_t::const_iterator last,
428	unsigned short c, text_t &outstr)
429	{
430	text_t::const_iterator here = first;
431	here = findchar (first, last, c);
432	outstr.clear();
433	outstr.appendrange (first, here);
434	if (here != last) ++here; // skip c
435	return here;
436	}
437
438	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
439	unsigned short c, text_t &outstr)
440	{
441	text_t::iterator here = first;
442	here = findchar (first, last, c);
443	outstr.clear();
444	outstr.appendrange (first, here);
445	if (here != last) ++here; // skip c
446	return here;
447	}
448
449	// split a string with a character
450	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
451	unsigned short c, text_tset &outlist)
452	{
453	outlist.erase(outlist.begin(), outlist.end());
454
455	text_t t;
456
457	while (first != last)
458	{
459	first = getdelimitstr (first, last, c, t);
460	outlist.insert (t);
461	}
462	}
463
464	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
465	unsigned short c, text_tlist &outlist)
466	{
467	outlist.erase(outlist.begin(), outlist.end());
468
469	text_t t;
470
471	while (first != last)
472	{
473	first = getdelimitstr (first, last, c, t);
474	outlist.push_back (t);
475	}
476	}
477
478	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
479	unsigned short c, text_tarray &outlist)
480	{
481	outlist.erase(outlist.begin(), outlist.end());
482
483	text_t t;
484
485	while (first != last)
486	{
487	first = getdelimitstr (first, last, c, t);
488	outlist.push_back (t);
489	}
490	}
491
492	// join a string using a character
493	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
494	{
495	outtext.clear ();
496
497	text_tset::const_iterator here = inlist.begin ();
498	text_tset::const_iterator end = inlist.end ();
499
500	if (here != end) {
501	outtext += *here; ++here;
502	while (here != end) {
503	outtext.push_back (c);
504	outtext += *here;
505	++here;
506	}
507	}
508	}
509
510	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
511	{
512	outtext.clear ();
513
514	text_tlist::const_iterator here = inlist.begin ();
515	text_tlist::const_iterator end = inlist.end ();
516	if (here != end) {
517	outtext += *here; ++here;
518	while (here != end) {
519	outtext.push_back (c);
520	outtext += *here;
521	++here;
522	}
523	}
524	}
525
526	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
527	{
528	outtext.clear ();
529
530	text_tarray::const_iterator here = inlist.begin ();
531	text_tarray::const_iterator end = inlist.end ();
532	if (here != end) {
533	outtext += *here; ++here;
534	while (here != end) {
535	outtext.push_back (c);
536	outtext += *here;
537	++here;
538	}
539	}
540	}
541
542	void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
543	{
544	outtext.clear ();
545
546	text_tlist::const_iterator here = inlist.begin ();
547	text_tlist::const_iterator end = inlist.end ();
548	if (here != end) {
549	outtext += *here; ++here;
550	while (here != end) {
551	outtext += c;
552	outtext += *here;
553	++here;
554	}
555	}
556	}
557
558	void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
559	{
560	outtext.clear ();
561
562	text_tset::const_iterator here = inlist.begin ();
563	text_tset::const_iterator end = inlist.end ();
564	if (here != end) {
565	outtext += *here; ++here;
566	while (here != end) {
567	outtext += c;
568	outtext += *here;
569	++here;
570	}
571	}
572	}
573
574	void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
575	{
576	outtext.clear ();
577
578	text_tarray::const_iterator here = inlist.begin ();
579	text_tarray::const_iterator end = inlist.end ();
580	if (here != end) {
581	outtext += *here; ++here;
582	while (here != end) {
583	outtext += c;
584	outtext += *here;
585	++here;
586	}
587	}
588	}
589
590	// count the occurances of a character within a range
591	int countchar (text_t::const_iterator first, text_t::const_iterator last,
592	unsigned short c)
593	{
594	int count = 0;
595	while (first != last) {
596	if (*first == c) ++count;
597	++first;
598	}
599	return count;
600	}
601
602	// return a substring of string from first up to but not including last
603	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
604
605	text_t substr; substr.reserve(last - first + 2);
606	while (first != last) {
607	substr.push_back(*first);
608	++first;
609	}
610	return substr;
611	}
612
613
614	// convert to lowercase
615	void lc (text_t::iterator first, text_t::iterator last) {
616	while (first != last) {
617	first = unicode_tolower(first);
618	++first;
619	}
620	}
621
622	// convert to uppercase
623	void uc (text_t::iterator first, text_t::iterator last) {
624	while (first != last) {
625	first = unicode_toupper(first);
626	++first;
627	}
628	}
629
630
631	// checks to see if it is a number (i.e. contains only 0-9)
632	bool is_number (const text_t &text) {
633
634	text_t::const_iterator here = text.begin();
635	text_t::const_iterator end = text.end();
636
637	while (here != end) {
638	if ((here!='0') && (here!='1') && (*here!='2') &&
639	(here!='3') && (here!='4') && (*here!='5') &&
640	(here!='6') && (here!='7') && (*here!='8') &&
641	(*here!='9')) return false;
642	++here;
643	}
644	return true;
645	}
646
647
648	// checks to see if the text has any letters or digits
649	bool has_unicode_letdig (const text_t &text) {
650	if (text.empty()) return false;
651
652	text_t::const_iterator here = text.begin();
653	text_t::const_iterator end = text.end();
654	while (here != end) {
655	if (is_unicode_letdig (*here)) return true;
656	++here;
657	}
658
659	return false;
660	}
661
662
663
664	////////////////////////////////////
665	// convertclass methods
666	////////////////////////////////////
667
668	// conversion classes used for getting information in to and out of
669	// the text_t class.
670
671	convertclass::convertclass ()
672	{
673	// nothing to do
674	}
675
676	void convertclass::reset ()
677	{
678	// nothing to do
679	}
680
681
682	////////////////////////////////////
683	// inconvertclass methods
684	////////////////////////////////////
685
686	// convert from a char stream to the text_t class
687	// the default version assumes the input is a ascii
688	// character array
689
690	inconvertclass::inconvertclass ()
691	{
692	start = NULL;
693	len = 0;
694	}
695
696
697	void inconvertclass::reset ()
698	{
699	start = NULL;
700	len = 0;
701	}
702
703	void inconvertclass::setinput (char *thestart, size_t thelen)
704	{
705	start = thestart;
706	len = thelen;
707	}
708
709	void inconvertclass::convert (text_t &output, status_t &status)
710	{
711	output.clear();
712
713	if (start == NULL \|\| len == 0)
714	{
715	status = finished;
716	return;
717	}
718
719	if (output.capacity() < len + 2)
720	output.reserve(len + 2);
721
722	// don't want any funny sign conversions happening
723	unsigned char here = (unsigned char )start;
724	while (len > 0)
725	{
726	output.push_back (*here); // append this character
727	++here;
728	--len;
729	}
730
731	start = (char *)here; // save current position
732	status = finished;
733	}
734
735	// will treat the text_t as a 8-bit string and convert
736	// it to a 16-bit string using the about convert method.
737	text_t inconvertclass::convert (const text_t &t) {
738	text_t out;
739	text_t tmpout;
740	status_t status;
741	text_t::const_iterator here = t.begin();
742	text_t::const_iterator end = t.end();
743	unsigned char cbuf[256];
744	size_t cbuflen = 0;
745
746	out.clear();
747	if (out.capacity() < t.size() + 2)
748	out.reserve(t.size() + 2);
749	while (here != end) {
750	while (here != end && cbuflen < 256) {
751	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
752	++here;
753	}
754
755	if (cbuflen > 0) {
756	setinput ((char *)cbuf, cbuflen);
757	status = unfinished;
758	while (status == unfinished) {
759	convert (tmpout, status);
760	out += tmpout;
761	}
762	cbuflen = 0;
763	}
764	}
765
766	out.setencoding (0); // unicode
767
768	return out;
769	}
770
771	// an instance of the default inconvertclass to do simple
772	// conversions. Note that any functions that use this are
773	// not reentrant. If a function needs to be reentrant it
774	// should declare its own instance.
775	inconvertclass ascii2text_t;
776
777
778	////////////////////////////////////
779	// outconvertclass methods
780	////////////////////////////////////
781
782	// Convert from a text_t class to a char stream
783	// This default version assumes the output is a ascii
784	// character array. If you set the output stream you
785	// can use this class to output to a stream using the
786	// << operator. The << operator can also be conveniently
787	// used to set the output stream by doing something like
788	//
789	// cout << text_t2ascii << text_tstr << anothertext_tstr;
790	//
791	outconvertclass::outconvertclass ()
792	{
793	input = NULL;
794	outs = NULL;
795	}
796
797	void outconvertclass::reset ()
798	{
799	input = NULL;
800	outs = NULL;
801	}
802
803	void outconvertclass::setinput (text_t *theinput)
804	{
805	input = theinput;
806	if (input != NULL) texthere = input->begin();
807	}
808
809	void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
810	{
811	input = theinput;
812	texthere = thetexthere;
813	}
814
815	void outconvertclass::convert (char *output, size_t maxlen,
816	size_t &len, status_t &status)
817	{
818	if (input == NULL \|\| output == NULL)
819	{
820	status = finished;
821	return;
822	}
823
824	// don't want any funny sign conversions happening
825	unsigned char uoutput = (unsigned char )output;
826	text_t::iterator textend = input->end();
827	len = 0;
828	while ((len < maxlen) && (texthere != textend))
829	{
830	if (texthere < 256) uoutput = (unsigned char)(*texthere);
831	else {
832	// put a space or a question mark depending on what
833	// the character is. Question marks tell the user that
834	// they are missing some information.
835	if (is_unicode_space (texthere)) uoutput = ' ';
836	else *uoutput = '?';
837	}
838	++uoutput;
839	++len;
840	++texthere;
841	}
842
843	if (texthere == textend) status = finished;
844	else status = unfinished;
845	}
846
847	// will convert the 16-bit string to a 8-bit stream
848	// and place the result in a text_t. This method uses
849	// the above convert function.
850	text_t outconvertclass::convert (const text_t &t) {
851	text_t out;
852	unsigned char cbuf[256];
853	size_t cbuflen = 0;
854	status_t status = unfinished;
855
856	out.clear();
857	if (out.capacity() < t.size() + 2)
858	out.reserve(t.size() + 2);
859	setinput ((text_t *)&t); // discard constant
860	while (status == unfinished) {
861	convert ((char *)cbuf, 256, cbuflen, status);
862	out.appendcarr ((char *)cbuf, cbuflen);
863	}
864
865	out.setencoding (1); // other encoding
866
867	return out;
868	}
869
870
871	void outconvertclass::setostream (ostream *theouts)
872	{
873	outs = theouts;
874	}
875
876	ostream *outconvertclass::getostream ()
877	{
878	return outs;
879	}
880
881
882
883
884	// an instance of the default outconvertclass to do simple
885	// conversions
886	outconvertclass text_t2ascii;
887
888
889
890	// stream operators for the output class
891
892	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
893	{
894	outconverter.setostream(&theouts);
895	return outconverter;
896	}
897
898
899	#define STREAMBUFSIZE 256
900	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
901	{
902	ostream *outstream = outconverter.getostream();
903
904	if (outstream == NULL) return outconverter;
905
906	char outbuf[STREAMBUFSIZE];
907	size_t len;
908	outconvertclass::status_t status = outconvertclass::unfinished;
909
910	// assume that there is no data needing converting
911	// left in the converter
912	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
913
914	while (status == outconvertclass::unfinished)
915	{
916	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
917	if (len > 0) outstream->write(outbuf, len);
918	}
919
920	return outconverter;
921	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: