Context Navigation

source: main/tags/2.53/gsdl/lib/text_t.cpp@ 32727

Last change on this file since 32727 was 8727, checked in by kjdon, 19 years ago
added some changes made by Emanuel Dejanu (Simple Words)
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 19.8 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 8727 2004-12-02 22:21:34Z kjdon $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.23 2004/12/02 22:21:34 kjdon
31	added some changes made by Emanuel Dejanu (Simple Words)
32
33	Revision 1.22 2004/05/24 03:43:22 mdewsnip
34	(Human Info) Added const text_t g_EmptyText("") to use instead of plain ol' "".
35
36	Revision 1.21 2001/06/01 02:51:28 sjboddie
37	Changes to get phind working under windows
38
39	Revision 1.20 2001/01/25 18:26:44 cs025
40	Included CORBA branch for first time
41
42	Revision 1.15.2.2 2000/04/05 10:19:38 syeates
43	added automatic conversion to allow text_t's to be <<'ed to ostreams
44
45	Revision 1.15.2.1 2000/04/04 15:02:29 cs025
46	Corba first commit
47
48	Revision 1.15 1999/10/14 22:52:39 sjboddie
49	joinchar can join using text_t string now too
50
51	Revision 1.14 1999/09/24 02:30:03 rjmcnab
52	added function has_unicode_letdig
53
54	Revision 1.13 1999/09/07 04:57:43 sjboddie
55	added gpl notice
56
57	Revision 1.12 1999/08/31 08:04:41 rjmcnab
58	Fixed a small but hard to find bug in getcarr
59
60	Revision 1.11 1999/07/01 04:05:09 rjmcnab
61	Optimised append functions slightly and added a reserve function.
62
63	Revision 1.10 1999/04/26 03:58:03 sjboddie
64	added is_number function
65
66	Revision 1.9 1999/04/06 22:17:24 rjmcnab
67	Added splits and joins using text_tset.
68
69	Revision 1.8 1999/02/28 23:14:41 rjmcnab
70
71	Added uc and lc to convert to uppercase and lowercase.
72
73	Revision 1.7 1999/02/21 22:26:39 rjmcnab
74
75	Made getint() a constant function.
76
77	Revision 1.6 1999/02/03 01:13:26 sjboddie
78
79	Got interface to handle subcollections and language subcollections -
80	committed changes made to some of the collections
81
82	Revision 1.5 1999/01/19 01:38:14 rjmcnab
83
84	Made the source more portable.
85
86	Revision 1.4 1999/01/12 01:51:00 rjmcnab
87
88	Standard header.
89
90	Revision 1.3 1999/01/08 02:33:16 rjmcnab
91
92	Added standard header to source files.
93
94	*/
95
96	#include "text_t.h"
97
98	#if defined(GSDL_USE_OBJECTSPACE)
99	# include <ospace\std\algorithm>
100	#elif defined(GSDL_USE_STL_H)
101	# if defined(GSDL_USE_ALGO_H)
102	# include <algo.h>
103	# else
104	# include <algorithm.h>
105	# endif
106	#else
107	# include <algorithm>
108	#endif
109
110	#ifdef HAVE_CONFIG_H
111	# ifdef __WIN32__
112	# include "WIN32cfg.h"
113	# else
114	# include "config.h"
115	# endif
116	#endif
117
118
119	#include "unitool.h"
120
// Shared empty text_t constant, intended to be used in place of a
// plain "" literal.
const text_t g_EmptyText("");
122
123	////////////////////////////////////
124	// text_t methods
125	////////////////////////////////////
126
127	// new stream converter ...
128	ostream& operator<< (ostream &o, const text_t &text)
129	{
130	text_t::const_iterator ithere = text.begin();
131	text_t::const_iterator itend = text.end();
132
133	while (ithere != itend)
134	{
135	if (*ithere < 256)
136	{
137	o << (unsigned char)(*ithere);
138	}
139	else
140	{
141	// put a space or a question mark depending on what
142	// the character is. Question marks tell the user that
143	// they are missing some information.
144	if (is_unicode_space (*ithere))
145	o << ' ';
146	else
147	o << '?';
148	}
149	++ithere;
150	}
151
152	return o;
153	}
154
// Default constructor: selects encoding 0 (the value this file labels
// "unicode") and clears the string.
text_t::text_t ()
{
  setencoding(0);
  clear ();
}
160
// Construct from an integer: selects encoding 0, clears the string and
// then appends the decimal representation of i (see appendint).
text_t::text_t (int i)
{
  setencoding(0);
  clear ();
  appendint (i);
}
167
// Construct from a NUL-terminated C string: selects encoding 0, clears
// the string and appends the bytes of s (see appendcstr).
text_t::text_t (const char *s)
{
  setencoding(0);
  clear ();
  appendcstr (s);
}
174
// Construct from a character array of known length: selects encoding 0,
// clears the string and appends nLength bytes from s (see appendcarr).
text_t::text_t (const char *s, size_type nLength)
{
  setencoding(0);
  clear ();
  appendcarr(s, nLength);
}
181
182
// Append all characters of t to the end of this string.
void text_t::append (const text_t &t)
{
  text.insert(text.end(), t.begin(), t.end());
}
187
// Append the characters in [first, last) to the end of this string
// (non-const iterator overload).
void text_t::appendrange (iterator first, iterator last)
{
  text.insert(text.end(), first, last);
}
192
// Append the characters in [first, last) to the end of this string
// (const iterator overload).
void text_t::appendrange (const_iterator first, const_iterator last)
{
  text.insert(text.end(), first, last);
}
197
198	void text_t::appendint (int i)
199	{
200	// deal with zeros and negatives
201	if (i == 0)
202	{
203	text.push_back('0');
204	return;
205	}
206	else if (i < 0)
207	{
208	text.push_back('-');
209	i *= -1;
210	}
211
212	// get a buffer for the conversion
213	int maxbuflen = sizeof(int)*3;
214	char *buf = new char[maxbuflen];
215	int len = 0;
216
217	// get the number in reverse
218	while (i > 0)
219	{
220	buf[len++] = '0'+ (i%10);
221	i = i/10;
222	}
223
224	// reverse the number
225	while (len > 0)
226	{
227	text.push_back(buf[--len]);
228	}
229
230	delete []buf;
231	}
232
233	int text_t::getint () const
234	{
235	int i = 0;
236	int mult = 1; // become -1 for negative numbers
237
238	const_iterator here = text.begin();
239	const_iterator end = text.end();
240
241	// do plus and minus signs
242	if (here != end)
243	{
244	if (*here == '-')
245	{
246	mult = -1;
247	here++;
248	}
249	else if (*here == '+')
250	{
251	mult = 1;
252	++here;
253	}
254	}
255
256	// deal with the number
257	while ((here != end) && (here >= '0') && (here <= '9'))
258	{
259	i = 10i + (here - '0');
260	++here;
261	}
262
263	i *= mult;
264	return i;
265	}
266
267	unsigned long text_t::getulong () const
268	{
269	unsigned long i = 0;
270
271	const_iterator here = text.begin();
272	const_iterator end = text.end();
273
274	while ((here != end) && (here >= '0') && (here <= '9'))
275	{
276	i = 10i + (here - '0');
277	++here;
278	}
279
280	return i;
281	}
282
283	void text_t::appendcarr (const char *s, size_type len)
284	{
285	unsigned char us = (unsigned char )s;
286	if (text.capacity() < (text.size() + len + 2)) {
287	text.reserve(text.size() + len + 2);
288	}
289
290	while (len > 0)
291	{
292	text.push_back (*us); // append this character
293	++us;
294	--len;
295	}
296	}
297
298	void text_t::appendcstr (const char *s)
299	{
300	size_t len = strlen(s);
301	if (text.capacity() < (text.size() + len + 2)) {
302	text.reserve(text.size() + len + 2);
303	}
304
305	unsigned char us = (unsigned char )s;
306	while (*us != '\0')
307	{
308	text.push_back (*us); // append this character
309	++us;
310	}
311	}
312
313
314	// strings returned from getcarr and getcstr become the callers
315	// responsibility and should be deallocated with "delete []"
316
317	char *text_t::getcarr(size_type &len) const
318	{
319	unsigned char *cstr = new unsigned char[size()];
320	len = 0;
321
322	const_iterator ithere = begin();
323	const_iterator itend = end();
324	while (ithere != itend)
325	{
326	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
327	else {
328	// put a space or a question mark depending on what
329	// the character is. Question marks tell the user that
330	// they are missing some information.
331	if (is_unicode_space (*ithere)) cstr[len] = ' ';
332	else cstr[len] = '?';
333	}
334	++len;
335	++ithere;
336	}
337
338	return (char *)cstr;
339	}
340
341	char *text_t::getcstr() const
342	{
343	unsigned char *cstr = new unsigned char[size() + 1];
344	const_iterator ithere = begin();
345	const_iterator itend = end();
346	int len = 0;
347
348	while (ithere != itend)
349	{
350	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
351	else {
352	// put a space or a question mark depending on what
353	// the character is. Question marks tell the user that
354	// they are missing some information.
355	if (is_unicode_space (*ithere)) cstr[len] = ' ';
356	else cstr[len] = '?';
357	}
358	++len;
359	++ithere;
360	}
361
362	cstr[len] = '\0';
363
364	return (char *)cstr;
365	}
366
367
368	// general functions which work on text_ts
369
370	// find a character within a range
371	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
372	unsigned short c)
373	{
374	while (first != last)
375	{
376	if (*first == c) break;
377	++first;
378	}
379	return first;
380	}
381
382	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
383	unsigned short c)
384	{
385	while (first != last)
386	{
387	if (*first == c) break;
388	++first;
389	}
390	return first;
391	}
392
393	text_t::iterator findword (text_t::iterator first,
394	text_t::iterator last,
395	const text_t& word)
396	{
397	text_t::const_iterator word_begin = word.begin();
398	text_t::const_iterator word_end = word.end();
399
400	while (first != last)
401	{
402	text_t::iterator char_match = first;
403	text_t::const_iterator word_here = word_begin;
404	while (word_here!=word_end)
405	{
406	if (char_match != word_here)
407	{
408	break;
409	}
410	++char_match;
411	++word_here;
412	}
413	if (word_here==word_end)
414	{
415	return first;
416	}
417	++first;
418	}
419	return last; // get to here only if there is no match
420	}
421
422	// get a string up to the next delimiter (which is skipped)
423	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
424	text_t::const_iterator last,
425	unsigned short c, text_t &outstr)
426	{
427	text_t::const_iterator here = first;
428	here = findchar (first, last, c);
429	outstr.clear();
430	outstr.appendrange (first, here);
431	if (here != last) ++here; // skip c
432	return here;
433	}
434
435	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
436	unsigned short c, text_t &outstr)
437	{
438	text_t::iterator here = first;
439	here = findchar (first, last, c);
440	outstr.clear();
441	outstr.appendrange (first, here);
442	if (here != last) ++here; // skip c
443	return here;
444	}
445
446	// split a string with a character
447	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
448	unsigned short c, text_tset &outlist)
449	{
450	outlist.erase(outlist.begin(), outlist.end());
451
452	text_t t;
453
454	while (first != last)
455	{
456	first = getdelimitstr (first, last, c, t);
457	outlist.insert (t);
458	}
459	}
460
461	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
462	unsigned short c, text_tlist &outlist)
463	{
464	outlist.erase(outlist.begin(), outlist.end());
465
466	text_t t;
467
468	while (first != last)
469	{
470	first = getdelimitstr (first, last, c, t);
471	outlist.push_back (t);
472	}
473	}
474
475	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
476	unsigned short c, text_tarray &outlist)
477	{
478	outlist.erase(outlist.begin(), outlist.end());
479
480	text_t t;
481
482	while (first != last)
483	{
484	first = getdelimitstr (first, last, c, t);
485	outlist.push_back (t);
486	}
487	}
488
489	// join a string using a character
490	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
491	{
492	outtext.clear ();
493
494	text_tset::const_iterator here = inlist.begin ();
495	text_tset::const_iterator end = inlist.end ();
496
497	if (here != end) {
498	outtext += *here; ++here;
499	while (here != end) {
500	outtext.push_back (c);
501	outtext += *here;
502	++here;
503	}
504	}
505	}
506
507	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
508	{
509	outtext.clear ();
510
511	text_tlist::const_iterator here = inlist.begin ();
512	text_tlist::const_iterator end = inlist.end ();
513	if (here != end) {
514	outtext += *here; ++here;
515	while (here != end) {
516	outtext.push_back (c);
517	outtext += *here;
518	++here;
519	}
520	}
521	}
522
523	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
524	{
525	outtext.clear ();
526
527	text_tarray::const_iterator here = inlist.begin ();
528	text_tarray::const_iterator end = inlist.end ();
529	if (here != end) {
530	outtext += *here; ++here;
531	while (here != end) {
532	outtext.push_back (c);
533	outtext += *here;
534	++here;
535	}
536	}
537	}
538
539	void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
540	{
541	outtext.clear ();
542
543	text_tlist::const_iterator here = inlist.begin ();
544	text_tlist::const_iterator end = inlist.end ();
545	if (here != end) {
546	outtext += *here; ++here;
547	while (here != end) {
548	outtext += c;
549	outtext += *here;
550	++here;
551	}
552	}
553	}
554
555	void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
556	{
557	outtext.clear ();
558
559	text_tset::const_iterator here = inlist.begin ();
560	text_tset::const_iterator end = inlist.end ();
561	if (here != end) {
562	outtext += *here; ++here;
563	while (here != end) {
564	outtext += c;
565	outtext += *here;
566	++here;
567	}
568	}
569	}
570
571	void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
572	{
573	outtext.clear ();
574
575	text_tarray::const_iterator here = inlist.begin ();
576	text_tarray::const_iterator end = inlist.end ();
577	if (here != end) {
578	outtext += *here; ++here;
579	while (here != end) {
580	outtext += c;
581	outtext += *here;
582	++here;
583	}
584	}
585	}
586
587	// count the occurances of a character within a range
588	int countchar (text_t::const_iterator first, text_t::const_iterator last,
589	unsigned short c)
590	{
591	int count = 0;
592	while (first != last) {
593	if (*first == c) count ++;
594	first ++;
595	}
596	return count;
597	}
598
599	// return a substring of string from first up to but not including last
600	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
601
602	text_t substr; substr.reserve(last - first + 2);
603	while (first != last) {
604	substr.push_back(*first);
605	++first;
606	}
607	return substr;
608	}
609
610
611	// convert to lowercase
612	void lc (text_t::iterator first, text_t::iterator last) {
613	while (first != last) {
614	first = unicode_tolower(first);
615	++first;
616	}
617	}
618
619	// convert to uppercase
620	void uc (text_t::iterator first, text_t::iterator last) {
621	while (first != last) {
622	first = unicode_toupper(first);
623	++first;
624	}
625	}
626
627
628	// checks to see if it is a number (i.e. contains only 0-9)
629	bool is_number (const text_t &text) {
630
631	text_t::const_iterator here = text.begin();
632	text_t::const_iterator end = text.end();
633
634	while (here != end) {
635	if ((here!='0') && (here!='1') && (*here!='2') &&
636	(here!='3') && (here!='4') && (*here!='5') &&
637	(here!='6') && (here!='7') && (*here!='8') &&
638	(*here!='9')) return false;
639	++here;
640	}
641	return true;
642	}
643
644
645	// checks to see if the text has any letters or digits
646	bool has_unicode_letdig (const text_t &text) {
647	if (text.empty()) return false;
648
649	text_t::const_iterator here = text.begin();
650	text_t::const_iterator end = text.end();
651	while (here != end) {
652	if (is_unicode_letdig (*here)) return true;
653	++here;
654	}
655
656	return false;
657	}
658
659
660
661	////////////////////////////////////
662	// convertclass methods
663	////////////////////////////////////
664
665	// conversion classes used for getting information in to and out of
666	// the text_t class.
667
// Constructor for convertclass; it performs no initialisation here.
convertclass::convertclass ()
{
  // nothing to do
}
672
// Reset the converter; this implementation has nothing to reset.
void convertclass::reset ()
{
  // nothing to do
}
677
678
679	////////////////////////////////////
680	// inconvertclass methods
681	////////////////////////////////////
682
// Convert from a char stream to the text_t class.
// The default version assumes the input is an ASCII
// character array.

// Constructor: starts with no input buffer attached.
inconvertclass::inconvertclass ()
{
  start = NULL;
  len = 0;
}
692
693
// Forget any input buffer that was set with setinput.
void inconvertclass::reset ()
{
  start = NULL;
  len = 0;
}
699
// Set the buffer to be read by the next call to convert.
// Only the pointer and length are stored; the data is not copied.
void inconvertclass::setinput (char *thestart, size_t thelen)
{
  start = thestart;
  len = thelen;
}
705
706	void inconvertclass::convert (text_t &output, status_t &status)
707	{
708	output.clear();
709
710	if (start == NULL \|\| len == 0)
711	{
712	status = finished;
713	return;
714	}
715
716	if (output.capacity() < len + 2)
717	output.reserve(len + 2);
718
719	// don't want any funny sign conversions happening
720	unsigned char here = (unsigned char )start;
721	while (len > 0)
722	{
723	output.push_back (*here); // append this character
724	++here;
725	--len;
726	}
727
728	start = (char *)here; // save current position
729	status = finished;
730	}
731
732	// will treat the text_t as a 8-bit string and convert
733	// it to a 16-bit string using the about convert method.
734	text_t inconvertclass::convert (const text_t &t) {
735	text_t out;
736	text_t tmpout;
737	status_t status;
738	text_t::const_iterator here = t.begin();
739	text_t::const_iterator end = t.end();
740	unsigned char cbuf[256];
741	size_t cbuflen = 0;
742
743	out.clear();
744	if (out.capacity() < t.size() + 2)
745	out.reserve(t.size() + 2);
746	while (here != end) {
747	while (here != end && cbuflen < 256) {
748	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
749	++here;
750	}
751
752	if (cbuflen > 0) {
753	setinput ((char *)cbuf, cbuflen);
754	status = unfinished;
755	while (status == unfinished) {
756	convert (tmpout, status);
757	out += tmpout;
758	}
759	cbuflen = 0;
760	}
761	}
762
763	out.setencoding (0); // unicode
764
765	return out;
766	}
767
// A shared instance of the default inconvertclass for simple
// conversions.  Note that any function that uses this instance is
// not reentrant.  A function that needs to be reentrant
// should declare its own instance instead.
inconvertclass ascii2text_t;
773
774
775	////////////////////////////////////
776	// outconvertclass methods
777	////////////////////////////////////
778
// Convert from a text_t class to a char stream.
// This default version assumes the output is an ASCII
// character array. If you set the output stream you
// can use this class to output to a stream using the
// << operator. The << operator can also be conveniently
// used to set the output stream by doing something like
//
// cout << text_t2ascii << text_tstr << anothertext_tstr;
//
// Constructor: starts with no input text and no output stream.
outconvertclass::outconvertclass ()
{
  input = NULL;
  outs = NULL;
}
793
// Forget both the input text and the output stream.
void outconvertclass::reset ()
{
  input = NULL;
  outs = NULL;
}
799
// Set the text to be converted and rewind the read position to its
// beginning.  Only the pointer is stored; the text is not copied.
void outconvertclass::setinput (text_t *theinput)
{
  input = theinput;
  // a NULL input leaves the read position untouched
  if (input != NULL) texthere = input->begin();
}
805
// Set the text to be converted together with an explicit read
// position, instead of starting from the beginning as setinput does.
void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
{
  input = theinput;
  texthere = thetexthere;
}
811
812	void outconvertclass::convert (char *output, size_t maxlen,
813	size_t &len, status_t &status)
814	{
815	if (input == NULL \|\| output == NULL)
816	{
817	status = finished;
818	return;
819	}
820
821	// don't want any funny sign conversions happening
822	unsigned char uoutput = (unsigned char )output;
823	text_t::iterator textend = input->end();
824	len = 0;
825	while ((len < maxlen) && (texthere != textend))
826	{
827	if (texthere < 256) uoutput = (unsigned char)(*texthere);
828	else {
829	// put a space or a question mark depending on what
830	// the character is. Question marks tell the user that
831	// they are missing some information.
832	if (is_unicode_space (texthere)) uoutput = ' ';
833	else *uoutput = '?';
834	}
835	++uoutput;
836	++len;
837	++texthere;
838	}
839
840	if (texthere == textend) status = finished;
841	else status = unfinished;
842	}
843
844	// will convert the 16-bit string to a 8-bit stream
845	// and place the result in a text_t. This method uses
846	// the above convert function.
847	text_t outconvertclass::convert (const text_t &t) {
848	text_t out;
849	unsigned char cbuf[256];
850	size_t cbuflen = 0;
851	status_t status = unfinished;
852
853	out.clear();
854	if (out.capacity() < t.size() + 2)
855	out.reserve(t.size() + 2);
856	setinput ((text_t *)&t); // discard constant
857	while (status == unfinished) {
858	convert ((char *)cbuf, 256, cbuflen, status);
859	out.appendcarr ((char *)cbuf, cbuflen);
860	}
861
862	out.setencoding (1); // other encoding
863
864	return out;
865	}
866
867
// Remember the stream that converted text should be written to.
// Only the pointer is stored.
void outconvertclass::setostream (ostream *theouts)
{
  outs = theouts;
}
872
// Return the stream set with setostream, or NULL if none has been set
// since construction or the last reset.
ostream *outconvertclass::getostream ()
{
  return outs;
}
877
878
879
880
// A shared instance of the default outconvertclass for simple
// conversions.
outconvertclass text_t2ascii;
884
885
886
887	// stream operators for the output class
888
// Attach the stream theouts to outconverter and return the converter,
// so that a following "<< text" is handled by the converter's own
// << operator (e.g. cout << text_t2ascii << text_tstr).
outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
{
  outconverter.setostream(&theouts);
  return outconverter;
}
894
895
896	#define STREAMBUFSIZE 256
897	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
898	{
899	ostream *outstream = outconverter.getostream();
900
901	if (outstream == NULL) return outconverter;
902
903	char outbuf[STREAMBUFSIZE];
904	size_t len;
905	outconvertclass::status_t status = outconvertclass::unfinished;
906
907	// assume that there is no data needing converting
908	// left in the converter
909	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
910
911	while (status == outconvertclass::unfinished)
912	{
913	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
914	if (len > 0) outstream->write(outbuf, len);
915	}
916
917	return outconverter;
918	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: