Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/greenorg/lib/text_t.cpp@ 13640

Last change on this file since 13640 was 5503, checked in by sjboddie, 21 years ago
* empty log message *
Property svn:keywords set to `Author Date Id Revision`
File size: 19.2 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 5503 2003-09-12 04:53:13Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.1 2003/09/12 04:52:19 sjboddie
31	* empty log message *
32
33	Revision 1.21 2001/06/01 02:51:28 sjboddie
34	Changes to get phind working under windows
35
36	Revision 1.20 2001/01/25 18:26:44 cs025
37	Included CORBA branch for first time
38
39	Revision 1.15.2.2 2000/04/05 10:19:38 syeates
40	added automatic conversion to allow text_t's to be <<'ed to ostreams
41
42	Revision 1.15.2.1 2000/04/04 15:02:29 cs025
43	Corba first commit
44
45	Revision 1.15 1999/10/14 22:52:39 sjboddie
46	joinchar can join using text_t string now too
47
48	Revision 1.14 1999/09/24 02:30:03 rjmcnab
49	added function has_unicode_letdig
50
51	Revision 1.13 1999/09/07 04:57:43 sjboddie
52	added gpl notice
53
54	Revision 1.12 1999/08/31 08:04:41 rjmcnab
55	Fixed a small but hard to find bug in getcarr
56
57	Revision 1.11 1999/07/01 04:05:09 rjmcnab
58	Optimised append functions slightly and added a reserve function.
59
60	Revision 1.10 1999/04/26 03:58:03 sjboddie
61	added is_number function
62
63	Revision 1.9 1999/04/06 22:17:24 rjmcnab
64	Added splits and joins using text_tset.
65
66	Revision 1.8 1999/02/28 23:14:41 rjmcnab
67
68	Added uc and lc to convert to uppercase and lowercase.
69
70	Revision 1.7 1999/02/21 22:26:39 rjmcnab
71
72	Made getint() a constant function.
73
74	Revision 1.6 1999/02/03 01:13:26 sjboddie
75
76	Got interface to handle subcollections and language subcollections -
77	committed changes made to some of the collections
78
79	Revision 1.5 1999/01/19 01:38:14 rjmcnab
80
81	Made the source more portable.
82
83	Revision 1.4 1999/01/12 01:51:00 rjmcnab
84
85	Standard header.
86
87	Revision 1.3 1999/01/08 02:33:16 rjmcnab
88
89	Added standard header to source files.
90
91	*/
92
93	#include "text_t.h"
94
95	#if defined(GSDL_USE_OBJECTSPACE)
96	# include <ospace\std\algorithm>
97	#elif defined(GSDL_USE_STL_H)
98	# if defined(GSDL_USE_ALGO_H)
99	# include <algo.h>
100	# else
101	# include <algorithm.h>
102	# endif
103	#else
104	# include <algorithm>
105	#endif
106
107	#ifdef HAVE_CONFIG_H
108	# ifdef __WIN32__
109	# include "WIN32cfg.h"
110	# else
111	# include "config.h"
112	# endif
113	#endif
114
115
116	#include "unitool.h"
117
118	////////////////////////////////////
119	// text_t methods
120	////////////////////////////////////
121
122	// new stream converter ...
123	ostream& operator<< (ostream &o, const text_t text)
124	{
125	text_t::const_iterator ithere = text.begin();
126	text_t::const_iterator itend = text.end();
127
128	while (ithere != itend)
129	{
130	if (*ithere < 256)
131	{
132	o << (unsigned char)(*ithere);
133	}
134	else
135	{
136	// put a space or a question mark depending on what
137	// the character is. Question marks tell the user that
138	// they are missing some information.
139	if (is_unicode_space (*ithere))
140	o << ' ';
141	else
142	o << '?';
143	}
144	ithere++;
145	}
146
147	return o;
148	}
149
150	text_t::text_t ()
151	{
152	setencoding(0);
153	clear ();
154	}
155
156	text_t::text_t (int i)
157	{
158	setencoding(0);
159	clear ();
160	appendint (i);
161	}
162
163	text_t::text_t (char *s)
164	{
165	setencoding(0);
166	clear ();
167	appendcstr (s);
168	}
169
170
171	void text_t::append (const text_t &t)
172	{
173	text.insert(text.end(), t.begin(), t.end());
174	// const_iterator here, end=t.end();
175	// for (here=t.begin(); here!=end;here++)
176	// {
177	// text.push_back(*here);
178	// }
179	}
180
181	void text_t::appendrange (iterator first, iterator last)
182	{
183	text.insert(text.end(), first, last);
184	// while (first != last)
185	// {
186	// text.push_back (*first);
187	// first++;
188	// }
189	}
190
191	void text_t::appendrange (const_iterator first, const_iterator last)
192	{
193	text.insert(text.end(), first, last);
194	// while (first != last)
195	// {
196	// text.push_back (*first);
197	// first++;
198	// }
199	}
200
201	void text_t::appendint (int i)
202	{
203	// deal with zeros and negatives
204	if (i == 0)
205	{
206	text.push_back('0');
207	return;
208	}
209	else if (i < 0)
210	{
211	text.push_back('-');
212	i *= -1;
213	}
214
215	// get a buffer for the conversion
216	int maxbuflen = sizeof(int)*3;
217	char *buf = new char[maxbuflen];
218	int len = 0;
219
220	// get the number in reverse
221	while (i > 0)
222	{
223	buf[len++] = '0'+ (i%10);
224	i = i/10;
225	}
226
227	// reverse the number
228	while (len > 0)
229	{
230	text.push_back(buf[--len]);
231	}
232
233	delete buf;
234	}
235
236	int text_t::getint () const
237	{
238	int i = 0;
239	int mult = 1; // become -1 for negative numbers
240
241	const_iterator here = text.begin();
242	const_iterator end = text.end();
243
244	// do plus and minus signs
245	if (here != end)
246	{
247	if (*here == '-')
248	{
249	mult = -1;
250	here++;
251	}
252	else if (*here == '+')
253	{
254	mult = 1;
255	here++;
256	}
257	}
258
259	// deal with the number
260	while ((here != end) && (here >= '0') && (here <= '9'))
261	{
262	i = 10i + (here - '0');
263	here++;
264	}
265
266	i *= mult;
267	return i;
268	}
269
270	unsigned long text_t::getulong () const
271	{
272	unsigned long i = 0;
273
274	const_iterator here = text.begin();
275	const_iterator end = text.end();
276
277	while ((here != end) && (here >= '0') && (here <= '9'))
278	{
279	i = 10i + (here - '0');
280	here++;
281	}
282
283	return i;
284	}
285
286	void text_t::appendcarr (char *s, size_type len)
287	{
288	unsigned char us = (unsigned char )s;
289	while (len > 0)
290	{
291	text.push_back (*us); // append this character
292	us++;
293	len--;
294	}
295	}
296
297	void text_t::appendcstr (char *s)
298	{
299	unsigned char us = (unsigned char )s;
300	while (*us != '\0')
301	{
302	text.push_back (*us); // append this character
303	us++;
304	}
305	}
306
307
308	// strings returned from getcarr and getcstr become the callers
309	// responsibility and should be deallocated with "delete"
310
311	char *text_t::getcarr(size_type &len) const
312	{
313	unsigned char *cstr = new unsigned char[size()];
314	len = 0;
315
316	const_iterator ithere = begin();
317	const_iterator itend = end();
318	while (ithere != itend)
319	{
320	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
321	else {
322	// put a space or a question mark depending on what
323	// the character is. Question marks tell the user that
324	// they are missing some information.
325	if (is_unicode_space (*ithere)) cstr[len] = ' ';
326	else cstr[len] = '?';
327	}
328	len++;
329	ithere++;
330	}
331
332	return (char *)cstr;
333	}
334
335	char *text_t::getcstr() const
336	{
337	unsigned char *cstr = new unsigned char[size() + 1];
338	const_iterator ithere = begin();
339	const_iterator itend = end();
340	int len = 0;
341
342	while (ithere != itend)
343	{
344	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
345	else {
346	// put a space or a question mark depending on what
347	// the character is. Question marks tell the user that
348	// they are missing some information.
349	if (is_unicode_space (*ithere)) cstr[len] = ' ';
350	else cstr[len] = '?';
351	}
352	len++;
353	ithere++;
354	}
355
356	cstr[len] = '\0';
357
358	return (char *)cstr;
359	}
360
361
362	// general functions which work on text_ts
363
364	// find a character within a range
365	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
366	unsigned short c)
367	{
368	while (first != last)
369	{
370	if (*first == c) break;
371	first++;
372	}
373	return first;
374	}
375
376	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
377	unsigned short c)
378	{
379	while (first != last)
380	{
381	if (*first == c) break;
382	first++;
383	}
384	return first;
385	}
386
387	text_t::iterator findword (text_t::iterator first, text_t::iterator last,
388	const text_t& word)
389	{
390	text_t::const_iterator word_begin = word.begin();
391	text_t::const_iterator word_end = word.end();
392
393	while (first != last)
394	{
395	text_t::iterator char_match = first;
396	text_t::const_iterator word_here = word_begin;
397	while (word_here!=word_end)
398	{
399	if (char_match != word_here)
400	{
401	break;
402	}
403	char_match++;
404	word_here++;
405	}
406	if (word_here==word_end)
407	{
408	return first;
409	}
410	first++;
411	}
412	return last; // get to here only if there is no match
413	}
414
415	// get a string up to the next delimiter (which is skipped)
416	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
417	text_t::const_iterator last,
418	unsigned short c, text_t &outstr)
419	{
420	text_t::const_iterator here = first;
421	here = findchar (first, last, c);
422	outstr.clear();
423	outstr.appendrange (first, here);
424	if (here != last) here++; // skip c
425	return here;
426	}
427
428	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
429	unsigned short c, text_t &outstr)
430	{
431	text_t::iterator here = first;
432	here = findchar (first, last, c);
433	outstr.clear();
434	outstr.appendrange (first, here);
435	if (here != last) here++; // skip c
436	return here;
437	}
438
439	// split a string with a character
440	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
441	unsigned short c, text_tset &outlist)
442	{
443	outlist.erase(outlist.begin(), outlist.end());
444
445	text_t t;
446
447	while (first != last)
448	{
449	first = getdelimitstr (first, last, c, t);
450	outlist.insert (t);
451	}
452	}
453
454	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
455	unsigned short c, text_tlist &outlist)
456	{
457	outlist.erase(outlist.begin(), outlist.end());
458
459	text_t t;
460
461	while (first != last)
462	{
463	first = getdelimitstr (first, last, c, t);
464	outlist.push_back (t);
465	}
466	}
467
468	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
469	unsigned short c, text_tarray &outlist)
470	{
471	outlist.erase(outlist.begin(), outlist.end());
472
473	text_t t;
474
475	while (first != last)
476	{
477	first = getdelimitstr (first, last, c, t);
478	outlist.push_back (t);
479	}
480	}
481
482	// join a string using a character
483	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
484	{
485	outtext.clear ();
486
487	text_tset::const_iterator here = inlist.begin ();
488	text_tset::const_iterator end = inlist.end ();
489	bool first = true;
490	while (here != end)
491	{
492	if (!first) outtext.push_back (c);
493	first = false;
494	outtext += *here;
495	here++;
496	}
497	}
498
499	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
500	{
501	outtext.clear ();
502
503	text_tlist::const_iterator here = inlist.begin ();
504	text_tlist::const_iterator end = inlist.end ();
505	bool first = true;
506	while (here != end)
507	{
508	if (!first) outtext.push_back (c);
509	first = false;
510	outtext += *here;
511	here++;
512	}
513	}
514
515	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
516	{
517	outtext.clear ();
518
519	text_tarray::const_iterator here = inlist.begin ();
520	text_tarray::const_iterator end = inlist.end ();
521	bool first = true;
522	while (here != end)
523	{
524	if (!first) outtext.push_back (c);
525	first = false;
526	outtext += *here;
527	here++;
528	}
529	}
530
531	void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
532	{
533	outtext.clear ();
534
535	text_tlist::const_iterator here = inlist.begin ();
536	text_tlist::const_iterator end = inlist.end ();
537	bool first = true;
538	while (here != end)
539	{
540	if (!first) outtext += c;
541	first = false;
542	outtext += *here;
543	here++;
544	}
545	}
546
547	void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
548	{
549	outtext.clear ();
550
551	text_tset::const_iterator here = inlist.begin ();
552	text_tset::const_iterator end = inlist.end ();
553	bool first = true;
554	while (here != end)
555	{
556	if (!first) outtext += c;
557	first = false;
558	outtext += *here;
559	here++;
560	}
561	}
562
563	void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
564	{
565	outtext.clear ();
566
567	text_tarray::const_iterator here = inlist.begin ();
568	text_tarray::const_iterator end = inlist.end ();
569	bool first = true;
570	while (here != end)
571	{
572	if (!first) outtext += c;
573	first = false;
574	outtext += *here;
575	here++;
576	}
577	}
578
579	// count the occurances of a character within a range
580	int countchar (text_t::const_iterator first, text_t::const_iterator last,
581	unsigned short c)
582	{
583	int count = 0;
584	while (first != last) {
585	if (*first == c) count ++;
586	first ++;
587	}
588	return count;
589	}
590
591	// return a substring of string from first up to but not including last
592	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
593
594	text_t substr;
595	while (first != last) {
596	substr.push_back(*first);
597	first ++;
598	}
599	return substr;
600	}
601
602
603	// convert to lowercase
604	void lc (text_t::iterator first, text_t::iterator last) {
605	while (first != last) {
606	first = unicode_tolower(first);
607	first++;
608	}
609	}
610
611	// convert to uppercase
612	void uc (text_t::iterator first, text_t::iterator last) {
613	while (first != last) {
614	first = unicode_toupper(first);
615	first++;
616	}
617	}
618
619
620	// checks to see if it is a number (i.e. contains only 0-9)
621	bool is_number (const text_t &text) {
622
623	text_t::const_iterator here = text.begin();
624	text_t::const_iterator end = text.end();
625
626	while (here != end) {
627	if ((here!='0') && (here!='1') && (*here!='2') &&
628	(here!='3') && (here!='4') && (*here!='5') &&
629	(here!='6') && (here!='7') && (*here!='8') &&
630	(*here!='9')) return false;
631	here ++;
632	}
633	return true;
634	}
635
636
637	// checks to see if the text has any letters or digits
638	bool has_unicode_letdig (const text_t &text) {
639	if (text.empty()) return false;
640
641	text_t::const_iterator here = text.begin();
642	text_t::const_iterator end = text.end();
643	while (here != end) {
644	if (is_unicode_letdig (*here)) return true;
645	here++;
646	}
647
648	return false;
649	}
650
651
652
653	////////////////////////////////////
654	// convertclass methods
655	////////////////////////////////////
656
657	// conversion classes used for getting information in to and out of
658	// the text_t class.
659
660	convertclass::convertclass ()
661	{
662	// nothing to do
663	}
664
665	void convertclass::reset ()
666	{
667	// nothing to do
668	}
669
670
671	////////////////////////////////////
672	// inconvertclass methods
673	////////////////////////////////////
674
675	// convert from a char stream to the text_t class
676	// the default version assumes the input is a ascii
677	// character array
678
679	inconvertclass::inconvertclass ()
680	{
681	start = NULL;
682	len = 0;
683	}
684
685
686	void inconvertclass::reset ()
687	{
688	start = NULL;
689	len = 0;
690	}
691
692	void inconvertclass::setinput (char *thestart, size_t thelen)
693	{
694	start = thestart;
695	len = thelen;
696	}
697
698	void inconvertclass::convert (text_t &output, status_t &status)
699	{
700	output.clear();
701
702	if (start == NULL \|\| len == 0)
703	{
704	status = finished;
705	return;
706	}
707
708	// don't want any funny sign conversions happening
709	unsigned char here = (unsigned char )start;
710	while (len > 0)
711	{
712	output.push_back (*here); // append this character
713	++here;
714	--len;
715	}
716
717	start = (char *)here; // save current position
718	status = finished;
719	}
720
721	// will treat the text_t as a 8-bit string and convert
722	// it to a 16-bit string using the about convert method.
723	text_t inconvertclass::convert (const text_t &t) {
724	text_t out;
725	text_t tmpout;
726	status_t status;
727	text_t::const_iterator here = t.begin();
728	text_t::const_iterator end = t.end();
729	unsigned char cbuf[256];
730	size_t cbuflen = 0;
731
732	while (here != end) {
733	while (here != end && cbuflen < 256) {
734	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
735	here++;
736	}
737
738	if (cbuflen > 0) {
739	setinput ((char *)cbuf, cbuflen);
740	status = unfinished;
741	while (status == unfinished) {
742	convert (tmpout, status);
743	out += tmpout;
744	}
745	cbuflen = 0;
746	}
747	}
748
749	out.setencoding (0); // unicode
750
751	return out;
752	}
753
754	// an instance of the default inconvertclass to do simple
755	// conversions. Note that any functions that use this are
756	// not reentrant. If a function needs to be reentrant it
757	// should declare its own instance.
758	inconvertclass ascii2text_t;
759
760
761	////////////////////////////////////
762	// outconvertclass methods
763	////////////////////////////////////
764
765	// Convert from a text_t class to a char stream
766	// This default version assumes the output is a ascii
767	// character array. If you set the output stream you
768	// can use this class to output to a stream using the
769	// << operator. The << operator can also be conveniently
770	// used to set the output stream by doing something like
771	//
772	// cout << text_t2ascii << text_tstr << anothertext_tstr;
773	//
774	outconvertclass::outconvertclass ()
775	{
776	input = NULL;
777	outs = NULL;
778	}
779
780	void outconvertclass::reset ()
781	{
782	input = NULL;
783	outs = NULL;
784	}
785
786	void outconvertclass::setinput (text_t *theinput)
787	{
788	input = theinput;
789	if (input != NULL) texthere = input->begin();
790	}
791
792	void outconvertclass::convert (char *output, size_t maxlen,
793	size_t &len, status_t &status)
794	{
795	if (input == NULL \|\| output == NULL)
796	{
797	status = finished;
798	return;
799	}
800
801	// don't want any funny sign conversions happening
802	unsigned char uoutput = (unsigned char )output;
803	text_t::iterator textend = input->end();
804	len = 0;
805	while ((len < maxlen) && (texthere != textend))
806	{
807	if (texthere < 256) uoutput = (unsigned char)(*texthere);
808	else {
809	// put a space or a question mark depending on what
810	// the character is. Question marks tell the user that
811	// they are missing some information.
812	if (is_unicode_space (texthere)) uoutput = ' ';
813	else *uoutput = '?';
814	}
815	++uoutput;
816	++len;
817	++texthere;
818	}
819
820	if (texthere == textend) status = finished;
821	else status = unfinished;
822	}
823
824	// will convert the 16-bit string to a 8-bit stream
825	// and place the result in a text_t. This method uses
826	// the above convert function.
827	text_t outconvertclass::convert (const text_t &t) {
828	text_t out;
829	unsigned char cbuf[256];
830	size_t cbuflen = 0;
831	status_t status = unfinished;
832
833	setinput ((text_t *)&t); // discard constant
834	while (status == unfinished) {
835	convert ((char *)cbuf, 256, cbuflen, status);
836	out.appendcarr ((char *)cbuf, cbuflen);
837	}
838
839	out.setencoding (1); // other encoding
840
841	return out;
842	}
843
844
845	void outconvertclass::setostream (ostream *theouts)
846	{
847	outs = theouts;
848	}
849
850	ostream *outconvertclass::getostream ()
851	{
852	return outs;
853	}
854
855
856
857
858	// an instance of the default outconvertclass to do simple
859	// conversions
860	outconvertclass text_t2ascii;
861
862
863
864	// stream operators for the output class
865
866	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
867	{
868	outconverter.setostream(&theouts);
869	return outconverter;
870	}
871
872
873	#define STREAMBUFSIZE 256
874	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
875	{
876	ostream *outstream = outconverter.getostream();
877
878	if (outstream == NULL) return outconverter;
879
880	char outbuf[STREAMBUFSIZE];
881	size_t len;
882	outconvertclass::status_t status = outconvertclass::unfinished;
883
884	// assume that there is no data needing converting
885	// left in the converter
886	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
887
888	while (status == outconvertclass::unfinished)
889	{
890	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
891	if (len > 0) outstream->write(outbuf, len);
892	}
893
894	return outconverter;
895	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: