Context Navigation

source: main/tags/2.33/gsdl/lib/text_t.cpp@ 23503

Last change on this file since 23503 was 1860, checked in by cs025, 23 years ago
Included CORBA branch for first time
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 18.8 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 1860 2001-01-25 18:26:45Z cs025 $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.20 2001/01/25 18:26:44 cs025
31	Included CORBA branch for first time
32
33	Revision 1.15.2.2 2000/04/05 10:19:38 syeates
34	added automatic conversion to allow text_t's to be <<'ed to ostreams
35
36	Revision 1.15.2.1 2000/04/04 15:02:29 cs025
37	Corba first commit
38
39	Revision 1.15 1999/10/14 22:52:39 sjboddie
40	joinchar can join using text_t string now too
41
42	Revision 1.14 1999/09/24 02:30:03 rjmcnab
43	added function has_unicode_letdig
44
45	Revision 1.13 1999/09/07 04:57:43 sjboddie
46	added gpl notice
47
48	Revision 1.12 1999/08/31 08:04:41 rjmcnab
49	Fixed a small but hard to find bug in getcarr
50
51	Revision 1.11 1999/07/01 04:05:09 rjmcnab
52	Optimised append functions slightly and added a reserve function.
53
54	Revision 1.10 1999/04/26 03:58:03 sjboddie
55	added is_number function
56
57	Revision 1.9 1999/04/06 22:17:24 rjmcnab
58	Added splits and joins using text_tset.
59
60	Revision 1.8 1999/02/28 23:14:41 rjmcnab
61
62	Added uc and lc to convert to uppercase and lowercase.
63
64	Revision 1.7 1999/02/21 22:26:39 rjmcnab
65
66	Made getint() a constant function.
67
68	Revision 1.6 1999/02/03 01:13:26 sjboddie
69
70	Got interface to handle subcollections and language subcollections -
71	committed changes made to some of the collections
72
73	Revision 1.5 1999/01/19 01:38:14 rjmcnab
74
75	Made the source more portable.
76
77	Revision 1.4 1999/01/12 01:51:00 rjmcnab
78
79	Standard header.
80
81	Revision 1.3 1999/01/08 02:33:16 rjmcnab
82
83	Added standard header to source files.
84
85	*/
86
87	#include "text_t.h"
88
89	#if defined(GSDL_USE_OBJECTSPACE)
90	# include <ospace\std\algorithm>
91	#elif defined(GSDL_USE_STL_H)
92	# if defined(GSDL_USE_ALGO_H)
93	# include <algo.h>
94	# else
95	# include <algorithm.h>
96	# endif
97	#else
98	# include <algorithm>
99	#endif
100
101	#ifdef HAVE_CONFIG_H
102	# ifdef __WIN32__
103	# include "WIN32cfg.h"
104	# else
105	# include "config.h"
106	# endif
107	#endif
108
109
110	#include "unitool.h"
111
112	////////////////////////////////////
113	// text_t methods
114	////////////////////////////////////
115
116	// new stream converter ...
117	ostream& operator<< (ostream &o, const text_t text)
118	{
119	text_t::const_iterator ithere = text.begin();
120	text_t::const_iterator itend = text.end();
121
122	while (ithere != itend)
123	{
124	if (*ithere < 256)
125	{
126	o << (unsigned char)(*ithere);
127	}
128	else
129	{
130	// put a space or a question mark depending on what
131	// the character is. Question marks tell the user that
132	// they are missing some information.
133	if (is_unicode_space (*ithere))
134	o << ' ';
135	else
136	o << '?';
137	}
138	ithere++;
139	}
140
141	return o;
142	}
143
144	text_t::text_t ()
145	{
146	setencoding(0);
147	clear ();
148	}
149
150	text_t::text_t (int i)
151	{
152	setencoding(0);
153	clear ();
154	appendint (i);
155	}
156
157	text_t::text_t (char *s)
158	{
159	setencoding(0);
160	clear ();
161	appendcstr (s);
162	}
163
164
165	void text_t::append (const text_t &t)
166	{
167	text.insert(text.end(), t.begin(), t.end());
168	// const_iterator here, end=t.end();
169	// for (here=t.begin(); here!=end;here++)
170	// {
171	// text.push_back(*here);
172	// }
173	}
174
175	void text_t::appendrange (iterator first, iterator last)
176	{
177	text.insert(text.end(), first, last);
178	// while (first != last)
179	// {
180	// text.push_back (*first);
181	// first++;
182	// }
183	}
184
185	void text_t::appendrange (const_iterator first, const_iterator last)
186	{
187	text.insert(text.end(), first, last);
188	// while (first != last)
189	// {
190	// text.push_back (*first);
191	// first++;
192	// }
193	}
194
195	void text_t::appendint (int i)
196	{
197	// deal with zeros and negatives
198	if (i == 0)
199	{
200	text.push_back('0');
201	return;
202	}
203	else if (i < 0)
204	{
205	text.push_back('-');
206	i *= -1;
207	}
208
209	// get a buffer for the conversion
210	int maxbuflen = sizeof(int)*3;
211	char *buf = new char[maxbuflen];
212	int len = 0;
213
214	// get the number in reverse
215	while (i > 0)
216	{
217	buf[len++] = '0'+ (i%10);
218	i = i/10;
219	}
220
221	// reverse the number
222	while (len > 0)
223	{
224	text.push_back(buf[--len]);
225	}
226
227	delete buf;
228	}
229
230	int text_t::getint () const
231	{
232	int i = 0;
233	int mult = 1; // become -1 for negative numbers
234
235	const_iterator here = text.begin();
236	const_iterator end = text.end();
237
238	// do plus and minus signs
239	if (here != end)
240	{
241	if (*here == '-')
242	{
243	mult = -1;
244	here++;
245	}
246	else if (*here == '+')
247	{
248	mult = 1;
249	here++;
250	}
251	}
252
253	// deal with the number
254	while ((here != end) && (here >= '0') && (here <= '9'))
255	{
256	i = 10i + (here - '0');
257	here++;
258	}
259
260	i *= mult;
261	return i;
262	}
263
264
265
266	void text_t::appendcarr (char *s, size_type len)
267	{
268	unsigned char us = (unsigned char )s;
269	while (len > 0)
270	{
271	text.push_back (*us); // append this character
272	us++;
273	len--;
274	}
275	}
276
277	void text_t::appendcstr (char *s)
278	{
279	unsigned char us = (unsigned char )s;
280	while (*us != '\0')
281	{
282	text.push_back (*us); // append this character
283	us++;
284	}
285	}
286
287
288	// strings returned from getcarr and getcstr become the callers
289	// responsibility and should be deallocated with "delete"
290
291	char *text_t::getcarr(size_type &len) const
292	{
293	unsigned char *cstr = new unsigned char[size()];
294	len = 0;
295
296	const_iterator ithere = begin();
297	const_iterator itend = end();
298	while (ithere != itend)
299	{
300	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
301	else {
302	// put a space or a question mark depending on what
303	// the character is. Question marks tell the user that
304	// they are missing some information.
305	if (is_unicode_space (*ithere)) cstr[len] = ' ';
306	else cstr[len] = '?';
307	}
308	len++;
309	ithere++;
310	}
311
312	return (char *)cstr;
313	}
314
315	char *text_t::getcstr() const
316	{
317	unsigned char *cstr = new unsigned char[size() + 1];
318	const_iterator ithere = begin();
319	const_iterator itend = end();
320	int len = 0;
321
322	while (ithere != itend)
323	{
324	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
325	else {
326	// put a space or a question mark depending on what
327	// the character is. Question marks tell the user that
328	// they are missing some information.
329	if (is_unicode_space (*ithere)) cstr[len] = ' ';
330	else cstr[len] = '?';
331	}
332	len++;
333	ithere++;
334	}
335
336	cstr[len] = '\0';
337
338	return (char *)cstr;
339	}
340
341
342	// general functions which work on text_ts
343
344	// find a character within a range
345	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
346	unsigned short c)
347	{
348	while (first != last)
349	{
350	if (*first == c) break;
351	first++;
352	}
353	return first;
354	}
355
356	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
357	unsigned short c)
358	{
359	while (first != last)
360	{
361	if (*first == c) break;
362	first++;
363	}
364	return first;
365	}
366
367	text_t::iterator findword (text_t::iterator first, text_t::iterator last,
368	const text_t& word)
369	{
370	text_t::const_iterator word_begin = word.begin();
371	text_t::const_iterator word_end = word.end();
372
373	while (first != last)
374	{
375	text_t::iterator char_match = first;
376	text_t::const_iterator word_here = word_begin;
377	while (word_here!=word_end)
378	{
379	if (char_match != word_here)
380	{
381	break;
382	}
383	char_match++;
384	word_here++;
385	}
386	if (word_here==word_end)
387	{
388	return first;
389	}
390	first++;
391	}
392	return last; // get to here only if there is no match
393	}
394
395	// get a string up to the next delimiter (which is skipped)
396	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
397	text_t::const_iterator last,
398	unsigned short c, text_t &outstr)
399	{
400	text_t::const_iterator here = first;
401	here = findchar (first, last, c);
402	outstr.clear();
403	outstr.appendrange (first, here);
404	if (here != last) here++; // skip c
405	return here;
406	}
407
408	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
409	unsigned short c, text_t &outstr)
410	{
411	text_t::iterator here = first;
412	here = findchar (first, last, c);
413	outstr.clear();
414	outstr.appendrange (first, here);
415	if (here != last) here++; // skip c
416	return here;
417	}
418
419	// split a string with a character
420	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
421	unsigned short c, text_tset &outlist)
422	{
423	outlist.erase(outlist.begin(), outlist.end());
424
425	text_t t;
426
427	while (first != last)
428	{
429	first = getdelimitstr (first, last, c, t);
430	outlist.insert (t);
431	}
432	}
433
434	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
435	unsigned short c, text_tlist &outlist)
436	{
437	outlist.erase(outlist.begin(), outlist.end());
438
439	text_t t;
440
441	while (first != last)
442	{
443	first = getdelimitstr (first, last, c, t);
444	outlist.push_back (t);
445	}
446	}
447
448	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
449	unsigned short c, text_tarray &outlist)
450	{
451	outlist.erase(outlist.begin(), outlist.end());
452
453	text_t t;
454
455	while (first != last)
456	{
457	first = getdelimitstr (first, last, c, t);
458	outlist.push_back (t);
459	}
460	}
461
462	// join a string using a character
463	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
464	{
465	outtext.clear ();
466
467	text_tset::const_iterator here = inlist.begin ();
468	text_tset::const_iterator end = inlist.end ();
469	bool first = true;
470	while (here != end)
471	{
472	if (!first) outtext.push_back (c);
473	first = false;
474	outtext += *here;
475	here++;
476	}
477	}
478
479	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
480	{
481	outtext.clear ();
482
483	text_tlist::const_iterator here = inlist.begin ();
484	text_tlist::const_iterator end = inlist.end ();
485	bool first = true;
486	while (here != end)
487	{
488	if (!first) outtext.push_back (c);
489	first = false;
490	outtext += *here;
491	here++;
492	}
493	}
494
495	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
496	{
497	outtext.clear ();
498
499	text_tarray::const_iterator here = inlist.begin ();
500	text_tarray::const_iterator end = inlist.end ();
501	bool first = true;
502	while (here != end)
503	{
504	if (!first) outtext.push_back (c);
505	first = false;
506	outtext += *here;
507	here++;
508	}
509	}
510
511	void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
512	{
513	outtext.clear ();
514
515	text_tlist::const_iterator here = inlist.begin ();
516	text_tlist::const_iterator end = inlist.end ();
517	bool first = true;
518	while (here != end)
519	{
520	if (!first) outtext += c;
521	first = false;
522	outtext += *here;
523	here++;
524	}
525	}
526
527	void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
528	{
529	outtext.clear ();
530
531	text_tset::const_iterator here = inlist.begin ();
532	text_tset::const_iterator end = inlist.end ();
533	bool first = true;
534	while (here != end)
535	{
536	if (!first) outtext += c;
537	first = false;
538	outtext += *here;
539	here++;
540	}
541	}
542
543	void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
544	{
545	outtext.clear ();
546
547	text_tarray::const_iterator here = inlist.begin ();
548	text_tarray::const_iterator end = inlist.end ();
549	bool first = true;
550	while (here != end)
551	{
552	if (!first) outtext += c;
553	first = false;
554	outtext += *here;
555	here++;
556	}
557	}
558
559	// count the occurances of a character within a range
560	int countchar (text_t::const_iterator first, text_t::const_iterator last,
561	unsigned short c)
562	{
563	int count = 0;
564	while (first != last) {
565	if (*first == c) count ++;
566	first ++;
567	}
568	return count;
569	}
570
571	// return a substring of string from first up to but not including last
572	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
573
574	text_t substr;
575	while (first != last) {
576	substr.push_back(*first);
577	first ++;
578	}
579	return substr;
580	}
581
582
583	// convert to lowercase
584	void lc (text_t::iterator first, text_t::iterator last) {
585	while (first != last) {
586	first = unicode_tolower(first);
587	first++;
588	}
589	}
590
591	// convert to uppercase
592	void uc (text_t::iterator first, text_t::iterator last) {
593	while (first != last) {
594	first = unicode_toupper(first);
595	first++;
596	}
597	}
598
599
600	// checks to see if it is a number (i.e. contains only 0-9)
601	bool is_number (const text_t &text) {
602
603	text_t::const_iterator here = text.begin();
604	text_t::const_iterator end = text.end();
605
606	while (here != end) {
607	if ((here!='0') && (here!='1') && (*here!='2') &&
608	(here!='3') && (here!='4') && (*here!='5') &&
609	(here!='6') && (here!='7') && (*here!='8') &&
610	(*here!='9')) return false;
611	here ++;
612	}
613	return true;
614	}
615
616
617	// checks to see if the text has any letters or digits
618	bool has_unicode_letdig (const text_t &text) {
619	if (text.empty()) return false;
620
621	text_t::const_iterator here = text.begin();
622	text_t::const_iterator end = text.end();
623	while (here != end) {
624	if (is_unicode_letdig (*here)) return true;
625	here++;
626	}
627
628	return false;
629	}
630
631
632
633	////////////////////////////////////
634	// convertclass methods
635	////////////////////////////////////
636
637	// conversion classes used for getting information in to and out of
638	// the text_t class.
639
640	convertclass::convertclass ()
641	{
642	// nothing to do
643	}
644
645	void convertclass::reset ()
646	{
647	// nothing to do
648	}
649
650
651	////////////////////////////////////
652	// inconvertclass methods
653	////////////////////////////////////
654
655	// convert from a char stream to the text_t class
656	// the default version assumes the input is a ascii
657	// character array
658
659	inconvertclass::inconvertclass ()
660	{
661	start = NULL;
662	len = 0;
663	}
664
665
666	void inconvertclass::reset ()
667	{
668	start = NULL;
669	len = 0;
670	}
671
672	void inconvertclass::setinput (char *thestart, size_t thelen)
673	{
674	start = thestart;
675	len = thelen;
676	}
677
678	void inconvertclass::convert (text_t &output, status_t &status)
679	{
680	output.clear();
681
682	if (start == NULL \|\| len == 0)
683	{
684	status = finished;
685	return;
686	}
687
688	// don't want any funny sign conversions happening
689	unsigned char here = (unsigned char )start;
690	while (len > 0)
691	{
692	output.push_back (*here); // append this character
693	++here;
694	--len;
695	}
696
697	start = (char *)here; // save current position
698	status = finished;
699	}
700
701	// will treat the text_t as a 8-bit string and convert
702	// it to a 16-bit string using the about convert method.
703	text_t inconvertclass::convert (const text_t &t) {
704	text_t out;
705	text_t tmpout;
706	status_t status;
707	text_t::const_iterator here = t.begin();
708	text_t::const_iterator end = t.end();
709	unsigned char cbuf[256];
710	size_t cbuflen = 0;
711
712	while (here != end) {
713	while (here != end && cbuflen < 256) {
714	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
715	here++;
716	}
717
718	if (cbuflen > 0) {
719	setinput ((char *)cbuf, cbuflen);
720	status = unfinished;
721	while (status == unfinished) {
722	convert (tmpout, status);
723	out += tmpout;
724	}
725	cbuflen = 0;
726	}
727	}
728
729	out.setencoding (0); // unicode
730
731	return out;
732	}
733
734	// an instance of the default inconvertclass to do simple
735	// conversions. Note that any functions that use this are
736	// not reentrant. If a function needs to be reentrant it
737	// should declare its own instance.
738	inconvertclass ascii2text_t;
739
740
741	////////////////////////////////////
742	// outconvertclass methods
743	////////////////////////////////////
744
745	// Convert from a text_t class to a char stream
746	// This default version assumes the output is a ascii
747	// character array. If you set the output stream you
748	// can use this class to output to a stream using the
749	// << operator. The << operator can also be conveniently
750	// used to set the output stream by doing something like
751	//
752	// cout << text_t2ascii << text_tstr << anothertext_tstr;
753	//
754	outconvertclass::outconvertclass ()
755	{
756	input = NULL;
757	outs = NULL;
758	}
759
760	void outconvertclass::reset ()
761	{
762	input = NULL;
763	outs = NULL;
764	}
765
766	void outconvertclass::setinput (text_t *theinput)
767	{
768	input = theinput;
769	if (input != NULL) texthere = input->begin();
770	}
771
772	void outconvertclass::convert (char *output, size_t maxlen,
773	size_t &len, status_t &status)
774	{
775	if (input == NULL \|\| output == NULL)
776	{
777	status = finished;
778	return;
779	}
780
781	// don't want any funny sign conversions happening
782	unsigned char uoutput = (unsigned char )output;
783	text_t::iterator textend = input->end();
784	len = 0;
785	while ((len < maxlen) && (texthere != textend))
786	{
787	if (texthere < 256) uoutput = (unsigned char)(*texthere);
788	else {
789	// put a space or a question mark depending on what
790	// the character is. Question marks tell the user that
791	// they are missing some information.
792	if (is_unicode_space (texthere)) uoutput = ' ';
793	else *uoutput = '?';
794	}
795	++uoutput;
796	++len;
797	++texthere;
798	}
799
800	if (texthere == textend) status = finished;
801	else status = unfinished;
802	}
803
804	// will convert the 16-bit string to a 8-bit stream
805	// and place the result in a text_t. This method uses
806	// the above convert function.
807	text_t outconvertclass::convert (const text_t &t) {
808	text_t out;
809	unsigned char cbuf[256];
810	size_t cbuflen = 0;
811	status_t status = unfinished;
812
813	setinput ((text_t *)&t); // discard constant
814	while (status == unfinished) {
815	convert ((char *)cbuf, 256, cbuflen, status);
816	out.appendcarr ((char *)cbuf, cbuflen);
817	}
818
819	out.setencoding (1); // other encoding
820
821	return out;
822	}
823
824
825	void outconvertclass::setostream (ostream *theouts)
826	{
827	outs = theouts;
828	}
829
830	ostream *outconvertclass::getostream ()
831	{
832	return outs;
833	}
834
835
836
837
838	// an instance of the default outconvertclass to do simple
839	// conversions
840	outconvertclass text_t2ascii;
841
842
843
844	// stream operators for the output class
845
846	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
847	{
848	outconverter.setostream(&theouts);
849	return outconverter;
850	}
851
852
853	#define STREAMBUFSIZE 256
854	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
855	{
856	ostream *outstream = outconverter.getostream();
857
858	if (outstream == NULL) return outconverter;
859
860	char outbuf[STREAMBUFSIZE];
861	size_t len;
862	outconvertclass::status_t status = outconvertclass::unfinished;
863
864	// assume that there is no data needing converting
865	// left in the converter
866	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
867
868	while (status == outconvertclass::unfinished)
869	{
870	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
871	if (len > 0) outstream->write(outbuf, len);
872	}
873
874	return outconverter;
875	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: