Context Navigation

source: main/tags/2.25/gsdl/lib/text_t.cpp@ 23902

Last change on this file since 23902 was 1310, checked in by sjboddie, 24 years ago
Removed CVS logging information from source files
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.0 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26	#include "text_t.h"
27
28	#if defined(GSDL_USE_OBJECTSPACE)
29	# include <ospace\std\algorithm>
30	#elif defined(GSDL_USE_STL_H)
31	# if defined(GSDL_USE_ALGO_H)
32	# include <algo.h>
33	# else
34	# include <algorithm.h>
35	# endif
36	#else
37	# include <algorithm>
38	#endif
39
40
41	#include "unitool.h"
42
43	////////////////////////////////////
44	// text_t methods
45	////////////////////////////////////
46
47	text_t::text_t ()
48	{
49	setencoding(0);
50	clear ();
51	}
52
53	text_t::text_t (int i)
54	{
55	setencoding(0);
56	clear ();
57	appendint (i);
58	}
59
60	text_t::text_t (char *s)
61	{
62	setencoding(0);
63	clear ();
64	appendcstr (s);
65	}
66
67	void text_t::append (const text_t &t)
68	{
69	text.insert(text.end(), t.begin(), t.end());
70	// const_iterator here, end=t.end();
71	// for (here=t.begin(); here!=end;here++)
72	// {
73	// text.push_back(*here);
74	// }
75	}
76
77	void text_t::appendrange (iterator first, iterator last)
78	{
79	text.insert(text.end(), first, last);
80	// while (first != last)
81	// {
82	// text.push_back (*first);
83	// first++;
84	// }
85	}
86
87	void text_t::appendrange (const_iterator first, const_iterator last)
88	{
89	text.insert(text.end(), first, last);
90	// while (first != last)
91	// {
92	// text.push_back (*first);
93	// first++;
94	// }
95	}
96
97	void text_t::appendint (int i)
98	{
99	// deal with zeros and negatives
100	if (i == 0)
101	{
102	text.push_back('0');
103	return;
104	}
105	else if (i < 0)
106	{
107	text.push_back('-');
108	i *= -1;
109	}
110
111	// get a buffer for the conversion
112	int maxbuflen = sizeof(int)*3;
113	char *buf = new char[maxbuflen];
114	int len = 0;
115
116	// get the number in reverse
117	while (i > 0)
118	{
119	buf[len++] = '0'+ (i%10);
120	i = i/10;
121	}
122
123	// reverse the number
124	while (len > 0)
125	{
126	text.push_back(buf[--len]);
127	}
128
129	delete buf;
130	}
131
132	int text_t::getint () const
133	{
134	int i = 0;
135	int mult = 1; // become -1 for negative numbers
136
137	const_iterator here = text.begin();
138	const_iterator end = text.end();
139
140	// do plus and minus signs
141	if (here != end)
142	{
143	if (*here == '-')
144	{
145	mult = -1;
146	here++;
147	}
148	else if (*here == '+')
149	{
150	mult = 1;
151	here++;
152	}
153	}
154
155	// deal with the number
156	while ((here != end) && (here >= '0') && (here <= '9'))
157	{
158	i = 10i + (here - '0');
159	here++;
160	}
161
162	i *= mult;
163	return i;
164	}
165
166
167
168	void text_t::appendcarr (char *s, size_type len)
169	{
170	unsigned char us = (unsigned char )s;
171	while (len > 0)
172	{
173	text.push_back (*us); // append this character
174	us++;
175	len--;
176	}
177	}
178
179	void text_t::appendcstr (char *s)
180	{
181	unsigned char us = (unsigned char )s;
182	while (*us != '\0')
183	{
184	text.push_back (*us); // append this character
185	us++;
186	}
187	}
188
189
190	// strings returned from getcarr and getcstr become the callers
191	// responsibility and should be deallocated with "delete"
192
193	char *text_t::getcarr(size_type &len) const
194	{
195	unsigned char *cstr = new unsigned char[size()];
196	len = 0;
197
198	const_iterator ithere = begin();
199	const_iterator itend = end();
200	while (ithere != itend)
201	{
202	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
203	else {
204	// put a space or a question mark depending on what
205	// the character is. Question marks tell the user that
206	// they are missing some information.
207	if (is_unicode_space (*ithere)) cstr[len] = ' ';
208	else cstr[len] = '?';
209	}
210	len++;
211	ithere++;
212	}
213
214	return (char *)cstr;
215	}
216
217	char *text_t::getcstr() const
218	{
219	unsigned char *cstr = new unsigned char[size() + 1];
220	const_iterator ithere = begin();
221	const_iterator itend = end();
222	int len = 0;
223
224	while (ithere != itend)
225	{
226	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
227	else {
228	// put a space or a question mark depending on what
229	// the character is. Question marks tell the user that
230	// they are missing some information.
231	if (is_unicode_space (*ithere)) cstr[len] = ' ';
232	else cstr[len] = '?';
233	}
234	len++;
235	ithere++;
236	}
237
238	cstr[len] = '\0';
239
240	return (char *)cstr;
241	}
242
243
244	// general functions which work on text_ts
245
246	// find a character within a range
247	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
248	unsigned short c)
249	{
250	while (first != last)
251	{
252	if (*first == c) break;
253	first++;
254	}
255	return first;
256	}
257
258	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
259	unsigned short c)
260	{
261	while (first != last)
262	{
263	if (*first == c) break;
264	first++;
265	}
266	return first;
267	}
268
269	// get a string up to the next delimiter (which is skipped)
270	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
271	text_t::const_iterator last,
272	unsigned short c, text_t &outstr)
273	{
274	text_t::const_iterator here = first;
275	here = findchar (first, last, c);
276	outstr.clear();
277	outstr.appendrange (first, here);
278	if (here != last) here++; // skip c
279	return here;
280	}
281
282	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
283	unsigned short c, text_t &outstr)
284	{
285	text_t::iterator here = first;
286	here = findchar (first, last, c);
287	outstr.clear();
288	outstr.appendrange (first, here);
289	if (here != last) here++; // skip c
290	return here;
291	}
292
293	// split a string with a character
294	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
295	unsigned short c, text_tset &outlist)
296	{
297	outlist.erase(outlist.begin(), outlist.end());
298
299	text_t t;
300
301	while (first != last)
302	{
303	first = getdelimitstr (first, last, c, t);
304	outlist.insert (t);
305	}
306	}
307
308	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
309	unsigned short c, text_tlist &outlist)
310	{
311	outlist.erase(outlist.begin(), outlist.end());
312
313	text_t t;
314
315	while (first != last)
316	{
317	first = getdelimitstr (first, last, c, t);
318	outlist.push_back (t);
319	}
320	}
321
322	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
323	unsigned short c, text_tarray &outlist)
324	{
325	outlist.erase(outlist.begin(), outlist.end());
326
327	text_t t;
328
329	while (first != last)
330	{
331	first = getdelimitstr (first, last, c, t);
332	outlist.push_back (t);
333	}
334	}
335
336	// join a string using a character
337	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
338	{
339	outtext.clear ();
340
341	text_tset::const_iterator here = inlist.begin ();
342	text_tset::const_iterator end = inlist.end ();
343	bool first = true;
344	while (here != end)
345	{
346	if (!first) outtext.push_back (c);
347	first = false;
348	outtext += *here;
349	here++;
350	}
351	}
352
353	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
354	{
355	outtext.clear ();
356
357	text_tlist::const_iterator here = inlist.begin ();
358	text_tlist::const_iterator end = inlist.end ();
359	bool first = true;
360	while (here != end)
361	{
362	if (!first) outtext.push_back (c);
363	first = false;
364	outtext += *here;
365	here++;
366	}
367	}
368
369	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
370	{
371	outtext.clear ();
372
373	text_tarray::const_iterator here = inlist.begin ();
374	text_tarray::const_iterator end = inlist.end ();
375	bool first = true;
376	while (here != end)
377	{
378	if (!first) outtext.push_back (c);
379	first = false;
380	outtext += *here;
381	here++;
382	}
383	}
384
385	void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
386	{
387	outtext.clear ();
388
389	text_tlist::const_iterator here = inlist.begin ();
390	text_tlist::const_iterator end = inlist.end ();
391	bool first = true;
392	while (here != end)
393	{
394	if (!first) outtext += c;
395	first = false;
396	outtext += *here;
397	here++;
398	}
399	}
400
401	void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
402	{
403	outtext.clear ();
404
405	text_tset::const_iterator here = inlist.begin ();
406	text_tset::const_iterator end = inlist.end ();
407	bool first = true;
408	while (here != end)
409	{
410	if (!first) outtext += c;
411	first = false;
412	outtext += *here;
413	here++;
414	}
415	}
416
417	void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
418	{
419	outtext.clear ();
420
421	text_tarray::const_iterator here = inlist.begin ();
422	text_tarray::const_iterator end = inlist.end ();
423	bool first = true;
424	while (here != end)
425	{
426	if (!first) outtext += c;
427	first = false;
428	outtext += *here;
429	here++;
430	}
431	}
432
433	// count the occurances of a character within a range
434	int countchar (text_t::const_iterator first, text_t::const_iterator last,
435	unsigned short c)
436	{
437	int count = 0;
438	while (first != last) {
439	if (*first == c) count ++;
440	first ++;
441	}
442	return count;
443	}
444
445	// return a substring of string from first up to but not including last
446	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
447
448	text_t substr;
449	while (first != last) {
450	substr.push_back(*first);
451	first ++;
452	}
453	return substr;
454	}
455
456
457	// convert to lowercase
458	void lc (text_t::iterator first, text_t::iterator last) {
459	while (first != last) {
460	first = unicode_tolower(first);
461	first++;
462	}
463	}
464
465	// convert to uppercase
466	void uc (text_t::iterator first, text_t::iterator last) {
467	while (first != last) {
468	first = unicode_toupper(first);
469	first++;
470	}
471	}
472
473
474	// checks to see if it is a number (i.e. contains only 0-9)
475	bool is_number (const text_t &text) {
476
477	text_t::const_iterator here = text.begin();
478	text_t::const_iterator end = text.end();
479
480	while (here != end) {
481	if ((here!='0') && (here!='1') && (*here!='2') &&
482	(here!='3') && (here!='4') && (*here!='5') &&
483	(here!='6') && (here!='7') && (*here!='8') &&
484	(*here!='9')) return false;
485	here ++;
486	}
487	return true;
488	}
489
490
491	// checks to see if the text has any letters or digits
492	bool has_unicode_letdig (const text_t &text) {
493	if (text.empty()) return false;
494
495	text_t::const_iterator here = text.begin();
496	text_t::const_iterator end = text.end();
497	while (here != end) {
498	if (is_unicode_letdig (*here)) return true;
499	here++;
500	}
501
502	return false;
503	}
504
505
506
507	////////////////////////////////////
508	// convertclass methods
509	////////////////////////////////////
510
511	// conversion classes used for getting information in to and out of
512	// the text_t class.
513
514	convertclass::convertclass ()
515	{
516	// nothing to do
517	}
518
519	void convertclass::reset ()
520	{
521	// nothing to do
522	}
523
524
525	////////////////////////////////////
526	// inconvertclass methods
527	////////////////////////////////////
528
529	// convert from a char stream to the text_t class
530	// the default version assumes the input is a ascii
531	// character array
532
533	inconvertclass::inconvertclass ()
534	{
535	start = NULL;
536	len = 0;
537	}
538
539
540	void inconvertclass::reset ()
541	{
542	start = NULL;
543	len = 0;
544	}
545
546	void inconvertclass::setinput (char *thestart, size_t thelen)
547	{
548	start = thestart;
549	len = thelen;
550	}
551
552	void inconvertclass::convert (text_t &output, status_t &status)
553	{
554	output.clear();
555
556	if (start == NULL \|\| len == 0)
557	{
558	status = finished;
559	return;
560	}
561
562	// don't want any funny sign conversions happening
563	unsigned char here = (unsigned char )start;
564	while (len > 0)
565	{
566	output.push_back (*here); // append this character
567	++here;
568	--len;
569	}
570
571	start = (char *)here; // save current position
572	status = finished;
573	}
574
575	// will treat the text_t as a 8-bit string and convert
576	// it to a 16-bit string using the about convert method.
577	text_t inconvertclass::convert (const text_t &t) {
578	text_t out;
579	text_t tmpout;
580	status_t status;
581	text_t::const_iterator here = t.begin();
582	text_t::const_iterator end = t.end();
583	unsigned char cbuf[256];
584	size_t cbuflen = 0;
585
586	while (here != end) {
587	while (here != end && cbuflen < 256) {
588	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
589	here++;
590	}
591
592	if (cbuflen > 0) {
593	setinput ((char *)cbuf, cbuflen);
594	status = unfinished;
595	while (status == unfinished) {
596	convert (tmpout, status);
597	out += tmpout;
598	}
599	cbuflen = 0;
600	}
601	}
602
603	out.setencoding (0); // unicode
604
605	return out;
606	}
607
608	// A shared instance of the default inconvertclass for doing simple
609	// conversions. The instance keeps its input position between calls,
610	// so any function that uses it is not reentrant. A function that
611	// needs to be reentrant should declare its own instance.
612	inconvertclass ascii2text_t;
613
614
615	////////////////////////////////////
616	// outconvertclass methods
617	////////////////////////////////////
618
619	// Convert from a text_t class to a char stream
620	// This default version assumes the output is a ascii
621	// character array. If you set the output stream you
622	// can use this class to output to a stream using the
623	// << operator. The << operator can also be conveniently
624	// used to set the output stream by doing something like
625	//
626	// cout << text_t2ascii << text_tstr << anothertext_tstr;
627	//
628	outconvertclass::outconvertclass ()
629	{
630	input = NULL;
631	outs = NULL;
632	}
633
634	void outconvertclass::reset ()
635	{
636	input = NULL;
637	outs = NULL;
638	}
639
640	void outconvertclass::setinput (text_t *theinput)
641	{
642	input = theinput;
643	if (input != NULL) texthere = input->begin();
644	}
645
646	void outconvertclass::convert (char *output, size_t maxlen,
647	size_t &len, status_t &status)
648	{
649	if (input == NULL \|\| output == NULL)
650	{
651	status = finished;
652	return;
653	}
654
655	// don't want any funny sign conversions happening
656	unsigned char uoutput = (unsigned char )output;
657	text_t::iterator textend = input->end();
658	len = 0;
659	while ((len < maxlen) && (texthere != textend))
660	{
661	if (texthere < 256) uoutput = (unsigned char)(*texthere);
662	else {
663	// put a space or a question mark depending on what
664	// the character is. Question marks tell the user that
665	// they are missing some information.
666	if (is_unicode_space (texthere)) uoutput = ' ';
667	else *uoutput = '?';
668	}
669	++uoutput;
670	++len;
671	++texthere;
672	}
673
674	if (texthere == textend) status = finished;
675	else status = unfinished;
676	}
677
678	// will convert the 16-bit string to a 8-bit stream
679	// and place the result in a text_t. This method uses
680	// the above convert function.
681	text_t outconvertclass::convert (const text_t &t) {
682	text_t out;
683	unsigned char cbuf[256];
684	size_t cbuflen = 0;
685	status_t status = unfinished;
686
687	setinput ((text_t *)&t); // discard constant
688	while (status == unfinished) {
689	convert ((char *)cbuf, 256, cbuflen, status);
690	out.appendcarr ((char *)cbuf, cbuflen);
691	}
692
693	out.setencoding (1); // other encoding
694
695	return out;
696	}
697
698
699	void outconvertclass::setostream (ostream *theouts)
700	{
701	outs = theouts;
702	}
703
704	ostream *outconvertclass::getostream ()
705	{
706	return outs;
707	}
708
709
710
711
712	// A shared instance of the default outconvertclass for doing simple
713	// conversions.
714	outconvertclass text_t2ascii;
715
716
717
718	// stream operators for the output class
719
720	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
721	{
722	outconverter.setostream(&theouts);
723	return outconverter;
724	}
725
726
727	#define STREAMBUFSIZE 256
728	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
729	{
730	ostream *outstream = outconverter.getostream();
731
732	if (outstream == NULL) return outconverter;
733
734	char outbuf[STREAMBUFSIZE];
735	size_t len;
736	outconvertclass::status_t status = outconvertclass::unfinished;
737
738	// assume that there is no data needing converting
739	// left in the converter
740	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
741
742	while (status == outconvertclass::unfinished)
743	{
744	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
745	if (len > 0) outstream->write(outbuf, len);
746	}
747
748	return outconverter;
749	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: