Context Navigation

source: trunk/gsdl/lib/text_t.cpp@ 595

Last change on this file since 595 was 534, checked in by sjboddie, 25 years ago
added gpl notice
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 15.8 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 534 1999-09-07 04:57:43Z sjboddie $
25	*
26	*********************************************************************/
27
28	/*
29	$Log$
30	Revision 1.13 1999/09/07 04:57:43 sjboddie
31	added gpl notice
32
33	Revision 1.12 1999/08/31 08:04:41 rjmcnab
34	Fixed a small but hard to find bug in getcarr
35
36	Revision 1.11 1999/07/01 04:05:09 rjmcnab
37	Optimised append functions slightly and added a reserve function.
38
39	Revision 1.10 1999/04/26 03:58:03 sjboddie
40	added is_number function
41
42	Revision 1.9 1999/04/06 22:17:24 rjmcnab
43	Added splits and joins using text_tset.
44
45	Revision 1.8 1999/02/28 23:14:41 rjmcnab
46
47	Added uc and lc to convert to uppercase and lowercase.
48
49	Revision 1.7 1999/02/21 22:26:39 rjmcnab
50
51	Made getint() a constant function.
52
53	Revision 1.6 1999/02/03 01:13:26 sjboddie
54
55	Got interface to handle subcollections and language subcollections -
56	committed changes made to some of the collections
57
58	Revision 1.5 1999/01/19 01:38:14 rjmcnab
59
60	Made the source more portable.
61
62	Revision 1.4 1999/01/12 01:51:00 rjmcnab
63
64	Standard header.
65
66	Revision 1.3 1999/01/08 02:33:16 rjmcnab
67
68	Added standard header to source files.
69
70	*/
71
72
73	#include "text_t.h"
74
75	#if defined(GSDL_USE_OBJECTSPACE)
76	# include <ospace\std\algorithm>
77	#elif defined(GSDL_USE_STL_H)
78	# if defined(GSDL_USE_ALGO_H)
79	# include <algo.h>
80	# else
81	# include <algorithm.h>
82	# endif
83	#else
84	# include <algorithm>
85	#endif
86
87
88	#include "unitool.h"
89
90	////////////////////////////////////
91	// text_t methods
92	////////////////////////////////////
93
94	text_t::text_t ()
95	{
96	setencoding(0);
97	clear ();
98	}
99
100	text_t::text_t (int i)
101	{
102	setencoding(0);
103	clear ();
104	appendint (i);
105	}
106
107	text_t::text_t (char *s)
108	{
109	setencoding(0);
110	clear ();
111	appendcstr (s);
112	}
113
114	void text_t::append (const text_t &t)
115	{
116	text.insert(text.end(), t.begin(), t.end());
117	// const_iterator here, end=t.end();
118	// for (here=t.begin(); here!=end;here++)
119	// {
120	// text.push_back(*here);
121	// }
122	}
123
124	void text_t::appendrange (iterator first, iterator last)
125	{
126	text.insert(text.end(), first, last);
127	// while (first != last)
128	// {
129	// text.push_back (*first);
130	// first++;
131	// }
132	}
133
134	void text_t::appendrange (const_iterator first, const_iterator last)
135	{
136	text.insert(text.end(), first, last);
137	// while (first != last)
138	// {
139	// text.push_back (*first);
140	// first++;
141	// }
142	}
143
144	void text_t::appendint (int i)
145	{
146	// deal with zeros and negatives
147	if (i == 0)
148	{
149	text.push_back('0');
150	return;
151	}
152	else if (i < 0)
153	{
154	text.push_back('-');
155	i *= -1;
156	}
157
158	// get a buffer for the conversion
159	int maxbuflen = sizeof(int)*3;
160	char *buf = new char[maxbuflen];
161	int len = 0;
162
163	// get the number in reverse
164	while (i > 0)
165	{
166	buf[len++] = '0'+ (i%10);
167	i = i/10;
168	}
169
170	// reverse the number
171	while (len > 0)
172	{
173	text.push_back(buf[--len]);
174	}
175
176	delete buf;
177	}
178
179	int text_t::getint () const
180	{
181	int i = 0;
182	int mult = 1; // become -1 for negative numbers
183
184	const_iterator here = text.begin();
185	const_iterator end = text.end();
186
187	// do plus and minus signs
188	if (here != end)
189	{
190	if (*here == '-')
191	{
192	mult = -1;
193	here++;
194	}
195	else if (*here == '+')
196	{
197	mult = 1;
198	here++;
199	}
200	}
201
202	// deal with the number
203	while ((here != end) && (here >= '0') && (here <= '9'))
204	{
205	i = 10i + (here - '0');
206	here++;
207	}
208
209	i *= mult;
210	return i;
211	}
212
213
214
215	void text_t::appendcarr (char *s, size_type len)
216	{
217	unsigned char us = (unsigned char )s;
218	while (len > 0)
219	{
220	text.push_back (*us); // append this character
221	us++;
222	len--;
223	}
224	}
225
226	void text_t::appendcstr (char *s)
227	{
228	unsigned char us = (unsigned char )s;
229	while (*us != '\0')
230	{
231	text.push_back (*us); // append this character
232	us++;
233	}
234	}
235
236
237	// strings returned from getcarr and getcstr become the callers
238	// responsibility and should be deallocated with "delete"
239
240	char *text_t::getcarr(size_type &len) const
241	{
242	unsigned char *cstr = new unsigned char[size()];
243	len = 0;
244
245	const_iterator ithere = begin();
246	const_iterator itend = end();
247	while (ithere != itend)
248	{
249	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
250	else {
251	// put a space or a question mark depending on what
252	// the character is. Question marks tell the user that
253	// they are missing some information.
254	if (is_unicode_space (*ithere)) cstr[len] = ' ';
255	else cstr[len] = '?';
256	}
257	len++;
258	ithere++;
259	}
260
261	return (char *)cstr;
262	}
263
264	char *text_t::getcstr() const
265	{
266	unsigned char *cstr = new unsigned char[size() + 1];
267	const_iterator ithere = begin();
268	const_iterator itend = end();
269	int len = 0;
270
271	while (ithere != itend)
272	{
273	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
274	else {
275	// put a space or a question mark depending on what
276	// the character is. Question marks tell the user that
277	// they are missing some information.
278	if (is_unicode_space (*ithere)) cstr[len] = ' ';
279	else cstr[len] = '?';
280	}
281	len++;
282	ithere++;
283	}
284
285	cstr[len] = '\0';
286
287	return (char *)cstr;
288	}
289
290
291	// general functions which work on text_ts
292
293	// find a character within a range
294	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
295	unsigned short c)
296	{
297	while (first != last)
298	{
299	if (*first == c) break;
300	first++;
301	}
302	return first;
303	}
304
305	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
306	unsigned short c)
307	{
308	while (first != last)
309	{
310	if (*first == c) break;
311	first++;
312	}
313	return first;
314	}
315
316	// get a string up to the next delimiter (which is skipped)
317	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
318	text_t::const_iterator last,
319	unsigned short c, text_t &outstr)
320	{
321	text_t::const_iterator here = first;
322	here = findchar (first, last, c);
323	outstr.clear();
324	outstr.appendrange (first, here);
325	if (here != last) here++; // skip c
326	return here;
327	}
328
329	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
330	unsigned short c, text_t &outstr)
331	{
332	text_t::iterator here = first;
333	here = findchar (first, last, c);
334	outstr.clear();
335	outstr.appendrange (first, here);
336	if (here != last) here++; // skip c
337	return here;
338	}
339
340	// split a string with a character
341	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
342	unsigned short c, text_tset &outlist)
343	{
344	outlist.erase(outlist.begin(), outlist.end());
345
346	text_t t;
347
348	while (first != last)
349	{
350	first = getdelimitstr (first, last, c, t);
351	outlist.insert (t);
352	}
353	}
354
355	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
356	unsigned short c, text_tlist &outlist)
357	{
358	outlist.erase(outlist.begin(), outlist.end());
359
360	text_t t;
361
362	while (first != last)
363	{
364	first = getdelimitstr (first, last, c, t);
365	outlist.push_back (t);
366	}
367	}
368
369	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
370	unsigned short c, text_tarray &outlist)
371	{
372	outlist.erase(outlist.begin(), outlist.end());
373
374	text_t t;
375
376	while (first != last)
377	{
378	first = getdelimitstr (first, last, c, t);
379	outlist.push_back (t);
380	}
381	}
382
383	// join a string using a character
384	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
385	{
386	outtext.clear ();
387
388	text_tset::const_iterator here = inlist.begin ();
389	text_tset::const_iterator end = inlist.end ();
390	bool first = true;
391	while (here != end)
392	{
393	if (!first) outtext.push_back (c);
394	first = false;
395	outtext += *here;
396	here++;
397	}
398	}
399
400	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
401	{
402	outtext.clear ();
403
404	text_tlist::const_iterator here = inlist.begin ();
405	text_tlist::const_iterator end = inlist.end ();
406	bool first = true;
407	while (here != end)
408	{
409	if (!first) outtext.push_back (c);
410	first = false;
411	outtext += *here;
412	here++;
413	}
414	}
415
416	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
417	{
418	outtext.clear ();
419
420	text_tarray::const_iterator here = inlist.begin ();
421	text_tarray::const_iterator end = inlist.end ();
422	bool first = true;
423	while (here != end)
424	{
425	if (!first) outtext.push_back (c);
426	first = false;
427	outtext += *here;
428	here++;
429	}
430	}
431
432	// count the occurances of a character within a range
433	int countchar (text_t::const_iterator first, text_t::const_iterator last,
434	unsigned short c)
435	{
436	int count = 0;
437	while (first != last) {
438	if (*first == c) count ++;
439	first ++;
440	}
441	return count;
442	}
443
444	// return a substring of string from first up to but not including last
445	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
446
447	text_t substr;
448	while (first != last) {
449	substr.push_back(*first);
450	first ++;
451	}
452	return substr;
453	}
454
455
456	// convert to lowercase
457	void lc (text_t::iterator first, text_t::iterator last) {
458	while (first != last) {
459	first = unicode_tolower(first);
460	first++;
461	}
462	}
463
464	// convert to uppercase
465	void uc (text_t::iterator first, text_t::iterator last) {
466	while (first != last) {
467	first = unicode_toupper(first);
468	first++;
469	}
470	}
471
472
473	// checks to see if it is a number (i.e. contains only 0-9)
474	bool is_number (const text_t &text) {
475
476	text_t::const_iterator here = text.begin();
477	text_t::const_iterator end = text.end();
478
479	while (here != end) {
480	if ((here!='0') && (here!='1') && (*here!='2') &&
481	(here!='3') && (here!='4') && (*here!='5') &&
482	(here!='6') && (here!='7') && (*here!='8') &&
483	(*here!='9')) return false;
484	here ++;
485	}
486	return true;
487	}
488
489
490
491	////////////////////////////////////
492	// convertclass methods
493	////////////////////////////////////
494
495	// conversion classes used for getting information in to and out of
496	// the text_t class.
497
498	convertclass::convertclass ()
499	{
500	// nothing to do
501	}
502
503	void convertclass::reset ()
504	{
505	// nothing to do
506	}
507
508
509	////////////////////////////////////
510	// inconvertclass methods
511	////////////////////////////////////
512
513	// convert from a char stream to the text_t class
514	// the default version assumes the input is a ascii
515	// character array
516
517	inconvertclass::inconvertclass ()
518	{
519	start = NULL;
520	len = 0;
521	}
522
523
524	void inconvertclass::reset ()
525	{
526	start = NULL;
527	len = 0;
528	}
529
530	void inconvertclass::setinput (char *thestart, size_t thelen)
531	{
532	start = thestart;
533	len = thelen;
534	}
535
536	void inconvertclass::convert (text_t &output, status_t &status)
537	{
538	output.clear();
539
540	if (start == NULL \|\| len == 0)
541	{
542	status = finished;
543	return;
544	}
545
546	// don't want any funny sign conversions happening
547	unsigned char here = (unsigned char )start;
548	while (len > 0)
549	{
550	output.push_back (*here); // append this character
551	++here;
552	--len;
553	}
554
555	start = (char *)here; // save current position
556	status = finished;
557	}
558
559	// will treat the text_t as a 8-bit string and convert
560	// it to a 16-bit string using the about convert method.
561	text_t inconvertclass::convert (const text_t &t) {
562	text_t out;
563	text_t tmpout;
564	status_t status;
565	text_t::const_iterator here = t.begin();
566	text_t::const_iterator end = t.end();
567	unsigned char cbuf[256];
568	size_t cbuflen = 0;
569
570	while (here != end) {
571	while (here != end && cbuflen < 256) {
572	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
573	here++;
574	}
575
576	if (cbuflen > 0) {
577	setinput ((char *)cbuf, cbuflen);
578	status = unfinished;
579	while (status == unfinished) {
580	convert (tmpout, status);
581	out += tmpout;
582	}
583	cbuflen = 0;
584	}
585	}
586
587	out.setencoding (0); // unicode
588
589	return out;
590	}
591
592	// an instance of the default inconvertclass to do simple
593	// conversions. Note that any functions that use this are
594	// not reentrant. If a function needs to be reentrant it
595	// should declare its own instance.
596	inconvertclass ascii2text_t;
597
598
599	////////////////////////////////////
600	// outconvertclass methods
601	////////////////////////////////////
602
603	// Convert from a text_t class to a char stream
604	// This default version assumes the output is a ascii
605	// character array. If you set the output stream you
606	// can use this class to output to a stream using the
607	// << operator. The << operator can also be conveniently
608	// used to set the output stream by doing something like
609	//
610	// cout << text_t2ascii << text_tstr << anothertext_tstr;
611	//
612	outconvertclass::outconvertclass ()
613	{
614	input = NULL;
615	outs = NULL;
616	}
617
618	void outconvertclass::reset ()
619	{
620	input = NULL;
621	outs = NULL;
622	}
623
624	void outconvertclass::setinput (text_t *theinput)
625	{
626	input = theinput;
627	if (input != NULL) texthere = input->begin();
628	}
629
630	void outconvertclass::convert (char *output, size_t maxlen,
631	size_t &len, status_t &status)
632	{
633	if (input == NULL \|\| output == NULL)
634	{
635	status = finished;
636	return;
637	}
638
639	// don't want any funny sign conversions happening
640	unsigned char uoutput = (unsigned char )output;
641	text_t::iterator textend = input->end();
642	len = 0;
643	while ((len < maxlen) && (texthere != textend))
644	{
645	if (texthere < 256) uoutput = (unsigned char)(*texthere);
646	else {
647	// put a space or a question mark depending on what
648	// the character is. Question marks tell the user that
649	// they are missing some information.
650	if (is_unicode_space (texthere)) uoutput = ' ';
651	else *uoutput = '?';
652	}
653	++uoutput;
654	++len;
655	++texthere;
656	}
657
658	if (texthere == textend) status = finished;
659	else status = unfinished;
660	}
661
662	// will convert the 16-bit string to a 8-bit stream
663	// and place the result in a text_t. This method uses
664	// the above convert function.
665	text_t outconvertclass::convert (const text_t &t) {
666	text_t out;
667	unsigned char cbuf[256];
668	size_t cbuflen = 0;
669	status_t status = unfinished;
670
671	setinput ((text_t *)&t); // discard constant
672	while (status == unfinished) {
673	convert ((char *)cbuf, 256, cbuflen, status);
674	out.appendcarr ((char *)cbuf, cbuflen);
675	}
676
677	out.setencoding (1); // other encoding
678
679	return out;
680	}
681
682
683	void outconvertclass::setostream (ostream *theouts)
684	{
685	outs = theouts;
686	}
687
688	ostream *outconvertclass::getostream ()
689	{
690	return outs;
691	}
692
693
694
695
696	// an instance of the default outconvertclass to do simple
697	// conversions
698	outconvertclass text_t2ascii;
699
700
701
702	// stream operators for the output class
703
704	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
705	{
706	outconverter.setostream(&theouts);
707	return outconverter;
708	}
709
710
711	#define STREAMBUFSIZE 256
712	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
713	{
714	ostream *outstream = outconverter.getostream();
715
716	if (outstream == NULL) return outconverter;
717
718	char outbuf[STREAMBUFSIZE];
719	size_t len;
720	outconvertclass::status_t status = outconvertclass::unfinished;
721
722	// assume that there is no data needing converting
723	// left in the converter
724	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
725
726	while (status == outconvertclass::unfinished)
727	{
728	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
729	if (len > 0) outstream->write(outbuf, len);
730	}
731
732	return outconverter;
733	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: