Context Navigation

source: gsdl/trunk/lib/text_t.cpp@ 14909

Last change on this file since 14909 was 14909, checked in by davidb, 16 years ago
Standardisation of Windows config file to lowercase (included from this source file). Was causing a problem when trying to compile on Unix filesystem mounted under Windows.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 19.8 KB

Line
1	/**********************************************************************
2	*
3	* text_t.cpp -- a simple 16-bit character string class
4	* Copyright (C) 1999 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	* $Id: text_t.cpp 14909 2007-12-11 20:39:26Z davidb $
25	*
26	*********************************************************************/
27
28	#include "text_t.h"
29
30	#if defined(GSDL_USE_OBJECTSPACE)
31	# include <ospace\std\algorithm>
32	#elif defined(GSDL_USE_STL_H)
33	# if defined(GSDL_USE_ALGO_H)
34	# include <algo.h>
35	# else
36	# include <algorithm.h>
37	# endif
38	#else
39	# include <algorithm>
40	#endif
41
42	#ifdef HAVE_CONFIG_H
43	# ifdef __WIN32__
44	# include "win32cfg.h"
45	# else
46	# include "config.h"
47	# endif
48	#endif
49
50
51	#include "unitool.h"
52
// A constant text_t constructed from the empty C string.
const text_t g_EmptyText("");
54
55	////////////////////////////////////
56	// text_t methods
57	////////////////////////////////////
58
59	// new stream converter ...
60	ostream& operator<< (ostream &o, const text_t &text)
61	{
62	text_t::const_iterator ithere = text.begin();
63	text_t::const_iterator itend = text.end();
64
65	while (ithere != itend)
66	{
67	if (*ithere < 256)
68	{
69	o << (unsigned char)(*ithere);
70	}
71	else
72	{
73	// put a space or a question mark depending on what
74	// the character is. Question marks tell the user that
75	// they are missing some information.
76	if (is_unicode_space (*ithere))
77	o << ' ';
78	else
79	o << '?';
80	}
81	++ithere;
82	}
83
84	return o;
85	}
86
87	text_t::text_t ()
88	{
89	setencoding(0);
90	clear ();
91	}
92
93	text_t::text_t (int i)
94	{
95	setencoding(0);
96	clear ();
97	appendint (i);
98	}
99
100	text_t::text_t (const char *s)
101	{
102	setencoding(0);
103	clear ();
104	appendcstr (s);
105	}
106
107	text_t::text_t (const char *s, size_type nLength)
108	{
109	setencoding(0);
110	clear ();
111	appendcarr(s, nLength);
112	}
113
114
115	void text_t::append (const text_t &t)
116	{
117	text.insert(text.end(), t.begin(), t.end());
118	}
119
120	void text_t::appendrange (iterator first, iterator last)
121	{
122	text.insert(text.end(), first, last);
123	}
124
125	void text_t::appendrange (const_iterator first, const_iterator last)
126	{
127	text.insert(text.end(), first, last);
128	}
129
130	void text_t::appendint (int i)
131	{
132	// deal with zeros and negatives
133	if (i == 0)
134	{
135	text.push_back('0');
136	return;
137	}
138	else if (i < 0)
139	{
140	text.push_back('-');
141	i *= -1;
142	}
143
144	// get a buffer for the conversion
145	int maxbuflen = sizeof(int)*3;
146	char *buf = new char[maxbuflen];
147	int len = 0;
148
149	// get the number in reverse
150	while (i > 0)
151	{
152	buf[len++] = '0'+ (i%10);
153	i = i/10;
154	}
155
156	// reverse the number
157	while (len > 0)
158	{
159	text.push_back(buf[--len]);
160	}
161
162	delete []buf;
163	}
164
165	int text_t::getint () const
166	{
167	int i = 0;
168	int mult = 1; // become -1 for negative numbers
169
170	const_iterator here = text.begin();
171	const_iterator end = text.end();
172
173	// do plus and minus signs
174	if (here != end)
175	{
176	if (*here == '-')
177	{
178	mult = -1;
179	++here;
180	}
181	else if (*here == '+')
182	{
183	mult = 1;
184	++here;
185	}
186	}
187
188	// deal with the number
189	while ((here != end) && (here >= '0') && (here <= '9'))
190	{
191	i = 10i + (here - '0');
192	++here;
193	}
194
195	i *= mult;
196	return i;
197	}
198
199	unsigned long text_t::getulong () const
200	{
201	unsigned long i = 0;
202
203	const_iterator here = text.begin();
204	const_iterator end = text.end();
205
206	while ((here != end) && (here >= '0') && (here <= '9'))
207	{
208	i = 10i + (here - '0');
209	++here;
210	}
211
212	return i;
213	}
214
215	void text_t::appendcarr (const char *s, size_type len)
216	{
217	unsigned char us = (unsigned char )s;
218	if (text.capacity() < (text.size() + len + 2)) {
219	text.reserve(text.size() + len + 2);
220	}
221
222	while (len > 0)
223	{
224	text.push_back (*us); // append this character
225	++us;
226	--len;
227	}
228	}
229
230	void text_t::appendcstr (const char *s)
231	{
232	size_t len = strlen(s);
233	if (text.capacity() < (text.size() + len + 2)) {
234	text.reserve(text.size() + len + 2);
235	}
236
237	unsigned char us = (unsigned char )s;
238	while (*us != '\0')
239	{
240	text.push_back (*us); // append this character
241	++us;
242	}
243	}
244
245
246	// strings returned from getcarr and getcstr become the callers
247	// responsibility and should be deallocated with "delete []"
248
249	char *text_t::getcarr(size_type &len) const
250	{
251	unsigned char *cstr = new unsigned char[size()];
252	len = 0;
253
254	const_iterator ithere = begin();
255	const_iterator itend = end();
256	while (ithere != itend)
257	{
258	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
259	else {
260	// put a space or a question mark depending on what
261	// the character is. Question marks tell the user that
262	// they are missing some information.
263	if (is_unicode_space (*ithere)) cstr[len] = ' ';
264	else cstr[len] = '?';
265	}
266	++len;
267	++ithere;
268	}
269
270	return (char *)cstr;
271	}
272
273	char *text_t::getcstr() const
274	{
275	unsigned char *cstr = new unsigned char[size() + 1];
276	const_iterator ithere = begin();
277	const_iterator itend = end();
278	int len = 0;
279
280	while (ithere != itend)
281	{
282	if (ithere < 256) cstr[len] = (unsigned char)(ithere);
283	else {
284	// put a space or a question mark depending on what
285	// the character is. Question marks tell the user that
286	// they are missing some information.
287	if (is_unicode_space (*ithere)) cstr[len] = ' ';
288	else cstr[len] = '?';
289	}
290	++len;
291	++ithere;
292	}
293
294	cstr[len] = '\0';
295
296	return (char *)cstr;
297	}
298
299
300	int text_t::replace(text_t toreplace, text_t replacement)
301	{
302	// Get the beginning and end of the current text
303	text_t::iterator text_begin = text.begin(), text_end = text.end();
304	int count = 0;
305	text_t new_text, temp_text;
306
307	// Loop through and grab the text off the end
308	while (text_begin < text_end)
309	{
310	// Find where the next toreplace is
311	text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);
312
313	// Grab the string up to it
314	temp_text = substr(text_begin, next_toreplace);
315
316	// Add the new string onto the end
317	if (new_text.empty())
318	{
319	new_text.append(temp_text);
320	}
321	else
322	{
323	new_text.append(replacement + temp_text);
324	}
325
326	// Finally, we need to move the current pointer up to the new position
327	text_begin = next_toreplace + 1;
328	count++;
329	}
330
331	text.clear();
332	text = new_text.text_as_usvector();
333	return count;
334	}
335
336
337	// general functions which work on text_ts
338
339	// find a character within a range
340	text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
341	unsigned short c)
342	{
343	while (first != last)
344	{
345	if (*first == c) break;
346	++first;
347	}
348	return first;
349	}
350
351	text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
352	unsigned short c)
353	{
354	while (first != last)
355	{
356	if (*first == c) break;
357	++first;
358	}
359	return first;
360	}
361
362	text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last,
363	unsigned short c)
364	{
365	text_t::iterator current = last;
366	while (current != first) {
367	if (*current == c) break;
368	--current;
369	}
370	if (current == first) {
371	if (*current == c) return current;
372	return last;
373	}
374
375	return current;
376	}
377
378	text_t::iterator findword (text_t::iterator first,
379	text_t::iterator last,
380	const text_t& word)
381	{
382	text_t::const_iterator word_begin = word.begin();
383	text_t::const_iterator word_end = word.end();
384
385	while (first != last)
386	{
387	text_t::iterator char_match = first;
388	text_t::const_iterator word_here = word_begin;
389	while (word_here!=word_end)
390	{
391	if (char_match != word_here)
392	{
393	break;
394	}
395	++char_match;
396	++word_here;
397	}
398	if (word_here==word_end)
399	{
400	return first;
401	}
402	++first;
403	}
404	return last; // get to here only if there is no match
405	}
406
407	// get a string up to the next delimiter (which is skipped)
408	text_t::const_iterator getdelimitstr (text_t::const_iterator first,
409	text_t::const_iterator last,
410	unsigned short c, text_t &outstr)
411	{
412	text_t::const_iterator here = first;
413	here = findchar (first, last, c);
414	outstr.clear();
415	outstr.appendrange (first, here);
416	if (here != last) ++here; // skip c
417	return here;
418	}
419
420	text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
421	unsigned short c, text_t &outstr)
422	{
423	text_t::iterator here = first;
424	here = findchar (first, last, c);
425	outstr.clear();
426	outstr.appendrange (first, here);
427	if (here != last) ++here; // skip c
428	return here;
429	}
430
431	// split a string with a character
432	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
433	unsigned short c, text_tset &outlist)
434	{
435	outlist.erase(outlist.begin(), outlist.end());
436
437	text_t t;
438
439	while (first != last)
440	{
441	first = getdelimitstr (first, last, c, t);
442	outlist.insert (t);
443	}
444	}
445
446	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
447	unsigned short c, text_tlist &outlist)
448	{
449	outlist.erase(outlist.begin(), outlist.end());
450
451	text_t t;
452
453	while (first != last)
454	{
455	first = getdelimitstr (first, last, c, t);
456	outlist.push_back (t);
457	}
458	}
459
460	void splitchar (text_t::const_iterator first, text_t::const_iterator last,
461	unsigned short c, text_tarray &outlist)
462	{
463	outlist.erase(outlist.begin(), outlist.end());
464
465	text_t t;
466
467	while (first != last)
468	{
469	first = getdelimitstr (first, last, c, t);
470	outlist.push_back (t);
471	}
472	}
473
474	// join a string using a character
475	void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
476	{
477	outtext.clear ();
478
479	text_tset::const_iterator here = inlist.begin ();
480	text_tset::const_iterator end = inlist.end ();
481
482	if (here != end) {
483	outtext += *here; ++here;
484	while (here != end) {
485	outtext.push_back (c);
486	outtext += *here;
487	++here;
488	}
489	}
490	}
491
492	void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
493	{
494	outtext.clear ();
495
496	text_tlist::const_iterator here = inlist.begin ();
497	text_tlist::const_iterator end = inlist.end ();
498	if (here != end) {
499	outtext += *here; ++here;
500	while (here != end) {
501	outtext.push_back (c);
502	outtext += *here;
503	++here;
504	}
505	}
506	}
507
508	void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
509	{
510	outtext.clear ();
511
512	text_tarray::const_iterator here = inlist.begin ();
513	text_tarray::const_iterator end = inlist.end ();
514	if (here != end) {
515	outtext += *here; ++here;
516	while (here != end) {
517	outtext.push_back (c);
518	outtext += *here;
519	++here;
520	}
521	}
522	}
523
524	void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
525	{
526	outtext.clear ();
527
528	text_tlist::const_iterator here = inlist.begin ();
529	text_tlist::const_iterator end = inlist.end ();
530	if (here != end) {
531	outtext += *here; ++here;
532	while (here != end) {
533	outtext += c;
534	outtext += *here;
535	++here;
536	}
537	}
538	}
539
540	void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
541	{
542	outtext.clear ();
543
544	text_tset::const_iterator here = inlist.begin ();
545	text_tset::const_iterator end = inlist.end ();
546	if (here != end) {
547	outtext += *here; ++here;
548	while (here != end) {
549	outtext += c;
550	outtext += *here;
551	++here;
552	}
553	}
554	}
555
556	void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
557	{
558	outtext.clear ();
559
560	text_tarray::const_iterator here = inlist.begin ();
561	text_tarray::const_iterator end = inlist.end ();
562	if (here != end) {
563	outtext += *here; ++here;
564	while (here != end) {
565	outtext += c;
566	outtext += *here;
567	++here;
568	}
569	}
570	}
571
572	// count the occurances of a character within a range
573	int countchar (text_t::const_iterator first, text_t::const_iterator last,
574	unsigned short c)
575	{
576	int count = 0;
577	while (first != last) {
578	if (*first == c) ++count;
579	++first;
580	}
581	return count;
582	}
583
584	// return a substring of string from first up to but not including last
585	text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
586
587	text_t substr; substr.reserve(last - first + 2);
588	while (first != last) {
589	substr.push_back(*first);
590	++first;
591	}
592	return substr;
593	}
594
595
596	// convert to lowercase
597	void lc (text_t::iterator first, text_t::iterator last) {
598	while (first != last) {
599	first = unicode_tolower(first);
600	++first;
601	}
602	}
603
604	// convert to uppercase
605	void uc (text_t::iterator first, text_t::iterator last) {
606	while (first != last) {
607	first = unicode_toupper(first);
608	++first;
609	}
610	}
611
612
613	// checks to see if it is a number (i.e. contains only 0-9)
614	bool is_number (const text_t &text) {
615
616	text_t::const_iterator here = text.begin();
617	text_t::const_iterator end = text.end();
618
619	while (here != end) {
620	if ((here!='0') && (here!='1') && (*here!='2') &&
621	(here!='3') && (here!='4') && (*here!='5') &&
622	(here!='6') && (here!='7') && (*here!='8') &&
623	(*here!='9')) return false;
624	++here;
625	}
626	return true;
627	}
628
629
630	// checks to see if the text has any letters or digits
631	bool has_unicode_letdig (const text_t &text) {
632	if (text.empty()) return false;
633
634	text_t::const_iterator here = text.begin();
635	text_t::const_iterator end = text.end();
636	while (here != end) {
637	if (is_unicode_letdig (*here)) return true;
638	++here;
639	}
640
641	return false;
642	}
643
644	// checks to see if a text_t starts with the specified prefix
645	bool starts_with(const text_t& text, const text_t& prefix) {
646	if (prefix.empty()) return true;
647	if (text.empty() \|\| text.size()<prefix.size()) return false;
648	text_t substring = substr(text.begin(), text.begin()+prefix.size());
649	return substring == prefix;
650	}
651	// checks to see if a text_t ends with the specified suffix
652	bool ends_with(const text_t& text, const text_t& suffix) {
653	if (suffix.empty()) return true;
654	if (text.empty() \|\| text.size() < suffix.size()) return false;
655	text_t substring = substr(text.end()-suffix.size(),text.end());
656	return substring == suffix;
657
658	}
659
660
661	////////////////////////////////////
662	// convertclass methods
663	////////////////////////////////////
664
665	// conversion classes used for getting information in to and out of
666	// the text_t class.
667
668	convertclass::convertclass ()
669	{
670	// nothing to do
671	}
672
673	void convertclass::reset ()
674	{
675	// nothing to do
676	}
677
678
679	////////////////////////////////////
680	// inconvertclass methods
681	////////////////////////////////////
682
683	// convert from a char stream to the text_t class
684	// the default version assumes the input is a ascii
685	// character array
686
687	inconvertclass::inconvertclass ()
688	{
689	start = NULL;
690	len = 0;
691	}
692
693
694	void inconvertclass::reset ()
695	{
696	start = NULL;
697	len = 0;
698	}
699
700	void inconvertclass::setinput (char *thestart, size_t thelen)
701	{
702	start = thestart;
703	len = thelen;
704	}
705
706	void inconvertclass::convert (text_t &output, status_t &status)
707	{
708	output.clear();
709
710	if (start == NULL \|\| len == 0)
711	{
712	status = finished;
713	return;
714	}
715
716	if (output.capacity() < len + 2)
717	output.reserve(len + 2);
718
719	// don't want any funny sign conversions happening
720	unsigned char here = (unsigned char )start;
721	while (len > 0)
722	{
723	output.push_back (*here); // append this character
724	++here;
725	--len;
726	}
727
728	start = (char *)here; // save current position
729	status = finished;
730	}
731
732	// will treat the text_t as a 8-bit string and convert
733	// it to a 16-bit string using the about convert method.
734	text_t inconvertclass::convert (const text_t &t) {
735	text_t out;
736	text_t tmpout;
737	status_t status;
738	text_t::const_iterator here = t.begin();
739	text_t::const_iterator end = t.end();
740	unsigned char cbuf[256];
741	size_t cbuflen = 0;
742
743	out.clear();
744	if (out.capacity() < t.size() + 2)
745	out.reserve(t.size() + 2);
746	while (here != end) {
747	while (here != end && cbuflen < 256) {
748	cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
749	++here;
750	}
751
752	if (cbuflen > 0) {
753	setinput ((char *)cbuf, cbuflen);
754	status = unfinished;
755	while (status == unfinished) {
756	convert (tmpout, status);
757	out += tmpout;
758	}
759	cbuflen = 0;
760	}
761	}
762
763	out.setencoding (0); // unicode
764
765	return out;
766	}
767
// An instance of the default inconvertclass to do simple
// conversions. The instance stores its current input position
// between calls, so any functions that use it are not reentrant.
// If a function needs to be reentrant it should declare its own
// instance.
inconvertclass ascii2text_t;
773
774
775	////////////////////////////////////
776	// outconvertclass methods
777	////////////////////////////////////
778
779	// Convert from a text_t class to a char stream
780	// This default version assumes the output is a ascii
781	// character array. If you set the output stream you
782	// can use this class to output to a stream using the
783	// << operator. The << operator can also be conveniently
784	// used to set the output stream by doing something like
785	//
786	// cout << text_t2ascii << text_tstr << anothertext_tstr;
787	//
788	outconvertclass::outconvertclass ()
789	{
790	input = NULL;
791	outs = NULL;
792	}
793
794	void outconvertclass::reset ()
795	{
796	input = NULL;
797	outs = NULL;
798	}
799
800	void outconvertclass::setinput (text_t *theinput)
801	{
802	input = theinput;
803	if (input != NULL) texthere = input->begin();
804	}
805
806	void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
807	{
808	input = theinput;
809	texthere = thetexthere;
810	}
811
812	void outconvertclass::convert (char *output, size_t maxlen,
813	size_t &len, status_t &status)
814	{
815	if (input == NULL \|\| output == NULL)
816	{
817	status = finished;
818	return;
819	}
820
821	// don't want any funny sign conversions happening
822	unsigned char uoutput = (unsigned char )output;
823	text_t::iterator textend = input->end();
824	len = 0;
825	while ((len < maxlen) && (texthere != textend))
826	{
827	if (texthere < 256) uoutput = (unsigned char)(*texthere);
828	else {
829	// put a space or a question mark depending on what
830	// the character is. Question marks tell the user that
831	// they are missing some information.
832	if (is_unicode_space (texthere)) uoutput = ' ';
833	else *uoutput = '?';
834	}
835	++uoutput;
836	++len;
837	++texthere;
838	}
839
840	if (texthere == textend) status = finished;
841	else status = unfinished;
842	}
843
844	// will convert the 16-bit string to a 8-bit stream
845	// and place the result in a text_t. This method uses
846	// the above convert function.
847	text_t outconvertclass::convert (const text_t &t) {
848	text_t out;
849	unsigned char cbuf[256];
850	size_t cbuflen = 0;
851	status_t status = unfinished;
852
853	out.clear();
854	if (out.capacity() < t.size() + 2)
855	out.reserve(t.size() + 2);
856	setinput ((text_t *)&t); // discard constant
857	while (status == unfinished) {
858	convert ((char *)cbuf, 256, cbuflen, status);
859	out.appendcarr ((char *)cbuf, cbuflen);
860	}
861
862	out.setencoding (1); // other encoding
863
864	return out;
865	}
866
867
868	void outconvertclass::setostream (ostream *theouts)
869	{
870	outs = theouts;
871	}
872
873	ostream *outconvertclass::getostream ()
874	{
875	return outs;
876	}
877
878
879
880
// An instance of the default outconvertclass to do simple
// conversions. It keeps the input text and output stream most
// recently given to it via setinput/setostream.
outconvertclass text_t2ascii;
884
885
886
887	// stream operators for the output class
888
889	outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
890	{
891	outconverter.setostream(&theouts);
892	return outconverter;
893	}
894
895
896	#define STREAMBUFSIZE 256
897	outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
898	{
899	ostream *outstream = outconverter.getostream();
900
901	if (outstream == NULL) return outconverter;
902
903	char outbuf[STREAMBUFSIZE];
904	size_t len;
905	outconvertclass::status_t status = outconvertclass::unfinished;
906
907	// assume that there is no data needing converting
908	// left in the converter
909	outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
910
911	while (status == outconvertclass::unfinished)
912	{
913	outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
914	if (len > 0) outstream->write(outbuf, len);
915	}
916
917	return outconverter;
918	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: