source: gsdl/trunk/lib/text_t.cpp@ 14909

Last change on this file since 14909 was 14909, checked in by davidb, 16 years ago

Standardisation of Windows config file to lowercase (included from this source file). Was causing a problem when trying to compile on Unix filesystem mounted under Windows.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.8 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 14909 2007-12-11 20:39:26Z davidb $
25 *
26 *********************************************************************/
27
28#include "text_t.h"
29
30#if defined(GSDL_USE_OBJECTSPACE)
31# include <ospace\std\algorithm>
32#elif defined(GSDL_USE_STL_H)
33# if defined(GSDL_USE_ALGO_H)
34# include <algo.h>
35# else
36# include <algorithm.h>
37# endif
38#else
39# include <algorithm>
40#endif
41
42#ifdef HAVE_CONFIG_H
43# ifdef __WIN32__
44# include "win32cfg.h"
45# else
46# include "config.h"
47# endif
48#endif
49
50
51#include "unitool.h"
52
53const text_t g_EmptyText("");
54
55////////////////////////////////////
56// text_t methods
57////////////////////////////////////
58
59// new stream converter ...
60ostream& operator<< (ostream &o, const text_t &text)
61{
62 text_t::const_iterator ithere = text.begin();
63 text_t::const_iterator itend = text.end();
64
65 while (ithere != itend)
66 {
67 if (*ithere < 256)
68 {
69 o << (unsigned char)(*ithere);
70 }
71 else
72 {
73 // put a space or a question mark depending on what
74 // the character is. Question marks tell the user that
75 // they are missing some information.
76 if (is_unicode_space (*ithere))
77 o << ' ';
78 else
79 o << '?';
80 }
81 ++ithere;
82 }
83
84 return o;
85}
86
87text_t::text_t ()
88{
89 setencoding(0);
90 clear ();
91}
92
93text_t::text_t (int i)
94{
95 setencoding(0);
96 clear ();
97 appendint (i);
98}
99
100text_t::text_t (const char *s)
101{
102 setencoding(0);
103 clear ();
104 appendcstr (s);
105}
106
107text_t::text_t (const char *s, size_type nLength)
108{
109 setencoding(0);
110 clear ();
111 appendcarr(s, nLength);
112}
113
114
115void text_t::append (const text_t &t)
116{
117 text.insert(text.end(), t.begin(), t.end());
118}
119
120void text_t::appendrange (iterator first, iterator last)
121{
122 text.insert(text.end(), first, last);
123}
124
125void text_t::appendrange (const_iterator first, const_iterator last)
126{
127 text.insert(text.end(), first, last);
128}
129
130void text_t::appendint (int i)
131{
132 // deal with zeros and negatives
133 if (i == 0)
134 {
135 text.push_back('0');
136 return;
137 }
138 else if (i < 0)
139 {
140 text.push_back('-');
141 i *= -1;
142 }
143
144 // get a buffer for the conversion
145 int maxbuflen = sizeof(int)*3;
146 char *buf = new char[maxbuflen];
147 int len = 0;
148
149 // get the number in reverse
150 while (i > 0)
151 {
152 buf[len++] = '0'+ (i%10);
153 i = i/10;
154 }
155
156 // reverse the number
157 while (len > 0)
158 {
159 text.push_back(buf[--len]);
160 }
161
162 delete []buf;
163}
164
165int text_t::getint () const
166{
167 int i = 0;
168 int mult = 1; // become -1 for negative numbers
169
170 const_iterator here = text.begin();
171 const_iterator end = text.end();
172
173 // do plus and minus signs
174 if (here != end)
175 {
176 if (*here == '-')
177 {
178 mult = -1;
179 ++here;
180 }
181 else if (*here == '+')
182 {
183 mult = 1;
184 ++here;
185 }
186 }
187
188 // deal with the number
189 while ((here != end) && (*here >= '0') && (*here <= '9'))
190 {
191 i = 10*i + (*here - '0');
192 ++here;
193 }
194
195 i *= mult;
196 return i;
197}
198
199unsigned long text_t::getulong () const
200{
201 unsigned long i = 0;
202
203 const_iterator here = text.begin();
204 const_iterator end = text.end();
205
206 while ((here != end) && (*here >= '0') && (*here <= '9'))
207 {
208 i = 10*i + (*here - '0');
209 ++here;
210 }
211
212 return i;
213}
214
215void text_t::appendcarr (const char *s, size_type len)
216{
217 unsigned char *us = (unsigned char *)s;
218 if (text.capacity() < (text.size() + len + 2)) {
219 text.reserve(text.size() + len + 2);
220 }
221
222 while (len > 0)
223 {
224 text.push_back (*us); // append this character
225 ++us;
226 --len;
227 }
228}
229
230void text_t::appendcstr (const char *s)
231{
232 size_t len = strlen(s);
233 if (text.capacity() < (text.size() + len + 2)) {
234 text.reserve(text.size() + len + 2);
235 }
236
237 unsigned char *us = (unsigned char *)s;
238 while (*us != '\0')
239 {
240 text.push_back (*us); // append this character
241 ++us;
242 }
243}
244
245
246// strings returned from getcarr and getcstr become the caller's
247// responsibility and should be deallocated with "delete []"
248
249char *text_t::getcarr(size_type &len) const
250{
251 unsigned char *cstr = new unsigned char[size()];
252 len = 0;
253
254 const_iterator ithere = begin();
255 const_iterator itend = end();
256 while (ithere != itend)
257 {
258 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
259 else {
260 // put a space or a question mark depending on what
261 // the character is. Question marks tell the user that
262 // they are missing some information.
263 if (is_unicode_space (*ithere)) cstr[len] = ' ';
264 else cstr[len] = '?';
265 }
266 ++len;
267 ++ithere;
268 }
269
270 return (char *)cstr;
271}
272
273char *text_t::getcstr() const
274{
275 unsigned char *cstr = new unsigned char[size() + 1];
276 const_iterator ithere = begin();
277 const_iterator itend = end();
278 int len = 0;
279
280 while (ithere != itend)
281 {
282 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
283 else {
284 // put a space or a question mark depending on what
285 // the character is. Question marks tell the user that
286 // they are missing some information.
287 if (is_unicode_space (*ithere)) cstr[len] = ' ';
288 else cstr[len] = '?';
289 }
290 ++len;
291 ++ithere;
292 }
293
294 cstr[len] = '\0';
295
296 return (char *)cstr;
297}
298
299
300int text_t::replace(text_t toreplace, text_t replacement)
301{
302 // Get the beginning and end of the current text
303 text_t::iterator text_begin = text.begin(), text_end = text.end();
304 int count = 0;
305 text_t new_text, temp_text;
306
307 // Loop through and grab the text off the end
308 while (text_begin < text_end)
309 {
310 // Find where the next toreplace is
311 text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);
312
313 // Grab the string up to it
314 temp_text = substr(text_begin, next_toreplace);
315
316 // Add the new string onto the end
317 if (new_text.empty())
318 {
319 new_text.append(temp_text);
320 }
321 else
322 {
323 new_text.append(replacement + temp_text);
324 }
325
326 // Finally, we need to move the current pointer up to the new position
327 text_begin = next_toreplace + 1;
328 count++;
329 }
330
331 text.clear();
332 text = new_text.text_as_usvector();
333 return count;
334}
335
336
337// general functions which work on text_ts
338
339// find a character within a range
340text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
341 unsigned short c)
342{
343 while (first != last)
344 {
345 if (*first == c) break;
346 ++first;
347 }
348 return first;
349}
350
351text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
352 unsigned short c)
353{
354 while (first != last)
355 {
356 if (*first == c) break;
357 ++first;
358 }
359 return first;
360}
361
362text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last,
363 unsigned short c)
364{
365 text_t::iterator current = last;
366 while (current != first) {
367 if (*current == c) break;
368 --current;
369 }
370 if (current == first) {
371 if (*current == c) return current;
372 return last;
373 }
374
375 return current;
376}
377
378text_t::iterator findword (text_t::iterator first,
379 text_t::iterator last,
380 const text_t& word)
381{
382 text_t::const_iterator word_begin = word.begin();
383 text_t::const_iterator word_end = word.end();
384
385 while (first != last)
386 {
387 text_t::iterator char_match = first;
388 text_t::const_iterator word_here = word_begin;
389 while (word_here!=word_end)
390 {
391 if (*char_match != *word_here)
392 {
393 break;
394 }
395 ++char_match;
396 ++word_here;
397 }
398 if (word_here==word_end)
399 {
400 return first;
401 }
402 ++first;
403 }
404 return last; // get to here only if there is no match
405}
406
407// get a string up to the next delimiter (which is skipped)
408text_t::const_iterator getdelimitstr (text_t::const_iterator first,
409 text_t::const_iterator last,
410 unsigned short c, text_t &outstr)
411{
412 text_t::const_iterator here = first;
413 here = findchar (first, last, c);
414 outstr.clear();
415 outstr.appendrange (first, here);
416 if (here != last) ++here; // skip c
417 return here;
418}
419
420text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
421 unsigned short c, text_t &outstr)
422{
423 text_t::iterator here = first;
424 here = findchar (first, last, c);
425 outstr.clear();
426 outstr.appendrange (first, here);
427 if (here != last) ++here; // skip c
428 return here;
429}
430
431// split a string with a character
432void splitchar (text_t::const_iterator first, text_t::const_iterator last,
433 unsigned short c, text_tset &outlist)
434{
435 outlist.erase(outlist.begin(), outlist.end());
436
437 text_t t;
438
439 while (first != last)
440 {
441 first = getdelimitstr (first, last, c, t);
442 outlist.insert (t);
443 }
444}
445
446void splitchar (text_t::const_iterator first, text_t::const_iterator last,
447 unsigned short c, text_tlist &outlist)
448{
449 outlist.erase(outlist.begin(), outlist.end());
450
451 text_t t;
452
453 while (first != last)
454 {
455 first = getdelimitstr (first, last, c, t);
456 outlist.push_back (t);
457 }
458}
459
460void splitchar (text_t::const_iterator first, text_t::const_iterator last,
461 unsigned short c, text_tarray &outlist)
462{
463 outlist.erase(outlist.begin(), outlist.end());
464
465 text_t t;
466
467 while (first != last)
468 {
469 first = getdelimitstr (first, last, c, t);
470 outlist.push_back (t);
471 }
472}
473
474// join a string using a character
475void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
476{
477 outtext.clear ();
478
479 text_tset::const_iterator here = inlist.begin ();
480 text_tset::const_iterator end = inlist.end ();
481
482 if (here != end) {
483 outtext += *here; ++here;
484 while (here != end) {
485 outtext.push_back (c);
486 outtext += *here;
487 ++here;
488 }
489 }
490}
491
492void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
493{
494 outtext.clear ();
495
496 text_tlist::const_iterator here = inlist.begin ();
497 text_tlist::const_iterator end = inlist.end ();
498 if (here != end) {
499 outtext += *here; ++here;
500 while (here != end) {
501 outtext.push_back (c);
502 outtext += *here;
503 ++here;
504 }
505 }
506}
507
508void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
509{
510 outtext.clear ();
511
512 text_tarray::const_iterator here = inlist.begin ();
513 text_tarray::const_iterator end = inlist.end ();
514 if (here != end) {
515 outtext += *here; ++here;
516 while (here != end) {
517 outtext.push_back (c);
518 outtext += *here;
519 ++here;
520 }
521 }
522}
523
524void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
525{
526 outtext.clear ();
527
528 text_tlist::const_iterator here = inlist.begin ();
529 text_tlist::const_iterator end = inlist.end ();
530 if (here != end) {
531 outtext += *here; ++here;
532 while (here != end) {
533 outtext += c;
534 outtext += *here;
535 ++here;
536 }
537 }
538}
539
540void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
541{
542 outtext.clear ();
543
544 text_tset::const_iterator here = inlist.begin ();
545 text_tset::const_iterator end = inlist.end ();
546 if (here != end) {
547 outtext += *here; ++here;
548 while (here != end) {
549 outtext += c;
550 outtext += *here;
551 ++here;
552 }
553 }
554}
555
556void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
557{
558 outtext.clear ();
559
560 text_tarray::const_iterator here = inlist.begin ();
561 text_tarray::const_iterator end = inlist.end ();
562 if (here != end) {
563 outtext += *here; ++here;
564 while (here != end) {
565 outtext += c;
566 outtext += *here;
567 ++here;
568 }
569 }
570}
571
572// count the occurances of a character within a range
573int countchar (text_t::const_iterator first, text_t::const_iterator last,
574 unsigned short c)
575{
576 int count = 0;
577 while (first != last) {
578 if (*first == c) ++count;
579 ++first;
580 }
581 return count;
582}
583
584// return a substring of string from first up to but not including last
585text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
586
587 text_t substr; substr.reserve(last - first + 2);
588 while (first != last) {
589 substr.push_back(*first);
590 ++first;
591 }
592 return substr;
593}
594
595
596// convert to lowercase
597void lc (text_t::iterator first, text_t::iterator last) {
598 while (first != last) {
599 *first = unicode_tolower(*first);
600 ++first;
601 }
602}
603
604// convert to uppercase
605void uc (text_t::iterator first, text_t::iterator last) {
606 while (first != last) {
607 *first = unicode_toupper(*first);
608 ++first;
609 }
610}
611
612
613// checks to see if it is a number (i.e. contains only 0-9)
614bool is_number (const text_t &text) {
615
616 text_t::const_iterator here = text.begin();
617 text_t::const_iterator end = text.end();
618
619 while (here != end) {
620 if ((*here!='0') && (*here!='1') && (*here!='2') &&
621 (*here!='3') && (*here!='4') && (*here!='5') &&
622 (*here!='6') && (*here!='7') && (*here!='8') &&
623 (*here!='9')) return false;
624 ++here;
625 }
626 return true;
627}
628
629
630// checks to see if the text has any letters or digits
631bool has_unicode_letdig (const text_t &text) {
632 if (text.empty()) return false;
633
634 text_t::const_iterator here = text.begin();
635 text_t::const_iterator end = text.end();
636 while (here != end) {
637 if (is_unicode_letdig (*here)) return true;
638 ++here;
639 }
640
641 return false;
642}
643
644// checks to see if a text_t starts with the specified prefix
645bool starts_with(const text_t& text, const text_t& prefix) {
646 if (prefix.empty()) return true;
647 if (text.empty() || text.size()<prefix.size()) return false;
648 text_t substring = substr(text.begin(), text.begin()+prefix.size());
649 return substring == prefix;
650}
651// checks to see if a text_t ends with the specified suffix
652bool ends_with(const text_t& text, const text_t& suffix) {
653 if (suffix.empty()) return true;
654 if (text.empty() || text.size() < suffix.size()) return false;
655 text_t substring = substr(text.end()-suffix.size(),text.end());
656 return substring == suffix;
657
658}
659
660
661////////////////////////////////////
662// convertclass methods
663////////////////////////////////////
664
665// conversion classes used for getting information in to and out of
666// the text_t class.
667
668convertclass::convertclass ()
669{
670 // nothing to do
671}
672
673void convertclass::reset ()
674{
675 // nothing to do
676}
677
678
679////////////////////////////////////
680// inconvertclass methods
681////////////////////////////////////
682
683// convert from a char stream to the text_t class
684// the default version assumes the input is an ASCII
685// character array
686
687inconvertclass::inconvertclass ()
688{
689 start = NULL;
690 len = 0;
691}
692
693
694void inconvertclass::reset ()
695{
696 start = NULL;
697 len = 0;
698}
699
700void inconvertclass::setinput (char *thestart, size_t thelen)
701{
702 start = thestart;
703 len = thelen;
704}
705
706void inconvertclass::convert (text_t &output, status_t &status)
707{
708 output.clear();
709
710 if (start == NULL || len == 0)
711 {
712 status = finished;
713 return;
714 }
715
716 if (output.capacity() < len + 2)
717 output.reserve(len + 2);
718
719 // don't want any funny sign conversions happening
720 unsigned char *here = (unsigned char *)start;
721 while (len > 0)
722 {
723 output.push_back (*here); // append this character
724 ++here;
725 --len;
726 }
727
728 start = (char *)here; // save current position
729 status = finished;
730}
731
732// will treat the text_t as a 8-bit string and convert
733// it to a 16-bit string using the about convert method.
734text_t inconvertclass::convert (const text_t &t) {
735 text_t out;
736 text_t tmpout;
737 status_t status;
738 text_t::const_iterator here = t.begin();
739 text_t::const_iterator end = t.end();
740 unsigned char cbuf[256];
741 size_t cbuflen = 0;
742
743 out.clear();
744 if (out.capacity() < t.size() + 2)
745 out.reserve(t.size() + 2);
746 while (here != end) {
747 while (here != end && cbuflen < 256) {
748 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
749 ++here;
750 }
751
752 if (cbuflen > 0) {
753 setinput ((char *)cbuf, cbuflen);
754 status = unfinished;
755 while (status == unfinished) {
756 convert (tmpout, status);
757 out += tmpout;
758 }
759 cbuflen = 0;
760 }
761 }
762
763 out.setencoding (0); // unicode
764
765 return out;
766}
767
768// an instance of the default inconvertclass to do simple
769// conversions. Note that any functions that use this are
770// not reentrant. If a function needs to be reentrant it
771// should declare its own instance.
772inconvertclass ascii2text_t;
773
774
775////////////////////////////////////
776// outconvertclass methods
777////////////////////////////////////
778
779// Convert from a text_t class to a char stream
780// This default version assumes the output is an ASCII
781// character array. If you set the output stream you
782// can use this class to output to a stream using the
783// << operator. The << operator can also be conveniently
784// used to set the output stream by doing something like
785//
786// cout << text_t2ascii << text_tstr << anothertext_tstr;
787//
788outconvertclass::outconvertclass ()
789{
790 input = NULL;
791 outs = NULL;
792}
793
794void outconvertclass::reset ()
795{
796 input = NULL;
797 outs = NULL;
798}
799
800void outconvertclass::setinput (text_t *theinput)
801{
802 input = theinput;
803 if (input != NULL) texthere = input->begin();
804}
805
806void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
807{
808 input = theinput;
809 texthere = thetexthere;
810}
811
812void outconvertclass::convert (char *output, size_t maxlen,
813 size_t &len, status_t &status)
814{
815 if (input == NULL || output == NULL)
816 {
817 status = finished;
818 return;
819 }
820
821 // don't want any funny sign conversions happening
822 unsigned char *uoutput = (unsigned char *)output;
823 text_t::iterator textend = input->end();
824 len = 0;
825 while ((len < maxlen) && (texthere != textend))
826 {
827 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
828 else {
829 // put a space or a question mark depending on what
830 // the character is. Question marks tell the user that
831 // they are missing some information.
832 if (is_unicode_space (*texthere)) *uoutput = ' ';
833 else *uoutput = '?';
834 }
835 ++uoutput;
836 ++len;
837 ++texthere;
838 }
839
840 if (texthere == textend) status = finished;
841 else status = unfinished;
842}
843
844// will convert the 16-bit string to a 8-bit stream
845// and place the result in a text_t. This method uses
846// the above convert function.
847text_t outconvertclass::convert (const text_t &t) {
848 text_t out;
849 unsigned char cbuf[256];
850 size_t cbuflen = 0;
851 status_t status = unfinished;
852
853 out.clear();
854 if (out.capacity() < t.size() + 2)
855 out.reserve(t.size() + 2);
856 setinput ((text_t *)&t); // discard constant
857 while (status == unfinished) {
858 convert ((char *)cbuf, 256, cbuflen, status);
859 out.appendcarr ((char *)cbuf, cbuflen);
860 }
861
862 out.setencoding (1); // other encoding
863
864 return out;
865}
866
867
868void outconvertclass::setostream (ostream *theouts)
869{
870 outs = theouts;
871}
872
873ostream *outconvertclass::getostream ()
874{
875 return outs;
876}
877
878
879
880
881// an instance of the default outconvertclass to do simple
882// conversions
883outconvertclass text_t2ascii;
884
885
886
887// stream operators for the output class
888
889outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
890{
891 outconverter.setostream(&theouts);
892 return outconverter;
893}
894
895
896#define STREAMBUFSIZE 256
897outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
898{
899 ostream *outstream = outconverter.getostream();
900
901 if (outstream == NULL) return outconverter;
902
903 char outbuf[STREAMBUFSIZE];
904 size_t len;
905 outconvertclass::status_t status = outconvertclass::unfinished;
906
907 // assume that there is no data needing converting
908 // left in the converter
909 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
910
911 while (status == outconvertclass::unfinished)
912 {
913 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
914 if (len > 0) outstream->write(outbuf, len);
915 }
916
917 return outconverter;
918}
Note: See TracBrowser for help on using the repository browser.