source: gsdl/trunk/common-src/src/lib/text_t.cpp@ 18821

Last change on this file since 18821 was 18821, checked in by mdewsnip, 15 years ago

Change to findlastchar() so if the same character is passed as first and last_plus_one values it won't run backwards and likely segfault.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.1 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 18821 2009-03-26 23:16:45Z mdewsnip $
25 *
26 *********************************************************************/
27
28#include "text_t.h"
29
30#if defined(GSDL_USE_OBJECTSPACE)
31# include <ospace\std\algorithm>
32#elif defined(GSDL_USE_STL_H)
33# if defined(GSDL_USE_ALGO_H)
34# include <algo.h>
35# else
36# include <algorithm.h>
37# endif
38#else
39# include <algorithm>
40#endif
41
42#ifdef HAVE_CONFIG_H
43# ifdef __WIN32__
44# include "win32cfg.h"
45# else
46# include "config.h"
47# endif
48#endif
49
50
51#include "unitool.h"
52
53const text_t g_EmptyText(""); // file-scope constant text_t built from the empty C string
54
55////////////////////////////////////
56// text_t methods
57////////////////////////////////////
58
59// new stream converter ...
60ostream& operator<< (ostream &o, const text_t &text)
61{
62 text_t::const_iterator ithere = text.begin();
63 text_t::const_iterator itend = text.end();
64
65 while (ithere != itend)
66 {
67 if (*ithere < 256)
68 {
69 o << (unsigned char)(*ithere);
70 }
71 else
72 {
73 // put a space or a question mark depending on what
74 // the character is. Question marks tell the user that
75 // they are missing some information.
76 if (is_unicode_space (*ithere))
77 o << ' ';
78 else
79 o << '?';
80 }
81 ++ithere;
82 }
83
84 return o;
85}
86
87text_t::text_t ()
88{
89 setencoding(0);
90 clear ();
91}
92
93text_t::text_t (int i)
94{
95 setencoding(0);
96 clear ();
97 appendint (i);
98}
99
100text_t::text_t (const char *s)
101{
102 setencoding(0);
103 clear ();
104 appendcstr (s);
105}
106
107text_t::text_t (const char *s, size_type nLength)
108{
109 setencoding(0);
110 clear ();
111 appendcarr(s, nLength);
112}
113
114
115void text_t::append (const text_t &t)
116{
117 text.insert(text.end(), t.begin(), t.end());
118}
119
120void text_t::appendrange (iterator first, iterator last)
121{
122 text.insert(text.end(), first, last);
123}
124
125void text_t::appendrange (const_iterator first, const_iterator last)
126{
127 text.insert(text.end(), first, last);
128}
129
130void text_t::appendint (int i)
131{
132 // deal with zeros and negatives
133 if (i == 0)
134 {
135 text.push_back('0');
136 return;
137 }
138 else if (i < 0)
139 {
140 text.push_back('-');
141 i *= -1;
142 }
143
144 // get a buffer for the conversion
145 int maxbuflen = sizeof(int)*3;
146 char *buf = new char[maxbuflen];
147 int len = 0;
148
149 // get the number in reverse
150 while (i > 0)
151 {
152 buf[len++] = '0'+ (i%10);
153 i = i/10;
154 }
155
156 // reverse the number
157 while (len > 0)
158 {
159 text.push_back(buf[--len]);
160 }
161
162 delete []buf;
163}
164
165int text_t::getint () const
166{
167 int i = 0;
168 int mult = 1; // become -1 for negative numbers
169
170 const_iterator here = text.begin();
171 const_iterator end = text.end();
172
173 // do plus and minus signs
174 if (here != end)
175 {
176 if (*here == '-')
177 {
178 mult = -1;
179 ++here;
180 }
181 else if (*here == '+')
182 {
183 mult = 1;
184 ++here;
185 }
186 }
187
188 // deal with the number
189 while ((here != end) && (*here >= '0') && (*here <= '9'))
190 {
191 i = 10*i + (*here - '0');
192 ++here;
193 }
194
195 i *= mult;
196 return i;
197}
198
199unsigned long text_t::getulong () const
200{
201 unsigned long i = 0;
202
203 const_iterator here = text.begin();
204 const_iterator end = text.end();
205
206 while ((here != end) && (*here >= '0') && (*here <= '9'))
207 {
208 i = 10*i + (*here - '0');
209 ++here;
210 }
211
212 return i;
213}
214
215void text_t::appendcarr (const char *s, size_type len)
216{
217 unsigned char *us = (unsigned char *)s;
218 if (text.capacity() < (text.size() + len + 2)) {
219 text.reserve(text.size() + len + 2);
220 }
221
222 while (len > 0)
223 {
224 text.push_back (*us); // append this character
225 ++us;
226 --len;
227 }
228}
229
230void text_t::appendcstr (const char *s)
231{
232 size_t len = strlen(s);
233 if (text.capacity() < (text.size() + len + 2)) {
234 text.reserve(text.size() + len + 2);
235 }
236
237 unsigned char *us = (unsigned char *)s;
238 while (*us != '\0')
239 {
240 text.push_back (*us); // append this character
241 ++us;
242 }
243}
244
245
246// strings returned from getcarr and getcstr become the caller's
247// responsibility and should be deallocated with "delete []"
248
249char *text_t::getcarr(size_type &len) const
250{
251 unsigned char *cstr = new unsigned char[size()];
252 len = 0;
253
254 const_iterator ithere = begin();
255 const_iterator itend = end();
256 while (ithere != itend)
257 {
258 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
259 else {
260 // put a space or a question mark depending on what
261 // the character is. Question marks tell the user that
262 // they are missing some information.
263 if (is_unicode_space (*ithere)) cstr[len] = ' ';
264 else cstr[len] = '?';
265 }
266 ++len;
267 ++ithere;
268 }
269
270 return (char *)cstr;
271}
272
273char *text_t::getcstr() const
274{
275 unsigned char *cstr = new unsigned char[size() + 1];
276 const_iterator ithere = begin();
277 const_iterator itend = end();
278 int len = 0;
279
280 while (ithere != itend)
281 {
282 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
283 else {
284 // put a space or a question mark depending on what
285 // the character is. Question marks tell the user that
286 // they are missing some information.
287 if (is_unicode_space (*ithere)) cstr[len] = ' ';
288 else cstr[len] = '?';
289 }
290 ++len;
291 ++ithere;
292 }
293
294 cstr[len] = '\0';
295
296 return (char *)cstr;
297}
298
299
300int text_t::replace(text_t toreplace, text_t replacement)
301{
302 // Get the beginning and end of the current text
303 text_t::iterator text_begin = text.begin(), text_end = text.end();
304 int count = 0;
305 text_t new_text, temp_text;
306
307 // Loop through and grab the text off the end
308 while (text_begin < text_end)
309 {
310 // Find where the next toreplace is
311 text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);
312
313 // We've found a match
314 if (next_toreplace != text_end)
315 {
316 new_text.append(substr(text_begin, next_toreplace));
317 new_text.append(replacement);
318 count++;
319 text_begin = next_toreplace + toreplace.size();
320 }
321 // We haven't found a match
322 else
323 {
324 new_text.append(substr(text_begin, text_end));
325 text_begin = text_end;
326 }
327 }
328
329 text.clear();
330 text = new_text.text_as_usvector();
331 return count;
332}
333
334
335// general functions which work on text_ts
336
337// find a character within a range
338text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
339 unsigned short c)
340{
341 while (first != last)
342 {
343 if (*first == c) break;
344 ++first;
345 }
346 return first;
347}
348
349text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
350 unsigned short c)
351{
352 while (first != last)
353 {
354 if (*first == c) break;
355 ++first;
356 }
357 return first;
358}
359
360text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one,
361 unsigned short c)
362{
363 text_t::iterator current = (last_plus_one != first) ? last_plus_one - 1 : first;
364 while (current != first) {
365 if (*current == c) break;
366 --current;
367 }
368 if (current == first) {
369 if (*current == c) return current;
370 return last_plus_one;
371 }
372
373 return current;
374}
375
376text_t::const_iterator findword (text_t::const_iterator first,
377 text_t::const_iterator last,
378 const text_t& word)
379{
380 text_t::const_iterator word_begin = word.begin();
381 text_t::const_iterator word_end = word.end();
382
383 while (first != last)
384 {
385 text_t::const_iterator char_match = first;
386 text_t::const_iterator word_here = word_begin;
387 while (word_here!=word_end)
388 {
389 if (*char_match != *word_here)
390 {
391 break;
392 }
393 ++char_match;
394 ++word_here;
395 }
396 if (word_here==word_end)
397 {
398 return first;
399 }
400 ++first;
401 }
402 return last; // get to here only if there is no match
403}
404
405text_t::iterator findword (text_t::iterator first,
406 text_t::iterator last,
407 const text_t& word)
408{
409 text_t::const_iterator word_begin = word.begin();
410 text_t::const_iterator word_end = word.end();
411
412 while (first != last)
413 {
414 text_t::iterator char_match = first;
415 text_t::const_iterator word_here = word_begin;
416 while (word_here!=word_end)
417 {
418 if (*char_match != *word_here)
419 {
420 break;
421 }
422 ++char_match;
423 ++word_here;
424 }
425 if (word_here==word_end)
426 {
427 return first;
428 }
429 ++first;
430 }
431 return last; // get to here only if there is no match
432}
433
434// get a string up to the next delimiter (which is skipped)
435text_t::const_iterator getdelimitstr (text_t::const_iterator first,
436 text_t::const_iterator last,
437 unsigned short c, text_t &outstr)
438{
439 text_t::const_iterator here = first;
440 here = findchar (first, last, c);
441 outstr.clear();
442 outstr.appendrange (first, here);
443 if (here != last) ++here; // skip c
444 return here;
445}
446
447text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
448 unsigned short c, text_t &outstr)
449{
450 text_t::iterator here = first;
451 here = findchar (first, last, c);
452 outstr.clear();
453 outstr.appendrange (first, here);
454 if (here != last) ++here; // skip c
455 return here;
456}
457
458text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
459 text_t w, text_t &outstr)
460{
461 text_t::const_iterator here = first;
462 here = findword (first, last, w);
463 outstr.clear();
464 outstr.appendrange (first, here);
465 if (here != last) here += w.size(); // skip w
466 return here;
467}
468
469// split a string with a character
470void splitchar (text_t::const_iterator first, text_t::const_iterator last,
471 unsigned short c, text_tset &outlist)
472{
473 outlist.erase(outlist.begin(), outlist.end());
474
475 text_t t;
476
477 while (first != last)
478 {
479 first = getdelimitstr (first, last, c, t);
480 outlist.insert (t);
481 }
482}
483
484void splitchar (text_t::const_iterator first, text_t::const_iterator last,
485 unsigned short c, text_tlist &outlist)
486{
487 outlist.erase(outlist.begin(), outlist.end());
488
489 text_t t;
490
491 while (first != last)
492 {
493 first = getdelimitstr (first, last, c, t);
494 outlist.push_back (t);
495 }
496}
497
498void splitchar (text_t::const_iterator first, text_t::const_iterator last,
499 unsigned short c, text_tarray &outlist)
500{
501 outlist.erase(outlist.begin(), outlist.end());
502
503 text_t t;
504
505 while (first != last)
506 {
507 first = getdelimitstr (first, last, c, t);
508 outlist.push_back (t);
509 }
510}
511
512void splitword (text_t::const_iterator first, text_t::const_iterator last,
513 text_t w, text_tlist &outlist)
514{
515 outlist.erase(outlist.begin(), outlist.end());
516
517 text_t t;
518
519 while (first != last)
520 {
521 first = getdelimitstr (first, last, w, t);
522 outlist.push_back (t);
523 }
524}
525
526// join a string using a character
527void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
528{
529 outtext.clear ();
530
531 text_tset::const_iterator here = inlist.begin ();
532 text_tset::const_iterator end = inlist.end ();
533
534 if (here != end) {
535 outtext += *here; ++here;
536 while (here != end) {
537 outtext.push_back (c);
538 outtext += *here;
539 ++here;
540 }
541 }
542}
543
544void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
545{
546 outtext.clear ();
547
548 text_tlist::const_iterator here = inlist.begin ();
549 text_tlist::const_iterator end = inlist.end ();
550 if (here != end) {
551 outtext += *here; ++here;
552 while (here != end) {
553 outtext.push_back (c);
554 outtext += *here;
555 ++here;
556 }
557 }
558}
559
560void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
561{
562 outtext.clear ();
563
564 text_tarray::const_iterator here = inlist.begin ();
565 text_tarray::const_iterator end = inlist.end ();
566 if (here != end) {
567 outtext += *here; ++here;
568 while (here != end) {
569 outtext.push_back (c);
570 outtext += *here;
571 ++here;
572 }
573 }
574}
575
576void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
577{
578 outtext.clear ();
579
580 text_tlist::const_iterator here = inlist.begin ();
581 text_tlist::const_iterator end = inlist.end ();
582 if (here != end) {
583 outtext += *here; ++here;
584 while (here != end) {
585 outtext += c;
586 outtext += *here;
587 ++here;
588 }
589 }
590}
591
592void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
593{
594 outtext.clear ();
595
596 text_tset::const_iterator here = inlist.begin ();
597 text_tset::const_iterator end = inlist.end ();
598 if (here != end) {
599 outtext += *here; ++here;
600 while (here != end) {
601 outtext += c;
602 outtext += *here;
603 ++here;
604 }
605 }
606}
607
608void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
609{
610 outtext.clear ();
611
612 text_tarray::const_iterator here = inlist.begin ();
613 text_tarray::const_iterator end = inlist.end ();
614 if (here != end) {
615 outtext += *here; ++here;
616 while (here != end) {
617 outtext += c;
618 outtext += *here;
619 ++here;
620 }
621 }
622}
623
624// count the occurances of a character within a range
625int countchar (text_t::const_iterator first, text_t::const_iterator last,
626 unsigned short c)
627{
628 int count = 0;
629 while (first != last) {
630 if (*first == c) ++count;
631 ++first;
632 }
633 return count;
634}
635
636// return a substring of string from first up to but not including last
637text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
638
639 text_t substr; substr.reserve(last - first + 2);
640 while (first != last) {
641 substr.push_back(*first);
642 ++first;
643 }
644 return substr;
645}
646
647
648// convert to lowercase
649void lc (text_t::iterator first, text_t::iterator last) {
650 while (first != last) {
651 *first = unicode_tolower(*first);
652 ++first;
653 }
654}
655
656// convert to uppercase
657void uc (text_t::iterator first, text_t::iterator last) {
658 while (first != last) {
659 *first = unicode_toupper(*first);
660 ++first;
661 }
662}
663
664
665// checks to see if it is a number (i.e. contains only 0-9)
666bool is_number (const text_t &text) {
667
668 text_t::const_iterator here = text.begin();
669 text_t::const_iterator end = text.end();
670
671 while (here != end) {
672 if ((*here!='0') && (*here!='1') && (*here!='2') &&
673 (*here!='3') && (*here!='4') && (*here!='5') &&
674 (*here!='6') && (*here!='7') && (*here!='8') &&
675 (*here!='9')) return false;
676 ++here;
677 }
678 return true;
679}
680
681
682// checks to see if the text has any letters or digits
683bool has_unicode_letdig (const text_t &text) {
684 if (text.empty()) return false;
685
686 text_t::const_iterator here = text.begin();
687 text_t::const_iterator end = text.end();
688 while (here != end) {
689 if (is_unicode_letdig (*here)) return true;
690 ++here;
691 }
692
693 return false;
694}
695
696// checks to see if a text_t starts with the specified prefix
697bool starts_with(const text_t& text, const text_t& prefix) {
698 if (prefix.empty()) return true;
699 if (text.empty() || text.size()<prefix.size()) return false;
700 text_t substring = substr(text.begin(), text.begin()+prefix.size());
701 return substring == prefix;
702}
703// checks to see if a text_t ends with the specified suffix
704bool ends_with(const text_t& text, const text_t& suffix) {
705 if (suffix.empty()) return true;
706 if (text.empty() || text.size() < suffix.size()) return false;
707 text_t substring = substr(text.end()-suffix.size(),text.end());
708 return substring == suffix;
709
710}
711
712
713////////////////////////////////////
714// convertclass methods
715////////////////////////////////////
716
717// conversion classes used for getting information in to and out of
718// the text_t class.
719
720convertclass::convertclass ()
721{
722 // nothing to do
723}
724
725void convertclass::reset ()
726{
727 // nothing to do
728}
729
730
731////////////////////////////////////
732// inconvertclass methods
733////////////////////////////////////
734
735// convert from a char stream to the text_t class
736// the default version assumes the input is an ascii
737// character array
738
739inconvertclass::inconvertclass ()
740{
741 start = NULL;
742 len = 0;
743}
744
745
746void inconvertclass::reset ()
747{
748 start = NULL;
749 len = 0;
750}
751
752void inconvertclass::setinput (char *thestart, size_t thelen)
753{
754 start = thestart;
755 len = thelen;
756}
757
758void inconvertclass::convert (text_t &output, status_t &status)
759{
760 output.clear();
761
762 if (start == NULL || len == 0)
763 {
764 status = finished;
765 return;
766 }
767
768 if (output.capacity() < len + 2)
769 output.reserve(len + 2);
770
771 // don't want any funny sign conversions happening
772 unsigned char *here = (unsigned char *)start;
773 while (len > 0)
774 {
775 output.push_back (*here); // append this character
776 ++here;
777 --len;
778 }
779
780 start = (char *)here; // save current position
781 status = finished;
782}
783
784// will treat the text_t as a 8-bit string and convert
785// it to a 16-bit string using the about convert method.
786text_t inconvertclass::convert (const text_t &t) {
787 text_t out;
788 text_t tmpout;
789 status_t status;
790 text_t::const_iterator here = t.begin();
791 text_t::const_iterator end = t.end();
792 unsigned char cbuf[256];
793 size_t cbuflen = 0;
794
795 out.clear();
796 if (out.capacity() < t.size() + 2)
797 out.reserve(t.size() + 2);
798 while (here != end) {
799 while (here != end && cbuflen < 256) {
800 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
801 ++here;
802 }
803
804 if (cbuflen > 0) {
805 setinput ((char *)cbuf, cbuflen);
806 status = unfinished;
807 while (status == unfinished) {
808 convert (tmpout, status);
809 out += tmpout;
810 }
811 cbuflen = 0;
812 }
813 }
814
815 out.setencoding (0); // unicode
816
817 return out;
818}
819
820// A file-scope, default-constructed inconvertclass instance for simple
821// conversions. Note that any functions that use this are
822// not reentrant. If a function needs to be reentrant it
823// should declare its own instance.
824inconvertclass ascii2text_t;
825
826
827////////////////////////////////////
828// outconvertclass methods
829////////////////////////////////////
830
831// Convert from a text_t class to a char stream
832// This default version assumes the output is an ascii
833// character array. If you set the output stream you
834// can use this class to output to a stream using the
835// << operator. The << operator can also be conveniently
836// used to set the output stream by doing something like
837//
838// cout << text_t2ascii << text_tstr << anothertext_tstr;
839//
840outconvertclass::outconvertclass ()
841{
842 input = NULL;
843 outs = NULL;
844}
845
846void outconvertclass::reset ()
847{
848 input = NULL;
849 outs = NULL;
850}
851
852void outconvertclass::setinput (text_t *theinput)
853{
854 input = theinput;
855 if (input != NULL) texthere = input->begin();
856}
857
858void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
859{
860 input = theinput;
861 texthere = thetexthere;
862}
863
864void outconvertclass::convert (char *output, size_t maxlen,
865 size_t &len, status_t &status)
866{
867 if (input == NULL || output == NULL)
868 {
869 status = finished;
870 return;
871 }
872
873 // don't want any funny sign conversions happening
874 unsigned char *uoutput = (unsigned char *)output;
875 text_t::iterator textend = input->end();
876 len = 0;
877 while ((len < maxlen) && (texthere != textend))
878 {
879 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
880 else {
881 // put a space or a question mark depending on what
882 // the character is. Question marks tell the user that
883 // they are missing some information.
884 if (is_unicode_space (*texthere)) *uoutput = ' ';
885 else *uoutput = '?';
886 }
887 ++uoutput;
888 ++len;
889 ++texthere;
890 }
891
892 if (texthere == textend) status = finished;
893 else status = unfinished;
894}
895
896// will convert the 16-bit string to a 8-bit stream
897// and place the result in a text_t. This method uses
898// the above convert function.
899text_t outconvertclass::convert (const text_t &t) {
900 text_t out;
901 unsigned char cbuf[256];
902 size_t cbuflen = 0;
903 status_t status = unfinished;
904
905 out.clear();
906 if (out.capacity() < t.size() + 2)
907 out.reserve(t.size() + 2);
908 setinput ((text_t *)&t); // discard constant
909 while (status == unfinished) {
910 convert ((char *)cbuf, 256, cbuflen, status);
911 out.appendcarr ((char *)cbuf, cbuflen);
912 }
913
914 out.setencoding (1); // other encoding
915
916 return out;
917}
918
919
920void outconvertclass::setostream (ostream *theouts)
921{
922 outs = theouts;
923}
924
925ostream *outconvertclass::getostream ()
926{
927 return outs;
928}
929
930
931
932
933// A file-scope, default-constructed outconvertclass instance for simple
934// conversions
935outconvertclass text_t2ascii;
936
937
938
939// stream operators for the output class
940
941outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
942{
943 outconverter.setostream(&theouts);
944 return outconverter;
945}
946
947
948#define STREAMBUFSIZE 256
949outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
950{
951 ostream *outstream = outconverter.getostream();
952
953 if (outstream == NULL) return outconverter;
954
955 char outbuf[STREAMBUFSIZE];
956 size_t len;
957 outconvertclass::status_t status = outconvertclass::unfinished;
958
959 // assume that there is no data needing converting
960 // left in the converter
961 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
962
963 while (status == outconvertclass::unfinished)
964 {
965 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
966 if (len > 0) outstream->write(outbuf, len);
967 }
968
969 return outconverter;
970}
Note: See TracBrowser for help on using the repository browser.