source: gsdl/trunk/common-src/src/lib/text_t.cpp@ 20762

Last change on this file since 20762 was 20762, checked in by mdewsnip, 15 years ago

Fixed memory bugs (causing crashes on some Windows machines) in findword(), identified by Valgrind.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.2 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 20762 2009-09-29 22:54:21Z mdewsnip $
25 *
26 *********************************************************************/
27
28#include "text_t.h"
29
30#if defined(GSDL_USE_OBJECTSPACE)
31# include <ospace\std\algorithm>
32#elif defined(GSDL_USE_STL_H)
33# if defined(GSDL_USE_ALGO_H)
34# include <algo.h>
35# else
36# include <algorithm.h>
37# endif
38#else
39# include <algorithm>
40#endif
41
42#ifdef HAVE_CONFIG_H
43# ifdef __WIN32__
44# include "win32cfg.h"
45# else
46# include "config.h"
47# endif
48#endif
49
50#include <cstring>
51
52#include "unitool.h"
53
54const text_t g_EmptyText("");
55
56////////////////////////////////////
57// text_t methods
58////////////////////////////////////
59
60// new stream converter ...
61ostream& operator<< (ostream &o, const text_t &text)
62{
63 text_t::const_iterator ithere = text.begin();
64 text_t::const_iterator itend = text.end();
65
66 while (ithere != itend)
67 {
68 if (*ithere < 256)
69 {
70 o << (unsigned char)(*ithere);
71 }
72 else
73 {
74 // put a space or a question mark depending on what
75 // the character is. Question marks tell the user that
76 // they are missing some information.
77 if (is_unicode_space (*ithere))
78 o << ' ';
79 else
80 o << '?';
81 }
82 ++ithere;
83 }
84
85 return o;
86}
87
88text_t::text_t ()
89{
90 setencoding(0);
91 clear ();
92}
93
94text_t::text_t (int i)
95{
96 setencoding(0);
97 clear ();
98 appendint (i);
99}
100
101text_t::text_t (const char *s)
102{
103 setencoding(0);
104 clear ();
105 appendcstr (s);
106}
107
108text_t::text_t (const char *s, size_type nLength)
109{
110 setencoding(0);
111 clear ();
112 appendcarr(s, nLength);
113}
114
115
116void text_t::append (const text_t &t)
117{
118 text.insert(text.end(), t.begin(), t.end());
119}
120
121void text_t::appendrange (iterator first, iterator last)
122{
123 text.insert(text.end(), first, last);
124}
125
126void text_t::appendrange (const_iterator first, const_iterator last)
127{
128 text.insert(text.end(), first, last);
129}
130
131void text_t::appendint (int i)
132{
133 // deal with zeros and negatives
134 if (i == 0)
135 {
136 text.push_back('0');
137 return;
138 }
139 else if (i < 0)
140 {
141 text.push_back('-');
142 i *= -1;
143 }
144
145 // get a buffer for the conversion
146 int maxbuflen = sizeof(int)*3;
147 char *buf = new char[maxbuflen];
148 int len = 0;
149
150 // get the number in reverse
151 while (i > 0)
152 {
153 buf[len++] = '0'+ (i%10);
154 i = i/10;
155 }
156
157 // reverse the number
158 while (len > 0)
159 {
160 text.push_back(buf[--len]);
161 }
162
163 delete []buf;
164}
165
166int text_t::getint () const
167{
168 int i = 0;
169 int mult = 1; // become -1 for negative numbers
170
171 const_iterator here = text.begin();
172 const_iterator end = text.end();
173
174 // do plus and minus signs
175 if (here != end)
176 {
177 if (*here == '-')
178 {
179 mult = -1;
180 ++here;
181 }
182 else if (*here == '+')
183 {
184 mult = 1;
185 ++here;
186 }
187 }
188
189 // deal with the number
190 while ((here != end) && (*here >= '0') && (*here <= '9'))
191 {
192 i = 10*i + (*here - '0');
193 ++here;
194 }
195
196 i *= mult;
197 return i;
198}
199
200unsigned long text_t::getulong () const
201{
202 unsigned long i = 0;
203
204 const_iterator here = text.begin();
205 const_iterator end = text.end();
206
207 while ((here != end) && (*here >= '0') && (*here <= '9'))
208 {
209 i = 10*i + (*here - '0');
210 ++here;
211 }
212
213 return i;
214}
215
216void text_t::appendcarr (const char *s, size_type len)
217{
218 unsigned char *us = (unsigned char *)s;
219 if (text.capacity() < (text.size() + len + 2)) {
220 text.reserve(text.size() + len + 2);
221 }
222
223 while (len > 0)
224 {
225 text.push_back (*us); // append this character
226 ++us;
227 --len;
228 }
229}
230
231void text_t::appendcstr (const char *s)
232{
233 size_t len = strlen(s);
234 if (text.capacity() < (text.size() + len + 2)) {
235 text.reserve(text.size() + len + 2);
236 }
237
238 unsigned char *us = (unsigned char *)s;
239 while (*us != '\0')
240 {
241 text.push_back (*us); // append this character
242 ++us;
243 }
244}
245
246
247// strings returned from getcarr and getcstr become the caller's
248// responsibility and should be deallocated with "delete []"
249
250char *text_t::getcarr(size_type &len) const
251{
252 unsigned char *cstr = new unsigned char[size()];
253 len = 0;
254
255 const_iterator ithere = begin();
256 const_iterator itend = end();
257 while (ithere != itend)
258 {
259 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
260 else {
261 // put a space or a question mark depending on what
262 // the character is. Question marks tell the user that
263 // they are missing some information.
264 if (is_unicode_space (*ithere)) cstr[len] = ' ';
265 else cstr[len] = '?';
266 }
267 ++len;
268 ++ithere;
269 }
270
271 return (char *)cstr;
272}
273
274char *text_t::getcstr() const
275{
276 unsigned char *cstr = new unsigned char[size() + 1];
277 const_iterator ithere = begin();
278 const_iterator itend = end();
279 int len = 0;
280
281 while (ithere != itend)
282 {
283 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
284 else {
285 // put a space or a question mark depending on what
286 // the character is. Question marks tell the user that
287 // they are missing some information.
288 if (is_unicode_space (*ithere)) cstr[len] = ' ';
289 else cstr[len] = '?';
290 }
291 ++len;
292 ++ithere;
293 }
294
295 cstr[len] = '\0';
296
297 return (char *)cstr;
298}
299
300
301int text_t::replace(text_t toreplace, text_t replacement)
302{
303 // Get the beginning and end of the current text
304 text_t::iterator text_begin = text.begin(), text_end = text.end();
305 int count = 0;
306 text_t new_text, temp_text;
307
308 // Loop through and grab the text off the end
309 while (text_begin < text_end)
310 {
311 // Find where the next toreplace is
312 text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);
313
314 // We've found a match
315 if (next_toreplace != text_end)
316 {
317 new_text.append(substr(text_begin, next_toreplace));
318 new_text.append(replacement);
319 count++;
320 text_begin = next_toreplace + toreplace.size();
321 }
322 // We haven't found a match
323 else
324 {
325 new_text.append(substr(text_begin, text_end));
326 text_begin = text_end;
327 }
328 }
329
330 text.clear();
331 text = new_text.text_as_usvector();
332 return count;
333}
334
335
336// general functions which work on text_ts
337
338// find a character within a range
339text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
340 unsigned short c)
341{
342 while (first != last)
343 {
344 if (*first == c) break;
345 ++first;
346 }
347 return first;
348}
349
350text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
351 unsigned short c)
352{
353 while (first != last)
354 {
355 if (*first == c) break;
356 ++first;
357 }
358 return first;
359}
360
361text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one,
362 unsigned short c)
363{
364 text_t::iterator current = (last_plus_one != first) ? last_plus_one - 1 : first;
365 while (current != first) {
366 if (*current == c) break;
367 --current;
368 }
369 if (current == first) {
370 if (*current == c) return current;
371 return last_plus_one;
372 }
373
374 return current;
375}
376
377text_t::const_iterator findword (text_t::const_iterator first,
378 text_t::const_iterator last,
379 const text_t& word)
380{
381 text_t::const_iterator word_begin = word.begin();
382 text_t::const_iterator word_end = word.end();
383
384 while (first != last)
385 {
386 text_t::const_iterator char_match = first;
387 text_t::const_iterator word_here = word_begin;
388 while (word_here != word_end && char_match != last)
389 {
390 if (*char_match != *word_here)
391 {
392 break;
393 }
394 ++char_match;
395 ++word_here;
396 }
397 if (word_here==word_end)
398 {
399 return first;
400 }
401 ++first;
402 }
403 return last; // get to here only if there is no match
404}
405
406text_t::iterator findword (text_t::iterator first,
407 text_t::iterator last,
408 const text_t& word)
409{
410 text_t::const_iterator word_begin = word.begin();
411 text_t::const_iterator word_end = word.end();
412
413 while (first != last)
414 {
415 text_t::iterator char_match = first;
416 text_t::const_iterator word_here = word_begin;
417 while (word_here != word_end && char_match != last)
418 {
419 if (*char_match != *word_here)
420 {
421 break;
422 }
423 ++char_match;
424 ++word_here;
425 }
426 if (word_here==word_end)
427 {
428 return first;
429 }
430 ++first;
431 }
432 return last; // get to here only if there is no match
433}
434
435// get a string up to the next delimiter (which is skipped)
436text_t::const_iterator getdelimitstr (text_t::const_iterator first,
437 text_t::const_iterator last,
438 unsigned short c, text_t &outstr)
439{
440 text_t::const_iterator here = first;
441 here = findchar (first, last, c);
442 outstr.clear();
443 outstr.appendrange (first, here);
444 if (here != last) ++here; // skip c
445 return here;
446}
447
448text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
449 unsigned short c, text_t &outstr)
450{
451 text_t::iterator here = first;
452 here = findchar (first, last, c);
453 outstr.clear();
454 outstr.appendrange (first, here);
455 if (here != last) ++here; // skip c
456 return here;
457}
458
459text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
460 text_t w, text_t &outstr)
461{
462 text_t::const_iterator here = first;
463 here = findword (first, last, w);
464 outstr.clear();
465 outstr.appendrange (first, here);
466 if (here != last) here += w.size(); // skip w
467 return here;
468}
469
470// split a string with a character
471void splitchar (text_t::const_iterator first, text_t::const_iterator last,
472 unsigned short c, text_tset &outlist)
473{
474 outlist.erase(outlist.begin(), outlist.end());
475
476 text_t t;
477
478 while (first != last)
479 {
480 first = getdelimitstr (first, last, c, t);
481 outlist.insert (t);
482 }
483}
484
485void splitchar (text_t::const_iterator first, text_t::const_iterator last,
486 unsigned short c, text_tlist &outlist)
487{
488 outlist.erase(outlist.begin(), outlist.end());
489
490 text_t t;
491
492 while (first != last)
493 {
494 first = getdelimitstr (first, last, c, t);
495 outlist.push_back (t);
496 }
497}
498
499void splitchar (text_t::const_iterator first, text_t::const_iterator last,
500 unsigned short c, text_tarray &outlist)
501{
502 outlist.erase(outlist.begin(), outlist.end());
503
504 text_t t;
505
506 while (first != last)
507 {
508 first = getdelimitstr (first, last, c, t);
509 outlist.push_back (t);
510 }
511}
512
513void splitword (text_t::const_iterator first, text_t::const_iterator last,
514 text_t w, text_tlist &outlist)
515{
516 outlist.erase(outlist.begin(), outlist.end());
517
518 text_t t;
519
520 while (first != last)
521 {
522 first = getdelimitstr (first, last, w, t);
523 outlist.push_back (t);
524 }
525}
526
527// join a string using a character
528void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
529{
530 outtext.clear ();
531
532 text_tset::const_iterator here = inlist.begin ();
533 text_tset::const_iterator end = inlist.end ();
534
535 if (here != end) {
536 outtext += *here; ++here;
537 while (here != end) {
538 outtext.push_back (c);
539 outtext += *here;
540 ++here;
541 }
542 }
543}
544
545void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
546{
547 outtext.clear ();
548
549 text_tlist::const_iterator here = inlist.begin ();
550 text_tlist::const_iterator end = inlist.end ();
551 if (here != end) {
552 outtext += *here; ++here;
553 while (here != end) {
554 outtext.push_back (c);
555 outtext += *here;
556 ++here;
557 }
558 }
559}
560
561void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
562{
563 outtext.clear ();
564
565 text_tarray::const_iterator here = inlist.begin ();
566 text_tarray::const_iterator end = inlist.end ();
567 if (here != end) {
568 outtext += *here; ++here;
569 while (here != end) {
570 outtext.push_back (c);
571 outtext += *here;
572 ++here;
573 }
574 }
575}
576
577void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
578{
579 outtext.clear ();
580
581 text_tlist::const_iterator here = inlist.begin ();
582 text_tlist::const_iterator end = inlist.end ();
583 if (here != end) {
584 outtext += *here; ++here;
585 while (here != end) {
586 outtext += c;
587 outtext += *here;
588 ++here;
589 }
590 }
591}
592
593void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
594{
595 outtext.clear ();
596
597 text_tset::const_iterator here = inlist.begin ();
598 text_tset::const_iterator end = inlist.end ();
599 if (here != end) {
600 outtext += *here; ++here;
601 while (here != end) {
602 outtext += c;
603 outtext += *here;
604 ++here;
605 }
606 }
607}
608
609void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
610{
611 outtext.clear ();
612
613 text_tarray::const_iterator here = inlist.begin ();
614 text_tarray::const_iterator end = inlist.end ();
615 if (here != end) {
616 outtext += *here; ++here;
617 while (here != end) {
618 outtext += c;
619 outtext += *here;
620 ++here;
621 }
622 }
623}
624
625// count the occurances of a character within a range
626int countchar (text_t::const_iterator first, text_t::const_iterator last,
627 unsigned short c)
628{
629 int count = 0;
630 while (first != last) {
631 if (*first == c) ++count;
632 ++first;
633 }
634 return count;
635}
636
637// return a substring of string from first up to but not including last
638text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
639
640 text_t substr; substr.reserve(last - first + 2);
641 while (first != last) {
642 substr.push_back(*first);
643 ++first;
644 }
645 return substr;
646}
647
648
649// convert to lowercase
650void lc (text_t::iterator first, text_t::iterator last) {
651 while (first != last) {
652 *first = unicode_tolower(*first);
653 ++first;
654 }
655}
656
657// convert to uppercase
658void uc (text_t::iterator first, text_t::iterator last) {
659 while (first != last) {
660 *first = unicode_toupper(*first);
661 ++first;
662 }
663}
664
665
666// checks to see if it is a number (i.e. contains only 0-9)
667bool is_number (const text_t &text) {
668
669 text_t::const_iterator here = text.begin();
670 text_t::const_iterator end = text.end();
671
672 while (here != end) {
673 if ((*here!='0') && (*here!='1') && (*here!='2') &&
674 (*here!='3') && (*here!='4') && (*here!='5') &&
675 (*here!='6') && (*here!='7') && (*here!='8') &&
676 (*here!='9')) return false;
677 ++here;
678 }
679 return true;
680}
681
682
683// checks to see if the text has any letters or digits
684bool has_unicode_letdig (const text_t &text) {
685 if (text.empty()) return false;
686
687 text_t::const_iterator here = text.begin();
688 text_t::const_iterator end = text.end();
689 while (here != end) {
690 if (is_unicode_letdig (*here)) return true;
691 ++here;
692 }
693
694 return false;
695}
696
697// checks to see if a text_t starts with the specified prefix
698bool starts_with(const text_t& text, const text_t& prefix) {
699 if (prefix.empty()) return true;
700 if (text.empty() || text.size()<prefix.size()) return false;
701 text_t substring = substr(text.begin(), text.begin()+prefix.size());
702 return substring == prefix;
703}
704// checks to see if a text_t ends with the specified suffix
705bool ends_with(const text_t& text, const text_t& suffix) {
706 if (suffix.empty()) return true;
707 if (text.empty() || text.size() < suffix.size()) return false;
708 text_t substring = substr(text.end()-suffix.size(),text.end());
709 return substring == suffix;
710
711}
712
713
714////////////////////////////////////
715// convertclass methods
716////////////////////////////////////
717
718// conversion classes used for getting information in to and out of
719// the text_t class.
720
721convertclass::convertclass ()
722{
723 // nothing to do
724}
725
726void convertclass::reset ()
727{
728 // nothing to do
729}
730
731
732////////////////////////////////////
733// inconvertclass methods
734////////////////////////////////////
735
736// convert from a char stream to the text_t class.
737// The default version assumes the input is an ASCII
738// character array.
739
740inconvertclass::inconvertclass ()
741{
742 start = NULL;
743 len = 0;
744}
745
746
747void inconvertclass::reset ()
748{
749 start = NULL;
750 len = 0;
751}
752
753void inconvertclass::setinput (char *thestart, size_t thelen)
754{
755 start = thestart;
756 len = thelen;
757}
758
759void inconvertclass::convert (text_t &output, status_t &status)
760{
761 output.clear();
762
763 if (start == NULL || len == 0)
764 {
765 status = finished;
766 return;
767 }
768
769 if (output.capacity() < len + 2)
770 output.reserve(len + 2);
771
772 // don't want any funny sign conversions happening
773 unsigned char *here = (unsigned char *)start;
774 while (len > 0)
775 {
776 output.push_back (*here); // append this character
777 ++here;
778 --len;
779 }
780
781 start = (char *)here; // save current position
782 status = finished;
783}
784
785// will treat the text_t as a 8-bit string and convert
786// it to a 16-bit string using the about convert method.
787text_t inconvertclass::convert (const text_t &t) {
788 text_t out;
789 text_t tmpout;
790 status_t status;
791 text_t::const_iterator here = t.begin();
792 text_t::const_iterator end = t.end();
793 unsigned char cbuf[256];
794 size_t cbuflen = 0;
795
796 out.clear();
797 if (out.capacity() < t.size() + 2)
798 out.reserve(t.size() + 2);
799 while (here != end) {
800 while (here != end && cbuflen < 256) {
801 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
802 ++here;
803 }
804
805 if (cbuflen > 0) {
806 setinput ((char *)cbuf, cbuflen);
807 status = unfinished;
808 while (status == unfinished) {
809 convert (tmpout, status);
810 out += tmpout;
811 }
812 cbuflen = 0;
813 }
814 }
815
816 out.setencoding (0); // unicode
817
818 return out;
819}
820
821// an instance of the default inconvertclass to do simple
822// conversions. Note that any functions that use this are
823// not reentrant. If a function needs to be reentrant it
824// should declare its own instance.
825inconvertclass ascii2text_t;
826
827
828////////////////////////////////////
829// outconvertclass methods
830////////////////////////////////////
831
832// Convert from a text_t class to a char stream
833// This default version assumes the output is an ASCII
834// character array. If you set the output stream you
835// can use this class to output to a stream using the
836// << operator. The << operator can also be conveniently
837// used to set the output stream by doing something like
838//
839// cout << text_t2ascii << text_tstr << anothertext_tstr;
840//
841outconvertclass::outconvertclass ()
842{
843 input = NULL;
844 outs = NULL;
845}
846
847void outconvertclass::reset ()
848{
849 input = NULL;
850 outs = NULL;
851}
852
853void outconvertclass::setinput (text_t *theinput)
854{
855 input = theinput;
856 if (input != NULL) texthere = input->begin();
857}
858
859void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
860{
861 input = theinput;
862 texthere = thetexthere;
863}
864
865void outconvertclass::convert (char *output, size_t maxlen,
866 size_t &len, status_t &status)
867{
868 if (input == NULL || output == NULL)
869 {
870 status = finished;
871 return;
872 }
873
874 // don't want any funny sign conversions happening
875 unsigned char *uoutput = (unsigned char *)output;
876 text_t::iterator textend = input->end();
877 len = 0;
878 while ((len < maxlen) && (texthere != textend))
879 {
880 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
881 else {
882 // put a space or a question mark depending on what
883 // the character is. Question marks tell the user that
884 // they are missing some information.
885 if (is_unicode_space (*texthere)) *uoutput = ' ';
886 else *uoutput = '?';
887 }
888 ++uoutput;
889 ++len;
890 ++texthere;
891 }
892
893 if (texthere == textend) status = finished;
894 else status = unfinished;
895}
896
897// will convert the 16-bit string to a 8-bit stream
898// and place the result in a text_t. This method uses
899// the above convert function.
900text_t outconvertclass::convert (const text_t &t) {
901 text_t out;
902 unsigned char cbuf[256];
903 size_t cbuflen = 0;
904 status_t status = unfinished;
905
906 out.clear();
907 if (out.capacity() < t.size() + 2)
908 out.reserve(t.size() + 2);
909 setinput ((text_t *)&t); // discard constant
910 while (status == unfinished) {
911 convert ((char *)cbuf, 256, cbuflen, status);
912 out.appendcarr ((char *)cbuf, cbuflen);
913 }
914
915 out.setencoding (1); // other encoding
916
917 return out;
918}
919
920
921void outconvertclass::setostream (ostream *theouts)
922{
923 outs = theouts;
924}
925
926ostream *outconvertclass::getostream ()
927{
928 return outs;
929}
930
931
932
933
934// an instance of the default outconvertclass to do simple
935// conversions
936outconvertclass text_t2ascii;
937
938
939
940// stream operators for the output class
941
942outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
943{
944 outconverter.setostream(&theouts);
945 return outconverter;
946}
947
948
949#define STREAMBUFSIZE 256
950outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
951{
952 ostream *outstream = outconverter.getostream();
953
954 if (outstream == NULL) return outconverter;
955
956 char outbuf[STREAMBUFSIZE];
957 size_t len;
958 outconvertclass::status_t status = outconvertclass::unfinished;
959
960 // assume that there is no data needing converting
961 // left in the converter
962 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
963
964 while (status == outconvertclass::unfinished)
965 {
966 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
967 if (len > 0) outstream->write(outbuf, len);
968 }
969
970 return outconverter;
971}
Note: See TracBrowser for help on using the repository browser.