source: gsdl/trunk/common-src/src/lib/text_t.cpp@ 18880

Last change on this file since 18880 was 18880, checked in by oranfry, 13 years ago

being more explicit about included libraries for the sake of newer compilers

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.1 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 18880 2009-04-03 02:03:16Z oranfry $
25 *
26 *********************************************************************/
27
28#include "text_t.h"
29
30#if defined(GSDL_USE_OBJECTSPACE)
31# include <ospace\std\algorithm>
32#elif defined(GSDL_USE_STL_H)
33# if defined(GSDL_USE_ALGO_H)
34# include <algo.h>
35# else
36# include <algorithm.h>
37# endif
38#else
39# include <algorithm>
40#endif
41
42#ifdef HAVE_CONFIG_H
43# ifdef __WIN32__
44# include "win32cfg.h"
45# else
46# include "config.h"
47# endif
48#endif
49
50#include <cstring>
51
52#include "unitool.h"
53
54const text_t g_EmptyText("");
55
56////////////////////////////////////
57// text_t methods
58////////////////////////////////////
59
60// new stream converter ...
61ostream& operator<< (ostream &o, const text_t &text)
62{
63 text_t::const_iterator ithere = text.begin();
64 text_t::const_iterator itend = text.end();
65
66 while (ithere != itend)
67 {
68 if (*ithere < 256)
69 {
70 o << (unsigned char)(*ithere);
71 }
72 else
73 {
74 // put a space or a question mark depending on what
75 // the character is. Question marks tell the user that
76 // they are missing some information.
77 if (is_unicode_space (*ithere))
78 o << ' ';
79 else
80 o << '?';
81 }
82 ++ithere;
83 }
84
85 return o;
86}
87
88text_t::text_t ()
89{
90 setencoding(0);
91 clear ();
92}
93
94text_t::text_t (int i)
95{
96 setencoding(0);
97 clear ();
98 appendint (i);
99}
100
101text_t::text_t (const char *s)
102{
103 setencoding(0);
104 clear ();
105 appendcstr (s);
106}
107
108text_t::text_t (const char *s, size_type nLength)
109{
110 setencoding(0);
111 clear ();
112 appendcarr(s, nLength);
113}
114
115
116void text_t::append (const text_t &t)
117{
118 text.insert(text.end(), t.begin(), t.end());
119}
120
121void text_t::appendrange (iterator first, iterator last)
122{
123 text.insert(text.end(), first, last);
124}
125
126void text_t::appendrange (const_iterator first, const_iterator last)
127{
128 text.insert(text.end(), first, last);
129}
130
131void text_t::appendint (int i)
132{
133 // deal with zeros and negatives
134 if (i == 0)
135 {
136 text.push_back('0');
137 return;
138 }
139 else if (i < 0)
140 {
141 text.push_back('-');
142 i *= -1;
143 }
144
145 // get a buffer for the conversion
146 int maxbuflen = sizeof(int)*3;
147 char *buf = new char[maxbuflen];
148 int len = 0;
149
150 // get the number in reverse
151 while (i > 0)
152 {
153 buf[len++] = '0'+ (i%10);
154 i = i/10;
155 }
156
157 // reverse the number
158 while (len > 0)
159 {
160 text.push_back(buf[--len]);
161 }
162
163 delete []buf;
164}
165
166int text_t::getint () const
167{
168 int i = 0;
169 int mult = 1; // become -1 for negative numbers
170
171 const_iterator here = text.begin();
172 const_iterator end = text.end();
173
174 // do plus and minus signs
175 if (here != end)
176 {
177 if (*here == '-')
178 {
179 mult = -1;
180 ++here;
181 }
182 else if (*here == '+')
183 {
184 mult = 1;
185 ++here;
186 }
187 }
188
189 // deal with the number
190 while ((here != end) && (*here >= '0') && (*here <= '9'))
191 {
192 i = 10*i + (*here - '0');
193 ++here;
194 }
195
196 i *= mult;
197 return i;
198}
199
200unsigned long text_t::getulong () const
201{
202 unsigned long i = 0;
203
204 const_iterator here = text.begin();
205 const_iterator end = text.end();
206
207 while ((here != end) && (*here >= '0') && (*here <= '9'))
208 {
209 i = 10*i + (*here - '0');
210 ++here;
211 }
212
213 return i;
214}
215
216void text_t::appendcarr (const char *s, size_type len)
217{
218 unsigned char *us = (unsigned char *)s;
219 if (text.capacity() < (text.size() + len + 2)) {
220 text.reserve(text.size() + len + 2);
221 }
222
223 while (len > 0)
224 {
225 text.push_back (*us); // append this character
226 ++us;
227 --len;
228 }
229}
230
231void text_t::appendcstr (const char *s)
232{
233 size_t len = strlen(s);
234 if (text.capacity() < (text.size() + len + 2)) {
235 text.reserve(text.size() + len + 2);
236 }
237
238 unsigned char *us = (unsigned char *)s;
239 while (*us != '\0')
240 {
241 text.push_back (*us); // append this character
242 ++us;
243 }
244}
245
246
247// strings returned from getcarr and getcstr become the callers
248// responsibility and should be deallocated with "delete []"
249
250char *text_t::getcarr(size_type &len) const
251{
252 unsigned char *cstr = new unsigned char[size()];
253 len = 0;
254
255 const_iterator ithere = begin();
256 const_iterator itend = end();
257 while (ithere != itend)
258 {
259 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
260 else {
261 // put a space or a question mark depending on what
262 // the character is. Question marks tell the user that
263 // they are missing some information.
264 if (is_unicode_space (*ithere)) cstr[len] = ' ';
265 else cstr[len] = '?';
266 }
267 ++len;
268 ++ithere;
269 }
270
271 return (char *)cstr;
272}
273
274char *text_t::getcstr() const
275{
276 unsigned char *cstr = new unsigned char[size() + 1];
277 const_iterator ithere = begin();
278 const_iterator itend = end();
279 int len = 0;
280
281 while (ithere != itend)
282 {
283 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
284 else {
285 // put a space or a question mark depending on what
286 // the character is. Question marks tell the user that
287 // they are missing some information.
288 if (is_unicode_space (*ithere)) cstr[len] = ' ';
289 else cstr[len] = '?';
290 }
291 ++len;
292 ++ithere;
293 }
294
295 cstr[len] = '\0';
296
297 return (char *)cstr;
298}
299
300
301int text_t::replace(text_t toreplace, text_t replacement)
302{
303 // Get the beginning and end of the current text
304 text_t::iterator text_begin = text.begin(), text_end = text.end();
305 int count = 0;
306 text_t new_text, temp_text;
307
308 // Loop through and grab the text off the end
309 while (text_begin < text_end)
310 {
311 // Find where the next toreplace is
312 text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);
313
314 // We've found a match
315 if (next_toreplace != text_end)
316 {
317 new_text.append(substr(text_begin, next_toreplace));
318 new_text.append(replacement);
319 count++;
320 text_begin = next_toreplace + toreplace.size();
321 }
322 // We haven't found a match
323 else
324 {
325 new_text.append(substr(text_begin, text_end));
326 text_begin = text_end;
327 }
328 }
329
330 text.clear();
331 text = new_text.text_as_usvector();
332 return count;
333}
334
335
336// general functions which work on text_ts
337
338// find a character within a range
339text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
340 unsigned short c)
341{
342 while (first != last)
343 {
344 if (*first == c) break;
345 ++first;
346 }
347 return first;
348}
349
350text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
351 unsigned short c)
352{
353 while (first != last)
354 {
355 if (*first == c) break;
356 ++first;
357 }
358 return first;
359}
360
361text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one,
362 unsigned short c)
363{
364 text_t::iterator current = (last_plus_one != first) ? last_plus_one - 1 : first;
365 while (current != first) {
366 if (*current == c) break;
367 --current;
368 }
369 if (current == first) {
370 if (*current == c) return current;
371 return last_plus_one;
372 }
373
374 return current;
375}
376
377text_t::const_iterator findword (text_t::const_iterator first,
378 text_t::const_iterator last,
379 const text_t& word)
380{
381 text_t::const_iterator word_begin = word.begin();
382 text_t::const_iterator word_end = word.end();
383
384 while (first != last)
385 {
386 text_t::const_iterator char_match = first;
387 text_t::const_iterator word_here = word_begin;
388 while (word_here!=word_end)
389 {
390 if (*char_match != *word_here)
391 {
392 break;
393 }
394 ++char_match;
395 ++word_here;
396 }
397 if (word_here==word_end)
398 {
399 return first;
400 }
401 ++first;
402 }
403 return last; // get to here only if there is no match
404}
405
406text_t::iterator findword (text_t::iterator first,
407 text_t::iterator last,
408 const text_t& word)
409{
410 text_t::const_iterator word_begin = word.begin();
411 text_t::const_iterator word_end = word.end();
412
413 while (first != last)
414 {
415 text_t::iterator char_match = first;
416 text_t::const_iterator word_here = word_begin;
417 while (word_here!=word_end)
418 {
419 if (*char_match != *word_here)
420 {
421 break;
422 }
423 ++char_match;
424 ++word_here;
425 }
426 if (word_here==word_end)
427 {
428 return first;
429 }
430 ++first;
431 }
432 return last; // get to here only if there is no match
433}
434
435// get a string up to the next delimiter (which is skipped)
436text_t::const_iterator getdelimitstr (text_t::const_iterator first,
437 text_t::const_iterator last,
438 unsigned short c, text_t &outstr)
439{
440 text_t::const_iterator here = first;
441 here = findchar (first, last, c);
442 outstr.clear();
443 outstr.appendrange (first, here);
444 if (here != last) ++here; // skip c
445 return here;
446}
447
448text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
449 unsigned short c, text_t &outstr)
450{
451 text_t::iterator here = first;
452 here = findchar (first, last, c);
453 outstr.clear();
454 outstr.appendrange (first, here);
455 if (here != last) ++here; // skip c
456 return here;
457}
458
459text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
460 text_t w, text_t &outstr)
461{
462 text_t::const_iterator here = first;
463 here = findword (first, last, w);
464 outstr.clear();
465 outstr.appendrange (first, here);
466 if (here != last) here += w.size(); // skip w
467 return here;
468}
469
470// split a string with a character
471void splitchar (text_t::const_iterator first, text_t::const_iterator last,
472 unsigned short c, text_tset &outlist)
473{
474 outlist.erase(outlist.begin(), outlist.end());
475
476 text_t t;
477
478 while (first != last)
479 {
480 first = getdelimitstr (first, last, c, t);
481 outlist.insert (t);
482 }
483}
484
485void splitchar (text_t::const_iterator first, text_t::const_iterator last,
486 unsigned short c, text_tlist &outlist)
487{
488 outlist.erase(outlist.begin(), outlist.end());
489
490 text_t t;
491
492 while (first != last)
493 {
494 first = getdelimitstr (first, last, c, t);
495 outlist.push_back (t);
496 }
497}
498
499void splitchar (text_t::const_iterator first, text_t::const_iterator last,
500 unsigned short c, text_tarray &outlist)
501{
502 outlist.erase(outlist.begin(), outlist.end());
503
504 text_t t;
505
506 while (first != last)
507 {
508 first = getdelimitstr (first, last, c, t);
509 outlist.push_back (t);
510 }
511}
512
513void splitword (text_t::const_iterator first, text_t::const_iterator last,
514 text_t w, text_tlist &outlist)
515{
516 outlist.erase(outlist.begin(), outlist.end());
517
518 text_t t;
519
520 while (first != last)
521 {
522 first = getdelimitstr (first, last, w, t);
523 outlist.push_back (t);
524 }
525}
526
527// join a string using a character
528void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
529{
530 outtext.clear ();
531
532 text_tset::const_iterator here = inlist.begin ();
533 text_tset::const_iterator end = inlist.end ();
534
535 if (here != end) {
536 outtext += *here; ++here;
537 while (here != end) {
538 outtext.push_back (c);
539 outtext += *here;
540 ++here;
541 }
542 }
543}
544
545void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
546{
547 outtext.clear ();
548
549 text_tlist::const_iterator here = inlist.begin ();
550 text_tlist::const_iterator end = inlist.end ();
551 if (here != end) {
552 outtext += *here; ++here;
553 while (here != end) {
554 outtext.push_back (c);
555 outtext += *here;
556 ++here;
557 }
558 }
559}
560
561void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
562{
563 outtext.clear ();
564
565 text_tarray::const_iterator here = inlist.begin ();
566 text_tarray::const_iterator end = inlist.end ();
567 if (here != end) {
568 outtext += *here; ++here;
569 while (here != end) {
570 outtext.push_back (c);
571 outtext += *here;
572 ++here;
573 }
574 }
575}
576
577void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
578{
579 outtext.clear ();
580
581 text_tlist::const_iterator here = inlist.begin ();
582 text_tlist::const_iterator end = inlist.end ();
583 if (here != end) {
584 outtext += *here; ++here;
585 while (here != end) {
586 outtext += c;
587 outtext += *here;
588 ++here;
589 }
590 }
591}
592
593void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
594{
595 outtext.clear ();
596
597 text_tset::const_iterator here = inlist.begin ();
598 text_tset::const_iterator end = inlist.end ();
599 if (here != end) {
600 outtext += *here; ++here;
601 while (here != end) {
602 outtext += c;
603 outtext += *here;
604 ++here;
605 }
606 }
607}
608
609void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
610{
611 outtext.clear ();
612
613 text_tarray::const_iterator here = inlist.begin ();
614 text_tarray::const_iterator end = inlist.end ();
615 if (here != end) {
616 outtext += *here; ++here;
617 while (here != end) {
618 outtext += c;
619 outtext += *here;
620 ++here;
621 }
622 }
623}
624
625// count the occurances of a character within a range
626int countchar (text_t::const_iterator first, text_t::const_iterator last,
627 unsigned short c)
628{
629 int count = 0;
630 while (first != last) {
631 if (*first == c) ++count;
632 ++first;
633 }
634 return count;
635}
636
637// return a substring of string from first up to but not including last
638text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
639
640 text_t substr; substr.reserve(last - first + 2);
641 while (first != last) {
642 substr.push_back(*first);
643 ++first;
644 }
645 return substr;
646}
647
648
649// convert to lowercase
650void lc (text_t::iterator first, text_t::iterator last) {
651 while (first != last) {
652 *first = unicode_tolower(*first);
653 ++first;
654 }
655}
656
657// convert to uppercase
658void uc (text_t::iterator first, text_t::iterator last) {
659 while (first != last) {
660 *first = unicode_toupper(*first);
661 ++first;
662 }
663}
664
665
666// checks to see if it is a number (i.e. contains only 0-9)
667bool is_number (const text_t &text) {
668
669 text_t::const_iterator here = text.begin();
670 text_t::const_iterator end = text.end();
671
672 while (here != end) {
673 if ((*here!='0') && (*here!='1') && (*here!='2') &&
674 (*here!='3') && (*here!='4') && (*here!='5') &&
675 (*here!='6') && (*here!='7') && (*here!='8') &&
676 (*here!='9')) return false;
677 ++here;
678 }
679 return true;
680}
681
682
683// checks to see if the text has any letters or digits
684bool has_unicode_letdig (const text_t &text) {
685 if (text.empty()) return false;
686
687 text_t::const_iterator here = text.begin();
688 text_t::const_iterator end = text.end();
689 while (here != end) {
690 if (is_unicode_letdig (*here)) return true;
691 ++here;
692 }
693
694 return false;
695}
696
697// checks to see if a text_t starts with the specified prefix
698bool starts_with(const text_t& text, const text_t& prefix) {
699 if (prefix.empty()) return true;
700 if (text.empty() || text.size()<prefix.size()) return false;
701 text_t substring = substr(text.begin(), text.begin()+prefix.size());
702 return substring == prefix;
703}
704// checks to see if a text_t ends with the specified suffix
705bool ends_with(const text_t& text, const text_t& suffix) {
706 if (suffix.empty()) return true;
707 if (text.empty() || text.size() < suffix.size()) return false;
708 text_t substring = substr(text.end()-suffix.size(),text.end());
709 return substring == suffix;
710
711}
712
713
714////////////////////////////////////
715// convertclass methods
716////////////////////////////////////
717
718// conversion classes used for getting information in to and out of
719// the text_t class.
720
721convertclass::convertclass ()
722{
723 // nothing to do
724}
725
726void convertclass::reset ()
727{
728 // nothing to do
729}
730
731
732////////////////////////////////////
733// inconvertclass methods
734////////////////////////////////////
735
// convert from a char stream to the text_t class.
// The default version assumes the input is an ASCII
// character array.
739
740inconvertclass::inconvertclass ()
741{
742 start = NULL;
743 len = 0;
744}
745
746
747void inconvertclass::reset ()
748{
749 start = NULL;
750 len = 0;
751}
752
753void inconvertclass::setinput (char *thestart, size_t thelen)
754{
755 start = thestart;
756 len = thelen;
757}
758
759void inconvertclass::convert (text_t &output, status_t &status)
760{
761 output.clear();
762
763 if (start == NULL || len == 0)
764 {
765 status = finished;
766 return;
767 }
768
769 if (output.capacity() < len + 2)
770 output.reserve(len + 2);
771
772 // don't want any funny sign conversions happening
773 unsigned char *here = (unsigned char *)start;
774 while (len > 0)
775 {
776 output.push_back (*here); // append this character
777 ++here;
778 --len;
779 }
780
781 start = (char *)here; // save current position
782 status = finished;
783}
784
785// will treat the text_t as a 8-bit string and convert
786// it to a 16-bit string using the about convert method.
787text_t inconvertclass::convert (const text_t &t) {
788 text_t out;
789 text_t tmpout;
790 status_t status;
791 text_t::const_iterator here = t.begin();
792 text_t::const_iterator end = t.end();
793 unsigned char cbuf[256];
794 size_t cbuflen = 0;
795
796 out.clear();
797 if (out.capacity() < t.size() + 2)
798 out.reserve(t.size() + 2);
799 while (here != end) {
800 while (here != end && cbuflen < 256) {
801 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
802 ++here;
803 }
804
805 if (cbuflen > 0) {
806 setinput ((char *)cbuf, cbuflen);
807 status = unfinished;
808 while (status == unfinished) {
809 convert (tmpout, status);
810 out += tmpout;
811 }
812 cbuflen = 0;
813 }
814 }
815
816 out.setencoding (0); // unicode
817
818 return out;
819}
820
821// an instance of the default inconvertclass to do simple
822// conversions. Note that any functions that use this are
823// not reentrant. If a function needs to be reentrant it
824// should declare its own instance.
825inconvertclass ascii2text_t;
826
827
828////////////////////////////////////
829// outconvertclass methods
830////////////////////////////////////
831
// Convert from a text_t class to a char stream.
// This default version assumes the output is an ASCII
// character array. If you set the output stream you
// can use this class to output to a stream using the
// << operator. The << operator can also be conveniently
// used to set the output stream by doing something like
//
// cout << text_t2ascii << text_tstr << anothertext_tstr;
//
840//
841outconvertclass::outconvertclass ()
842{
843 input = NULL;
844 outs = NULL;
845}
846
847void outconvertclass::reset ()
848{
849 input = NULL;
850 outs = NULL;
851}
852
853void outconvertclass::setinput (text_t *theinput)
854{
855 input = theinput;
856 if (input != NULL) texthere = input->begin();
857}
858
859void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
860{
861 input = theinput;
862 texthere = thetexthere;
863}
864
865void outconvertclass::convert (char *output, size_t maxlen,
866 size_t &len, status_t &status)
867{
868 if (input == NULL || output == NULL)
869 {
870 status = finished;
871 return;
872 }
873
874 // don't want any funny sign conversions happening
875 unsigned char *uoutput = (unsigned char *)output;
876 text_t::iterator textend = input->end();
877 len = 0;
878 while ((len < maxlen) && (texthere != textend))
879 {
880 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
881 else {
882 // put a space or a question mark depending on what
883 // the character is. Question marks tell the user that
884 // they are missing some information.
885 if (is_unicode_space (*texthere)) *uoutput = ' ';
886 else *uoutput = '?';
887 }
888 ++uoutput;
889 ++len;
890 ++texthere;
891 }
892
893 if (texthere == textend) status = finished;
894 else status = unfinished;
895}
896
897// will convert the 16-bit string to a 8-bit stream
898// and place the result in a text_t. This method uses
899// the above convert function.
900text_t outconvertclass::convert (const text_t &t) {
901 text_t out;
902 unsigned char cbuf[256];
903 size_t cbuflen = 0;
904 status_t status = unfinished;
905
906 out.clear();
907 if (out.capacity() < t.size() + 2)
908 out.reserve(t.size() + 2);
909 setinput ((text_t *)&t); // discard constant
910 while (status == unfinished) {
911 convert ((char *)cbuf, 256, cbuflen, status);
912 out.appendcarr ((char *)cbuf, cbuflen);
913 }
914
915 out.setencoding (1); // other encoding
916
917 return out;
918}
919
920
921void outconvertclass::setostream (ostream *theouts)
922{
923 outs = theouts;
924}
925
926ostream *outconvertclass::getostream ()
927{
928 return outs;
929}
930
931
932
933
934// an instance of the default outconvertclass to do simple
935// conversions
936outconvertclass text_t2ascii;
937
938
939
940// stream operators for the output class
941
942outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
943{
944 outconverter.setostream(&theouts);
945 return outconverter;
946}
947
948
949#define STREAMBUFSIZE 256
950outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
951{
952 ostream *outstream = outconverter.getostream();
953
954 if (outstream == NULL) return outconverter;
955
956 char outbuf[STREAMBUFSIZE];
957 size_t len;
958 outconvertclass::status_t status = outconvertclass::unfinished;
959
960 // assume that there is no data needing converting
961 // left in the converter
962 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
963
964 while (status == outconvertclass::unfinished)
965 {
966 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
967 if (len > 0) outstream->write(outbuf, len);
968 }
969
970 return outconverter;
971}
Note: See TracBrowser for help on using the repository browser.