root/gsdl/trunk/common-src/src/lib/text_t.cpp @ 18700

Revision 18700, 21.1 KB (checked in by kjdon, 11 years ago)

findlastchar was dereferencing the last iterator (end()), so now it decrements that before dereferencing. parameter name changed to last_plus_one to hopefully inform user of this.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id$
25 *
26 *********************************************************************/
27
28#include "text_t.h"
29
30#if defined(GSDL_USE_OBJECTSPACE)
31#  include <ospace\std\algorithm>
32#elif defined(GSDL_USE_STL_H)
33#  if defined(GSDL_USE_ALGO_H)
34#    include <algo.h>
35#  else
36#    include <algorithm.h>
37#  endif
38#else
39#  include <algorithm>
40#endif
41
42#ifdef HAVE_CONFIG_H
43# ifdef __WIN32__
44#  include "win32cfg.h"
45# else
46#  include "config.h"
47# endif
48#endif
49
50
51#include "unitool.h"
52
53const text_t g_EmptyText("");
54
55////////////////////////////////////
56// text_t methods
57////////////////////////////////////
58
59// new stream converter ...
60ostream& operator<< (ostream &o, const text_t &text)
61{
62  text_t::const_iterator ithere = text.begin();
63  text_t::const_iterator itend = text.end();
64
65  while (ithere != itend)
66    {
67      if (*ithere < 256)
68    {
69      o << (unsigned char)(*ithere);
70    }
71      else
72    {
73    // put a space or a question mark depending on what
74    // the character is. Question marks tell the user that
75    // they are missing some information.
76    if (is_unicode_space (*ithere))
77      o << ' ';
78    else
79      o << '?';
80      }
81      ++ithere;
82    }
83
84  return o;
85}
86
87text_t::text_t ()
88{
89  setencoding(0);
90  clear ();
91}
92
93text_t::text_t (int i)
94{
95  setencoding(0);
96  clear ();
97  appendint (i);
98}
99
100text_t::text_t (const char *s)
101{
102  setencoding(0);
103  clear ();
104  appendcstr (s);
105}
106
107text_t::text_t (const char *s, size_type nLength)
108{
109  setencoding(0);
110  clear ();
111  appendcarr(s, nLength);
112}
113
114
115void text_t::append (const text_t &t)
116{
117  text.insert(text.end(), t.begin(), t.end());
118}
119
120void text_t::appendrange (iterator first, iterator last)
121{
122  text.insert(text.end(), first, last);
123}
124
125void text_t::appendrange (const_iterator first, const_iterator last)
126{
127  text.insert(text.end(), first, last);
128}
129
130void text_t::appendint (int i)
131{
132  // deal with zeros and negatives
133  if (i == 0)
134    {
135      text.push_back('0');
136      return;
137    }
138  else if (i < 0)
139    {
140      text.push_back('-');
141      i *= -1;
142    }
143
144  // get a buffer for the conversion
145  int maxbuflen = sizeof(int)*3;
146  char *buf = new char[maxbuflen];
147  int len = 0;
148 
149  // get the number in reverse
150  while (i > 0)
151    {
152      buf[len++] = '0'+ (i%10);
153      i = i/10;
154    }
155
156  // reverse the number
157  while (len > 0)
158    {
159      text.push_back(buf[--len]);
160    }
161
162  delete []buf;
163}
164
165int text_t::getint () const
166{
167  int i = 0;
168  int mult = 1; // become -1 for negative numbers
169
170  const_iterator here = text.begin();
171  const_iterator end = text.end();
172 
173  // do plus and minus signs
174  if (here != end)
175    {
176      if (*here == '-')
177    {
178      mult = -1;
179      ++here;
180    }
181      else if (*here == '+')
182    {
183      mult = 1;
184      ++here;
185    }
186    }
187
188  // deal with the number
189  while ((here != end) && (*here >= '0') && (*here <= '9'))
190    {
191      i = 10*i + (*here - '0');
192      ++here;
193    }
194
195  i *= mult;
196  return i;
197}
198
199unsigned long text_t::getulong () const
200{
201  unsigned long i = 0;
202
203  const_iterator here = text.begin();
204  const_iterator end = text.end();
205
206  while ((here != end) && (*here >= '0') && (*here <= '9'))
207    {
208      i = 10*i + (*here - '0');
209      ++here;
210    }
211
212  return i;
213}
214
215void text_t::appendcarr (const char *s, size_type len)
216{
217  unsigned char *us = (unsigned char *)s;
218  if (text.capacity() < (text.size() + len + 2)) {
219    text.reserve(text.size() + len + 2);
220  }
221
222  while (len > 0)
223    {
224      text.push_back (*us); // append this character
225      ++us;
226      --len;
227    }
228}
229
230void text_t::appendcstr (const char *s)
231{
232  size_t len = strlen(s);
233  if (text.capacity() < (text.size() + len + 2)) {
234    text.reserve(text.size() + len + 2);
235  }
236 
237  unsigned char *us = (unsigned char *)s;
238  while (*us != '\0')
239    {
240      text.push_back (*us); // append this character
241      ++us;
242    }
243}
244
245
246// strings returned from getcarr and getcstr become the callers
247// responsibility and should be deallocated with "delete []"
248
249char *text_t::getcarr(size_type &len) const
250{
251  unsigned char *cstr = new unsigned char[size()];
252  len = 0;
253
254  const_iterator ithere = begin();
255  const_iterator itend = end();
256  while (ithere != itend)
257    {
258      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
259      else {
260    // put a space or a question mark depending on what
261    // the character is. Question marks tell the user that
262    // they are missing some information.
263    if (is_unicode_space (*ithere)) cstr[len] = ' ';
264    else cstr[len] = '?';
265      }
266      ++len;
267      ++ithere;
268    }
269
270  return (char *)cstr;
271}
272
273char *text_t::getcstr() const
274{
275  unsigned char *cstr = new unsigned char[size() + 1];
276  const_iterator ithere = begin();
277  const_iterator itend = end();
278  int len = 0;
279
280  while (ithere != itend)
281    {
282      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
283      else {
284    // put a space or a question mark depending on what
285    // the character is. Question marks tell the user that
286    // they are missing some information.
287    if (is_unicode_space (*ithere)) cstr[len] = ' ';
288    else cstr[len] = '?';
289      }
290      ++len;
291      ++ithere;
292    }
293
294  cstr[len] = '\0';
295
296  return (char *)cstr;
297}
298
299
300int text_t::replace(text_t toreplace, text_t replacement)
301{
302  // Get the beginning and end of the current text
303  text_t::iterator text_begin = text.begin(), text_end = text.end();
304  int count = 0;
305  text_t new_text, temp_text;
306
307  // Loop through and grab the text off the end
308  while (text_begin < text_end)
309  {
310    // Find where the next toreplace is
311    text_t::iterator next_toreplace = findword(text_begin, text_end, toreplace);
312
313    // We've found a match
314    if (next_toreplace != text_end)
315    {
316      new_text.append(substr(text_begin, next_toreplace));
317      new_text.append(replacement);
318      count++;
319      text_begin = next_toreplace + toreplace.size();
320    }
321    // We haven't found a match
322    else
323    {
324      new_text.append(substr(text_begin, text_end));
325      text_begin = text_end;
326    }
327  }
328
329  text.clear();
330  text = new_text.text_as_usvector();
331  return count;
332}
333
334
335// general functions which work on text_ts
336
337// find a character within a range
338text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
339                 unsigned short c)
340{
341  while (first != last)
342    {
343      if (*first == c) break;
344      ++first;
345    }
346  return first;
347}
348
349text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
350               unsigned short c)
351{
352  while (first != last)
353    {
354      if (*first == c) break;
355      ++first;
356    }
357  return first;
358}
359
360text_t::iterator findlastchar (text_t::iterator first, text_t::iterator last_plus_one,
361               unsigned short c)
362{
363  text_t::iterator current = last_plus_one - 1;
364  while (current != first) {
365    if (*current == c) break;
366    --current;
367  }
368  if (current == first) {
369    if (*current == c) return current;
370    return last_plus_one;
371  }
372
373  return current;
374}
375
376text_t::const_iterator findword (text_t::const_iterator first,
377                 text_t::const_iterator last,
378                 const text_t& word)
379{
380  text_t::const_iterator word_begin = word.begin();
381  text_t::const_iterator word_end = word.end();
382
383  while (first != last)
384    {
385      text_t::const_iterator char_match = first;
386      text_t::const_iterator word_here = word_begin;
387      while (word_here!=word_end)
388    {
389      if (*char_match != *word_here)
390        {
391          break;
392        }
393      ++char_match;
394      ++word_here;
395    }
396      if (word_here==word_end)
397    {
398      return first;
399    }
400      ++first;
401    }
402  return last; // get to here only if there is no match
403}
404
405text_t::iterator findword (text_t::iterator first,
406               text_t::iterator last,
407               const text_t& word)
408{
409  text_t::const_iterator word_begin = word.begin();
410  text_t::const_iterator word_end = word.end();
411
412  while (first != last)
413    {
414      text_t::iterator char_match = first;
415      text_t::const_iterator word_here = word_begin;
416      while (word_here!=word_end)
417    {
418      if (*char_match != *word_here)
419        {
420          break;
421        }
422      ++char_match;
423      ++word_here;
424    }
425      if (word_here==word_end)
426    {
427      return first;
428    }
429      ++first;
430    }
431  return last; // get to here only if there is no match
432}
433
434// get a string up to the next delimiter (which is skipped)
435text_t::const_iterator getdelimitstr (text_t::const_iterator first,
436                      text_t::const_iterator last,
437                      unsigned short c, text_t &outstr)
438{
439  text_t::const_iterator here = first;
440  here = findchar (first, last, c);
441  outstr.clear();
442  outstr.appendrange (first, here);
443  if (here != last) ++here; // skip c
444  return here;
445}
446
447text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
448                unsigned short c, text_t &outstr)
449{
450  text_t::iterator here = first;
451  here = findchar (first, last, c);
452  outstr.clear();
453  outstr.appendrange (first, here);
454  if (here != last) ++here; // skip c
455  return here;
456}
457
458text_t::const_iterator getdelimitstr (text_t::const_iterator first, text_t::const_iterator last,
459                      text_t w, text_t &outstr)
460{
461  text_t::const_iterator here = first;
462  here = findword (first, last, w);
463  outstr.clear();
464  outstr.appendrange (first, here);
465  if (here != last) here += w.size(); // skip w
466  return here;
467}
468
469// split a string with a character
470void splitchar (text_t::const_iterator first, text_t::const_iterator last,
471        unsigned short c, text_tset &outlist)
472{
473  outlist.erase(outlist.begin(), outlist.end());
474
475  text_t t;
476
477  while (first != last)
478    {
479      first = getdelimitstr (first, last, c, t);
480      outlist.insert (t);
481    }
482}
483
484void splitchar (text_t::const_iterator first, text_t::const_iterator last,
485        unsigned short c, text_tlist &outlist)
486{
487  outlist.erase(outlist.begin(), outlist.end());
488
489  text_t t;
490
491  while (first != last)
492    {
493      first = getdelimitstr (first, last, c, t);
494      outlist.push_back (t);
495    }
496}
497
498void splitchar (text_t::const_iterator first, text_t::const_iterator last,
499        unsigned short c, text_tarray &outlist)
500{
501  outlist.erase(outlist.begin(), outlist.end());
502
503  text_t t;
504
505  while (first != last)
506    {
507      first = getdelimitstr (first, last, c, t);
508      outlist.push_back (t);
509    }
510}
511
512void splitword (text_t::const_iterator first, text_t::const_iterator last,
513        text_t w, text_tlist &outlist)
514{
515  outlist.erase(outlist.begin(), outlist.end());
516
517  text_t t;
518
519  while (first != last)
520    {
521      first = getdelimitstr (first, last, w, t);
522      outlist.push_back (t);
523    }
524}
525
526// join a string using a character
527void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
528{
529  outtext.clear ();
530
531  text_tset::const_iterator here = inlist.begin ();
532  text_tset::const_iterator end = inlist.end ();
533
534  if (here != end) {
535    outtext += *here; ++here;
536    while (here != end) {
537      outtext.push_back (c);
538      outtext += *here;
539      ++here;
540    }
541  }
542}
543
544void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
545{
546  outtext.clear ();
547
548  text_tlist::const_iterator here = inlist.begin ();
549  text_tlist::const_iterator end = inlist.end ();
550  if (here != end) {
551    outtext += *here; ++here;
552    while (here != end) {
553      outtext.push_back (c);
554      outtext += *here;
555      ++here;
556    }
557  }
558}
559
560void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
561{
562  outtext.clear ();
563
564  text_tarray::const_iterator here = inlist.begin ();
565  text_tarray::const_iterator end = inlist.end ();
566  if (here != end) {
567    outtext += *here; ++here;
568    while (here != end) {
569      outtext.push_back (c);
570      outtext += *here;
571      ++here;
572    }
573  }
574}
575
576void joinchar (const text_tlist &inlist, const text_t &c, text_t &outtext)
577{
578  outtext.clear ();
579
580  text_tlist::const_iterator here = inlist.begin ();
581  text_tlist::const_iterator end = inlist.end ();
582  if (here != end) {
583    outtext += *here; ++here;
584    while (here != end) {
585      outtext += c;
586      outtext += *here;
587      ++here;
588    }
589  }
590}
591
592void joinchar (const text_tset &inlist, const text_t &c, text_t &outtext)
593{
594  outtext.clear ();
595
596  text_tset::const_iterator here = inlist.begin ();
597  text_tset::const_iterator end = inlist.end ();
598  if (here != end) {
599    outtext += *here; ++here;
600    while (here != end) {
601      outtext += c;
602      outtext += *here;
603      ++here;
604    }
605  }
606}
607
608void joinchar (const text_tarray &inlist, const text_t &c, text_t &outtext)
609{
610  outtext.clear ();
611
612  text_tarray::const_iterator here = inlist.begin ();
613  text_tarray::const_iterator end = inlist.end ();
614  if (here != end) {
615    outtext += *here; ++here;
616    while (here != end) {
617      outtext += c;
618      outtext += *here;
619      ++here;
620    }
621  }
622}
623
624// count the occurances of a character within a range
625int countchar (text_t::const_iterator first, text_t::const_iterator last,
626           unsigned short c)
627{
628  int count = 0;
629  while (first != last) {
630    if (*first == c) ++count;
631    ++first;
632  }
633  return count;
634}
635
636// return a substring of string from first up to but not including last
637text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
638
639  text_t substr; substr.reserve(last - first + 2);
640  while (first != last) {
641    substr.push_back(*first);
642    ++first;
643  }
644  return substr;
645}
646
647
648// convert to lowercase
649void lc (text_t::iterator first, text_t::iterator last) {
650  while (first != last) {
651    *first = unicode_tolower(*first);
652    ++first;
653  }
654}
655
656// convert to uppercase
657void uc (text_t::iterator first, text_t::iterator last) {
658  while (first != last) {
659    *first = unicode_toupper(*first);
660    ++first;
661  }
662}
663
664
665// checks to see if it is a number (i.e. contains only 0-9)
666bool is_number (const text_t &text) {
667
668  text_t::const_iterator here = text.begin();
669  text_t::const_iterator end = text.end();
670
671  while (here != end) {
672    if ((*here!='0') && (*here!='1') && (*here!='2') &&
673    (*here!='3') && (*here!='4') && (*here!='5') &&
674    (*here!='6') && (*here!='7') && (*here!='8') &&
675    (*here!='9')) return false;
676    ++here;
677  }
678  return true;
679}
680
681
682// checks to see if the text has any letters or digits
683bool has_unicode_letdig (const text_t &text) {
684  if (text.empty()) return false;
685 
686  text_t::const_iterator here = text.begin();
687  text_t::const_iterator end = text.end();
688  while (here != end) {
689    if (is_unicode_letdig (*here)) return true;
690    ++here;
691  }
692
693  return false;
694}
695
696// checks to see if a text_t starts with the specified prefix
697bool starts_with(const text_t& text, const text_t& prefix) {
698  if (prefix.empty()) return true;
699  if (text.empty() || text.size()<prefix.size()) return false;
700  text_t substring = substr(text.begin(), text.begin()+prefix.size());
701  return substring == prefix;
702}
703// checks to see if a text_t ends with the specified suffix
704bool ends_with(const text_t& text, const text_t& suffix) {
705  if (suffix.empty()) return true;
706  if (text.empty() || text.size() < suffix.size()) return false;
707  text_t substring = substr(text.end()-suffix.size(),text.end());
708  return substring == suffix;
709
710}
711
712
713////////////////////////////////////
714// convertclass methods
715////////////////////////////////////
716
717// conversion classes used for getting information in to and out of
718// the text_t class.
719
720convertclass::convertclass ()
721{
722  // nothing to do
723}
724
725void convertclass::reset ()
726{
727  // nothing to do
728}
729
730
731////////////////////////////////////
732// inconvertclass methods
733////////////////////////////////////
734
735// convert from a char stream to the text_t class
736// the default version assumes the input is a ascii
737// character array
738
739inconvertclass::inconvertclass ()
740{
741  start = NULL;
742  len = 0;
743}
744
745
746void inconvertclass::reset ()
747{
748  start = NULL;
749  len = 0;
750}
751
752void inconvertclass::setinput (char *thestart, size_t thelen)
753{
754  start = thestart;
755  len = thelen;
756}
757
758void inconvertclass::convert (text_t &output, status_t &status)
759{
760  output.clear();
761
762  if (start == NULL || len == 0)
763    {
764      status = finished;
765      return;
766    }
767
768  if (output.capacity() < len + 2)
769    output.reserve(len + 2);
770 
771  // don't want any funny sign conversions happening
772  unsigned char *here = (unsigned char *)start;
773  while (len > 0)
774    {
775      output.push_back (*here); // append this character
776      ++here;
777      --len;
778    }
779
780  start = (char *)here; // save current position
781  status = finished;
782}
783
784// will treat the text_t as a 8-bit string and convert
785// it to a 16-bit string using the about convert method.
786text_t inconvertclass::convert (const text_t &t) {
787  text_t out;
788  text_t tmpout;
789  status_t status;
790  text_t::const_iterator here = t.begin();
791  text_t::const_iterator end = t.end();
792  unsigned char cbuf[256];
793  size_t cbuflen = 0;
794 
795  out.clear();
796  if (out.capacity() < t.size() + 2)
797    out.reserve(t.size() + 2);
798  while (here != end) {
799    while (here != end && cbuflen < 256) {
800      cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
801      ++here;
802    }
803
804    if (cbuflen > 0) {
805      setinput ((char *)cbuf, cbuflen);
806      status = unfinished;
807      while (status == unfinished) {
808    convert (tmpout, status);
809    out += tmpout;
810      }
811      cbuflen = 0;
812    }
813  }
814
815  out.setencoding (0); // unicode
816
817  return out;
818}
819
820// an instance of the default inconvertclass to do simple
821// conversions. Note that any functions that use this are
822// not reentrant. If a function needs to be reentrant it
823// should declare its own instance.
824inconvertclass ascii2text_t;
825
826
827////////////////////////////////////
828// outconvertclass methods
829////////////////////////////////////
830
831// Convert from a text_t class to a char stream
832// This default version assumes the output is a ascii
833// character array. If you set the output stream you
834// can use this class to output to a stream using the
835// << operator. The << operator can also be conveniently
836// used to set the output stream by doing something like
837//
838// cout << text_t2ascii << text_tstr << anothertext_tstr;
839//
840outconvertclass::outconvertclass ()
841{
842  input = NULL;
843  outs = NULL;
844}
845
846void outconvertclass::reset ()
847{
848  input = NULL;
849  outs = NULL;
850}
851
852void outconvertclass::setinput (text_t *theinput)
853{
854  input = theinput;
855  if (input != NULL) texthere = input->begin();
856}
857
858void outconvertclass::setdata(text_t *theinput, text_t::iterator thetexthere)
859{
860  input = theinput;     
861  texthere = thetexthere;
862}
863 
864void outconvertclass::convert (char *output, size_t maxlen,
865              size_t &len, status_t &status)
866{
867  if (input == NULL || output == NULL)
868    {
869      status = finished;
870      return;
871    }
872
873  // don't want any funny sign conversions happening
874  unsigned char *uoutput = (unsigned char *)output;
875  text_t::iterator textend = input->end();
876  len = 0;
877  while ((len < maxlen) && (texthere != textend))
878    {
879      if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
880      else {
881    // put a space or a question mark depending on what
882    // the character is. Question marks tell the user that
883    // they are missing some information.
884    if (is_unicode_space (*texthere)) *uoutput = ' ';
885    else *uoutput = '?';
886      }
887      ++uoutput;
888      ++len;
889      ++texthere;
890    }
891 
892  if (texthere == textend) status = finished;
893  else status = unfinished;
894}
895
896// will convert the 16-bit string to a 8-bit stream
897// and place the result in a text_t. This method uses
898// the above convert function.
899text_t outconvertclass::convert (const text_t &t) {
900  text_t out;
901  unsigned char cbuf[256];
902  size_t cbuflen = 0;
903  status_t status = unfinished;
904 
905  out.clear();
906  if (out.capacity() < t.size() + 2)
907    out.reserve(t.size() + 2);
908  setinput ((text_t *)&t); // discard constant
909  while (status == unfinished) {
910    convert ((char *)cbuf, 256, cbuflen, status);
911    out.appendcarr ((char *)cbuf, cbuflen);
912  }
913
914  out.setencoding (1); // other encoding
915 
916  return out;
917}
918
919
920void outconvertclass::setostream (ostream *theouts)
921{
922  outs = theouts;
923}
924
925ostream *outconvertclass::getostream ()
926{
927  return outs;
928}
929
930
931
932
933// an instance of the default outconvertclass to do simple
934// conversions
935outconvertclass text_t2ascii;
936
937
938
939// stream operators for the output class
940
941outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
942{
943  outconverter.setostream(&theouts);
944  return outconverter;
945}
946
947
948#define STREAMBUFSIZE 256
949outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
950{
951  ostream *outstream = outconverter.getostream();
952
953  if (outstream == NULL) return outconverter;
954
955  char outbuf[STREAMBUFSIZE];
956  size_t len;
957  outconvertclass::status_t status = outconvertclass::unfinished;
958
959  // assume that there is no data needing converting
960  // left in the converter
961  outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
962
963  while (status == outconvertclass::unfinished)
964    {
965      outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
966      if (len > 0) outstream->write(outbuf, len);
967    }
968
969  return outconverter;
970}
Note: See TracBrowser for help on using the browser.