root/branches/New_Config_Format-branch/gsdl/lib/text_t.cpp @ 1279

Revision 1279, 17.6 KB (checked in by sjboddie, 20 years ago)

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999  The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id$
25 *
26 *********************************************************************/
27
28/*
29   $Log$
30   Revision 1.17.2.1  2000/07/12 22:20:56  sjboddie
31   merged changes to trunk into New_Config_Format branch
32
33   Revision 1.18  2000/04/14 02:50:12  sjboddie
34   added text_t versions of joinchar to work with sets and lists
35
36   Revision 1.17  2000/04/06 19:58:03  cs025
37   Correcting a correction - reinstated all lib files due to silly
38   CVS confusion.
39
40   Revision 1.15  1999/10/14 22:52:39  sjboddie
41   joinchar can join using text_t string now too
42
43   Revision 1.14  1999/09/24 02:30:03  rjmcnab
44   added function has_unicode_letdig
45
46   Revision 1.13  1999/09/07 04:57:43  sjboddie
47   added gpl notice
48
49   Revision 1.12  1999/08/31 08:04:41  rjmcnab
50   Fixed a small but hard to find bug in getcarr
51
52   Revision 1.11  1999/07/01 04:05:09  rjmcnab
53   Optimised append functions slightly and added a reserve function.
54
55   Revision 1.10  1999/04/26 03:58:03  sjboddie
56   added is_number function
57
58   Revision 1.9  1999/04/06 22:17:24  rjmcnab
59   Added splits and joins using text_tset.
60
61   Revision 1.8  1999/02/28 23:14:41  rjmcnab
62
63   Added uc and lc to convert to uppercase and lowercase.
64
65   Revision 1.7  1999/02/21 22:26:39  rjmcnab
66
67   Made getint() a constant function.
68
69   Revision 1.6  1999/02/03 01:13:26  sjboddie
70
71   Got interface to handle subcollections and language subcollections -
72   committed changes made to some of the collections
73
74   Revision 1.5  1999/01/19 01:38:14  rjmcnab
75
76   Made the source more portable.
77
78   Revision 1.4  1999/01/12 01:51:00  rjmcnab
79
80   Standard header.
81
82   Revision 1.3  1999/01/08 02:33:16  rjmcnab
83
84   Added standard header to source files.
85
86 */
87
88
89#include "text_t.h"
90
91#if defined(GSDL_USE_OBJECTSPACE)
92#  include <ospace\std\algorithm>
93#elif defined(GSDL_USE_STL_H)
94#  if defined(GSDL_USE_ALGO_H)
95#    include <algo.h>
96#  else
97#    include <algorithm.h>
98#  endif
99#else
100#  include <algorithm>
101#endif
102
103
104#include "unitool.h"
105
106////////////////////////////////////
107// text_t methods
108////////////////////////////////////
109
110text_t::text_t ()
111{
112  setencoding(0);
113  clear ();
114}
115
116text_t::text_t (int i)
117{
118  setencoding(0);
119  clear ();
120  appendint (i);
121}
122
123text_t::text_t (char *s)
124{
125  setencoding(0);
126  clear ();
127  appendcstr (s);
128}
129
130void text_t::append (const text_t &t)
131{
132  text.insert(text.end(), t.begin(), t.end());
133  //  const_iterator here, end=t.end();
134  //  for (here=t.begin(); here!=end;here++)
135  //    {
136  //      text.push_back(*here);
137  //    }
138}
139
140void text_t::appendrange (iterator first, iterator last)
141{
142  text.insert(text.end(), first, last);
143  //  while (first != last)
144  //  {
145  //    text.push_back (*first);
146  //    first++;
147  //  }
148}
149
150void text_t::appendrange (const_iterator first, const_iterator last)
151{
152  text.insert(text.end(), first, last);
153  //  while (first != last)
154  //  {
155  //    text.push_back (*first);
156  //    first++;
157  //  }
158}
159
160void text_t::appendint (int i)
161{
162  // deal with zeros and negatives
163  if (i == 0)
164    {
165      text.push_back('0');
166      return;
167    }
168  else if (i < 0)
169    {
170      text.push_back('-');
171      i *= -1;
172    }
173
174  // get a buffer for the conversion
175  int maxbuflen = sizeof(int)*3;
176  char *buf = new char[maxbuflen];
177  int len = 0;
178 
179  // get the number in reverse
180  while (i > 0)
181    {
182      buf[len++] = '0'+ (i%10);
183      i = i/10;
184    }
185
186  // reverse the number
187  while (len > 0)
188    {
189      text.push_back(buf[--len]);
190    }
191
192  delete buf;
193}
194
195int text_t::getint () const
196{
197  int i = 0;
198  int mult = 1; // become -1 for negative numbers
199
200  const_iterator here = text.begin();
201  const_iterator end = text.end();
202 
203  // do plus and minus signs
204  if (here != end)
205    {
206      if (*here == '-')
207    {
208      mult = -1;
209      here++;
210    }
211      else if (*here == '+')
212    {
213      mult = 1;
214      here++;
215    }
216    }
217
218  // deal with the number
219  while ((here != end) && (*here >= '0') && (*here <= '9'))
220    {
221      i = 10*i + (*here - '0');
222      here++;
223    }
224
225  i *= mult;
226  return i;
227}
228
229
230
231void text_t::appendcarr (char *s, size_type len)
232{
233  unsigned char *us = (unsigned char *)s;
234  while (len > 0)
235    {
236      text.push_back (*us); // append this character
237      us++;
238      len--;
239    }
240}
241
242void text_t::appendcstr (char *s)
243{
244  unsigned char *us = (unsigned char *)s;
245  while (*us != '\0')
246    {
247      text.push_back (*us); // append this character
248      us++;
249    }
250}
251
252
253// strings returned from getcarr and getcstr become the callers
254// responsibility and should be deallocated with "delete"
255
256char *text_t::getcarr(size_type &len) const
257{
258  unsigned char *cstr = new unsigned char[size()];
259  len = 0;
260
261  const_iterator ithere = begin();
262  const_iterator itend = end();
263  while (ithere != itend)
264    {
265      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
266      else {
267    // put a space or a question mark depending on what
268    // the character is. Question marks tell the user that
269    // they are missing some information.
270    if (is_unicode_space (*ithere)) cstr[len] = ' ';
271    else cstr[len] = '?';
272      }
273      len++;
274      ithere++;
275    }
276
277  return (char *)cstr;
278}
279
280char *text_t::getcstr() const
281{
282  unsigned char *cstr = new unsigned char[size() + 1];
283  const_iterator ithere = begin();
284  const_iterator itend = end();
285  int len = 0;
286
287  while (ithere != itend)
288    {
289      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
290      else {
291    // put a space or a question mark depending on what
292    // the character is. Question marks tell the user that
293    // they are missing some information.
294    if (is_unicode_space (*ithere)) cstr[len] = ' ';
295    else cstr[len] = '?';
296      }
297      len++;
298      ithere++;
299    }
300
301  cstr[len] = '\0';
302
303  return (char *)cstr;
304}
305
306
307// general functions which work on text_ts
308
309// find a character within a range
310text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
311                 unsigned short c)
312{
313  while (first != last)
314    {
315      if (*first == c) break;
316      first++;
317    }
318  return first;
319}
320
321text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
322               unsigned short c)
323{
324  while (first != last)
325    {
326      if (*first == c) break;
327      first++;
328    }
329  return first;
330}
331
332// get a string up to the next delimiter (which is skipped)
333text_t::const_iterator getdelimitstr (text_t::const_iterator first,
334                      text_t::const_iterator last,
335                      unsigned short c, text_t &outstr)
336{
337  text_t::const_iterator here = first;
338  here = findchar (first, last, c);
339  outstr.clear();
340  outstr.appendrange (first, here);
341  if (here != last) here++; // skip c
342  return here;
343}
344
345text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
346                unsigned short c, text_t &outstr)
347{
348  text_t::iterator here = first;
349  here = findchar (first, last, c);
350  outstr.clear();
351  outstr.appendrange (first, here);
352  if (here != last) here++; // skip c
353  return here;
354}
355
356// split a string with a character
357void splitchar (text_t::const_iterator first, text_t::const_iterator last,
358        unsigned short c, text_tset &outlist)
359{
360  outlist.erase(outlist.begin(), outlist.end());
361
362  text_t t;
363
364  while (first != last)
365    {
366      first = getdelimitstr (first, last, c, t);
367      outlist.insert (t);
368    }
369}
370
371void splitchar (text_t::const_iterator first, text_t::const_iterator last,
372        unsigned short c, text_tlist &outlist)
373{
374  outlist.erase(outlist.begin(), outlist.end());
375
376  text_t t;
377
378  while (first != last)
379    {
380      first = getdelimitstr (first, last, c, t);
381      outlist.push_back (t);
382    }
383}
384
385void splitchar (text_t::const_iterator first, text_t::const_iterator last,
386        unsigned short c, text_tarray &outlist)
387{
388  outlist.erase(outlist.begin(), outlist.end());
389
390  text_t t;
391
392  while (first != last)
393    {
394      first = getdelimitstr (first, last, c, t);
395      outlist.push_back (t);
396    }
397}
398
399// join a string using a character
400void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
401{
402  outtext.clear ();
403
404  text_tset::const_iterator here = inlist.begin ();
405  text_tset::const_iterator end = inlist.end ();
406  bool first = true;
407  while (here != end)
408    {
409      if (!first) outtext.push_back (c);
410      first = false;
411      outtext += *here;
412      here++;
413    }
414}
415
416void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
417{
418  outtext.clear ();
419
420  text_tlist::const_iterator here = inlist.begin ();
421  text_tlist::const_iterator end = inlist.end ();
422  bool first = true;
423  while (here != end)
424    {
425      if (!first) outtext.push_back (c);
426      first = false;
427      outtext += *here;
428      here++;
429    }
430}
431
432void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
433{
434  outtext.clear ();
435
436  text_tarray::const_iterator here = inlist.begin ();
437  text_tarray::const_iterator end = inlist.end ();
438  bool first = true;
439  while (here != end)
440    {
441      if (!first) outtext.push_back (c);
442      first = false;
443      outtext += *here;
444      here++;
445    }
446}
447
448void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
449{
450  outtext.clear ();
451
452  text_tlist::const_iterator here = inlist.begin ();
453  text_tlist::const_iterator end = inlist.end ();
454  bool first = true;
455  while (here != end)
456    {
457      if (!first) outtext += c;
458      first = false;
459      outtext += *here;
460      here++;
461    }
462}
463
464void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
465{
466  outtext.clear ();
467
468  text_tset::const_iterator here = inlist.begin ();
469  text_tset::const_iterator end = inlist.end ();
470  bool first = true;
471  while (here != end)
472    {
473      if (!first) outtext += c;
474      first = false;
475      outtext += *here;
476      here++;
477    }
478}
479
480void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
481{
482  outtext.clear ();
483
484  text_tarray::const_iterator here = inlist.begin ();
485  text_tarray::const_iterator end = inlist.end ();
486  bool first = true;
487  while (here != end)
488    {
489      if (!first) outtext += c;
490      first = false;
491      outtext += *here;
492      here++;
493    }
494}
495
496// count the occurances of a character within a range
497int countchar (text_t::const_iterator first, text_t::const_iterator last,
498           unsigned short c)
499{
500  int count = 0;
501  while (first != last) {
502    if (*first == c) count ++;
503    first ++;
504  }
505  return count;
506}
507
508// return a substring of string from first up to but not including last
509text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
510
511  text_t substr;
512  while (first != last) {
513    substr.push_back(*first);
514    first ++;
515  }
516  return substr;
517}
518
519
520// convert to lowercase
521void lc (text_t::iterator first, text_t::iterator last) {
522  while (first != last) {
523    *first = unicode_tolower(*first);
524    first++;
525  }
526}
527
528// convert to uppercase
529void uc (text_t::iterator first, text_t::iterator last) {
530  while (first != last) {
531    *first = unicode_toupper(*first);
532    first++;
533  }
534}
535
536
537// checks to see if it is a number (i.e. contains only 0-9)
538bool is_number (const text_t &text) {
539
540  text_t::const_iterator here = text.begin();
541  text_t::const_iterator end = text.end();
542
543  while (here != end) {
544    if ((*here!='0') && (*here!='1') && (*here!='2') &&
545    (*here!='3') && (*here!='4') && (*here!='5') &&
546    (*here!='6') && (*here!='7') && (*here!='8') &&
547    (*here!='9')) return false;
548    here ++;
549  }
550  return true;
551}
552
553
554// checks to see if the text has any letters or digits
555bool has_unicode_letdig (const text_t &text) {
556  if (text.empty()) return false;
557 
558  text_t::const_iterator here = text.begin();
559  text_t::const_iterator end = text.end();
560  while (here != end) {
561    if (is_unicode_letdig (*here)) return true;
562    here++;
563  }
564
565  return false;
566}
567
568
569
570////////////////////////////////////
571// convertclass methods
572////////////////////////////////////
573
574// conversion classes used for getting information in to and out of
575// the text_t class.
576
577convertclass::convertclass ()
578{
579  // nothing to do
580}
581
582void convertclass::reset ()
583{
584  // nothing to do
585}
586
587
588////////////////////////////////////
589// inconvertclass methods
590////////////////////////////////////
591
// Convert from a char stream to the text_t class.
// The default version assumes the input is an ASCII
// character array.
595
596inconvertclass::inconvertclass ()
597{
598  start = NULL;
599  len = 0;
600}
601
602
603void inconvertclass::reset ()
604{
605  start = NULL;
606  len = 0;
607}
608
609void inconvertclass::setinput (char *thestart, size_t thelen)
610{
611  start = thestart;
612  len = thelen;
613}
614
615void inconvertclass::convert (text_t &output, status_t &status)
616{
617  output.clear();
618
619  if (start == NULL || len == 0)
620    {
621      status = finished;
622      return;
623    }
624
625  // don't want any funny sign conversions happening
626  unsigned char *here = (unsigned char *)start;
627  while (len > 0)
628    {
629      output.push_back (*here); // append this character
630      ++here;
631      --len;
632    }
633
634  start = (char *)here; // save current position
635  status = finished;
636}
637
638// will treat the text_t as a 8-bit string and convert
639// it to a 16-bit string using the about convert method.
640text_t inconvertclass::convert (const text_t &t) {
641  text_t out;
642  text_t tmpout;
643  status_t status;
644  text_t::const_iterator here = t.begin();
645  text_t::const_iterator end = t.end();
646  unsigned char cbuf[256];
647  size_t cbuflen = 0;
648 
649  while (here != end) {
650    while (here != end && cbuflen < 256) {
651      cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
652      here++;
653    }
654
655    if (cbuflen > 0) {
656      setinput ((char *)cbuf, cbuflen);
657      status = unfinished;
658      while (status == unfinished) {
659    convert (tmpout, status);
660    out += tmpout;
661      }
662      cbuflen = 0;
663    }
664  }
665
666  out.setencoding (0); // unicode
667
668  return out;
669}
670
671// an instance of the default inconvertclass to do simple
672// conversions. Note that any functions that use this are
673// not reentrant. If a function needs to be reentrant it
674// should declare its own instance.
675inconvertclass ascii2text_t;
676
677
678////////////////////////////////////
679// outconvertclass methods
680////////////////////////////////////
681
// Convert from a text_t class to a char stream.
// This default version assumes the output is an ASCII
// character array. If you set the output stream you
685// can use this class to output to a stream using the
686// << operator. The << operator can also be conveniently
687// used to set the output stream by doing something like
688//
689// cout << text_t2ascii << text_tstr << anothertext_tstr;
690//
691outconvertclass::outconvertclass ()
692{
693  input = NULL;
694  outs = NULL;
695}
696
697void outconvertclass::reset ()
698{
699  input = NULL;
700  outs = NULL;
701}
702
703void outconvertclass::setinput (text_t *theinput)
704{
705  input = theinput;
706  if (input != NULL) texthere = input->begin();
707}
708
709void outconvertclass::convert (char *output, size_t maxlen,
710              size_t &len, status_t &status)
711{
712  if (input == NULL || output == NULL)
713    {
714      status = finished;
715      return;
716    }
717
718  // don't want any funny sign conversions happening
719  unsigned char *uoutput = (unsigned char *)output;
720  text_t::iterator textend = input->end();
721  len = 0;
722  while ((len < maxlen) && (texthere != textend))
723    {
724      if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
725      else {
726    // put a space or a question mark depending on what
727    // the character is. Question marks tell the user that
728    // they are missing some information.
729    if (is_unicode_space (*texthere)) *uoutput = ' ';
730    else *uoutput = '?';
731      }
732      ++uoutput;
733      ++len;
734      ++texthere;
735    }
736 
737  if (texthere == textend) status = finished;
738  else status = unfinished;
739}
740
741// will convert the 16-bit string to a 8-bit stream
742// and place the result in a text_t. This method uses
743// the above convert function.
744text_t outconvertclass::convert (const text_t &t) {
745  text_t out;
746  unsigned char cbuf[256];
747  size_t cbuflen = 0;
748  status_t status = unfinished;
749
750  setinput ((text_t *)&t); // discard constant
751  while (status == unfinished) {
752    convert ((char *)cbuf, 256, cbuflen, status);
753    out.appendcarr ((char *)cbuf, cbuflen);
754  }
755
756  out.setencoding (1); // other encoding
757 
758  return out;
759}
760
761
762void outconvertclass::setostream (ostream *theouts)
763{
764  outs = theouts;
765}
766
767ostream *outconvertclass::getostream ()
768{
769  return outs;
770}
771
772
773
774
775// an instance of the default outconvertclass to do simple
776// conversions
777outconvertclass text_t2ascii;
778
779
780
781// stream operators for the output class
782
783outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
784{
785  outconverter.setostream(&theouts);
786  return outconverter;
787}
788
789
790#define STREAMBUFSIZE 256
791outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
792{
793  ostream *outstream = outconverter.getostream();
794
795  if (outstream == NULL) return outconverter;
796
797  char outbuf[STREAMBUFSIZE];
798  size_t len;
799  outconvertclass::status_t status = outconvertclass::unfinished;
800
801  // assume that there is no data needing converting
802  // left in the converter
803  outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
804
805  while (status == outconvertclass::unfinished)
806    {
807      outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
808      if (len > 0) outstream->write(outbuf, len);
809    }
810
811  return outconverter;
812}
Note: See TracBrowser for help on using the browser.