source: main/tags/2.25/gsdl/lib/text_t.cpp@ 25562

Last change on this file since 25562 was 1310, checked in by sjboddie, 24 years ago

Removed CVS logging information from source files

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.0 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "text_t.h"
27
28#if defined(GSDL_USE_OBJECTSPACE)
29# include <ospace\std\algorithm>
30#elif defined(GSDL_USE_STL_H)
31# if defined(GSDL_USE_ALGO_H)
32# include <algo.h>
33# else
34# include <algorithm.h>
35# endif
36#else
37# include <algorithm>
38#endif
39
40
41#include "unitool.h"
42
43////////////////////////////////////
44// text_t methods
45////////////////////////////////////
46
47text_t::text_t ()
48{
49 setencoding(0);
50 clear ();
51}
52
53text_t::text_t (int i)
54{
55 setencoding(0);
56 clear ();
57 appendint (i);
58}
59
60text_t::text_t (char *s)
61{
62 setencoding(0);
63 clear ();
64 appendcstr (s);
65}
66
67void text_t::append (const text_t &t)
68{
69 text.insert(text.end(), t.begin(), t.end());
70 // const_iterator here, end=t.end();
71 // for (here=t.begin(); here!=end;here++)
72 // {
73 // text.push_back(*here);
74 // }
75}
76
77void text_t::appendrange (iterator first, iterator last)
78{
79 text.insert(text.end(), first, last);
80 // while (first != last)
81 // {
82 // text.push_back (*first);
83 // first++;
84 // }
85}
86
87void text_t::appendrange (const_iterator first, const_iterator last)
88{
89 text.insert(text.end(), first, last);
90 // while (first != last)
91 // {
92 // text.push_back (*first);
93 // first++;
94 // }
95}
96
97void text_t::appendint (int i)
98{
99 // deal with zeros and negatives
100 if (i == 0)
101 {
102 text.push_back('0');
103 return;
104 }
105 else if (i < 0)
106 {
107 text.push_back('-');
108 i *= -1;
109 }
110
111 // get a buffer for the conversion
112 int maxbuflen = sizeof(int)*3;
113 char *buf = new char[maxbuflen];
114 int len = 0;
115
116 // get the number in reverse
117 while (i > 0)
118 {
119 buf[len++] = '0'+ (i%10);
120 i = i/10;
121 }
122
123 // reverse the number
124 while (len > 0)
125 {
126 text.push_back(buf[--len]);
127 }
128
129 delete buf;
130}
131
132int text_t::getint () const
133{
134 int i = 0;
135 int mult = 1; // become -1 for negative numbers
136
137 const_iterator here = text.begin();
138 const_iterator end = text.end();
139
140 // do plus and minus signs
141 if (here != end)
142 {
143 if (*here == '-')
144 {
145 mult = -1;
146 here++;
147 }
148 else if (*here == '+')
149 {
150 mult = 1;
151 here++;
152 }
153 }
154
155 // deal with the number
156 while ((here != end) && (*here >= '0') && (*here <= '9'))
157 {
158 i = 10*i + (*here - '0');
159 here++;
160 }
161
162 i *= mult;
163 return i;
164}
165
166
167
168void text_t::appendcarr (char *s, size_type len)
169{
170 unsigned char *us = (unsigned char *)s;
171 while (len > 0)
172 {
173 text.push_back (*us); // append this character
174 us++;
175 len--;
176 }
177}
178
179void text_t::appendcstr (char *s)
180{
181 unsigned char *us = (unsigned char *)s;
182 while (*us != '\0')
183 {
184 text.push_back (*us); // append this character
185 us++;
186 }
187}
188
189
// strings returned from getcarr and getcstr become the caller's
// responsibility and should be deallocated with "delete []"
// (they are allocated with "new []")
192
193char *text_t::getcarr(size_type &len) const
194{
195 unsigned char *cstr = new unsigned char[size()];
196 len = 0;
197
198 const_iterator ithere = begin();
199 const_iterator itend = end();
200 while (ithere != itend)
201 {
202 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
203 else {
204 // put a space or a question mark depending on what
205 // the character is. Question marks tell the user that
206 // they are missing some information.
207 if (is_unicode_space (*ithere)) cstr[len] = ' ';
208 else cstr[len] = '?';
209 }
210 len++;
211 ithere++;
212 }
213
214 return (char *)cstr;
215}
216
217char *text_t::getcstr() const
218{
219 unsigned char *cstr = new unsigned char[size() + 1];
220 const_iterator ithere = begin();
221 const_iterator itend = end();
222 int len = 0;
223
224 while (ithere != itend)
225 {
226 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
227 else {
228 // put a space or a question mark depending on what
229 // the character is. Question marks tell the user that
230 // they are missing some information.
231 if (is_unicode_space (*ithere)) cstr[len] = ' ';
232 else cstr[len] = '?';
233 }
234 len++;
235 ithere++;
236 }
237
238 cstr[len] = '\0';
239
240 return (char *)cstr;
241}
242
243
244// general functions which work on text_ts
245
246// find a character within a range
247text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
248 unsigned short c)
249{
250 while (first != last)
251 {
252 if (*first == c) break;
253 first++;
254 }
255 return first;
256}
257
258text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
259 unsigned short c)
260{
261 while (first != last)
262 {
263 if (*first == c) break;
264 first++;
265 }
266 return first;
267}
268
269// get a string up to the next delimiter (which is skipped)
270text_t::const_iterator getdelimitstr (text_t::const_iterator first,
271 text_t::const_iterator last,
272 unsigned short c, text_t &outstr)
273{
274 text_t::const_iterator here = first;
275 here = findchar (first, last, c);
276 outstr.clear();
277 outstr.appendrange (first, here);
278 if (here != last) here++; // skip c
279 return here;
280}
281
282text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
283 unsigned short c, text_t &outstr)
284{
285 text_t::iterator here = first;
286 here = findchar (first, last, c);
287 outstr.clear();
288 outstr.appendrange (first, here);
289 if (here != last) here++; // skip c
290 return here;
291}
292
293// split a string with a character
294void splitchar (text_t::const_iterator first, text_t::const_iterator last,
295 unsigned short c, text_tset &outlist)
296{
297 outlist.erase(outlist.begin(), outlist.end());
298
299 text_t t;
300
301 while (first != last)
302 {
303 first = getdelimitstr (first, last, c, t);
304 outlist.insert (t);
305 }
306}
307
308void splitchar (text_t::const_iterator first, text_t::const_iterator last,
309 unsigned short c, text_tlist &outlist)
310{
311 outlist.erase(outlist.begin(), outlist.end());
312
313 text_t t;
314
315 while (first != last)
316 {
317 first = getdelimitstr (first, last, c, t);
318 outlist.push_back (t);
319 }
320}
321
322void splitchar (text_t::const_iterator first, text_t::const_iterator last,
323 unsigned short c, text_tarray &outlist)
324{
325 outlist.erase(outlist.begin(), outlist.end());
326
327 text_t t;
328
329 while (first != last)
330 {
331 first = getdelimitstr (first, last, c, t);
332 outlist.push_back (t);
333 }
334}
335
336// join a string using a character
337void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
338{
339 outtext.clear ();
340
341 text_tset::const_iterator here = inlist.begin ();
342 text_tset::const_iterator end = inlist.end ();
343 bool first = true;
344 while (here != end)
345 {
346 if (!first) outtext.push_back (c);
347 first = false;
348 outtext += *here;
349 here++;
350 }
351}
352
353void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
354{
355 outtext.clear ();
356
357 text_tlist::const_iterator here = inlist.begin ();
358 text_tlist::const_iterator end = inlist.end ();
359 bool first = true;
360 while (here != end)
361 {
362 if (!first) outtext.push_back (c);
363 first = false;
364 outtext += *here;
365 here++;
366 }
367}
368
369void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
370{
371 outtext.clear ();
372
373 text_tarray::const_iterator here = inlist.begin ();
374 text_tarray::const_iterator end = inlist.end ();
375 bool first = true;
376 while (here != end)
377 {
378 if (!first) outtext.push_back (c);
379 first = false;
380 outtext += *here;
381 here++;
382 }
383}
384
385void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
386{
387 outtext.clear ();
388
389 text_tlist::const_iterator here = inlist.begin ();
390 text_tlist::const_iterator end = inlist.end ();
391 bool first = true;
392 while (here != end)
393 {
394 if (!first) outtext += c;
395 first = false;
396 outtext += *here;
397 here++;
398 }
399}
400
401void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
402{
403 outtext.clear ();
404
405 text_tset::const_iterator here = inlist.begin ();
406 text_tset::const_iterator end = inlist.end ();
407 bool first = true;
408 while (here != end)
409 {
410 if (!first) outtext += c;
411 first = false;
412 outtext += *here;
413 here++;
414 }
415}
416
417void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
418{
419 outtext.clear ();
420
421 text_tarray::const_iterator here = inlist.begin ();
422 text_tarray::const_iterator end = inlist.end ();
423 bool first = true;
424 while (here != end)
425 {
426 if (!first) outtext += c;
427 first = false;
428 outtext += *here;
429 here++;
430 }
431}
432
433// count the occurances of a character within a range
434int countchar (text_t::const_iterator first, text_t::const_iterator last,
435 unsigned short c)
436{
437 int count = 0;
438 while (first != last) {
439 if (*first == c) count ++;
440 first ++;
441 }
442 return count;
443}
444
445// return a substring of string from first up to but not including last
446text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
447
448 text_t substr;
449 while (first != last) {
450 substr.push_back(*first);
451 first ++;
452 }
453 return substr;
454}
455
456
457// convert to lowercase
458void lc (text_t::iterator first, text_t::iterator last) {
459 while (first != last) {
460 *first = unicode_tolower(*first);
461 first++;
462 }
463}
464
465// convert to uppercase
466void uc (text_t::iterator first, text_t::iterator last) {
467 while (first != last) {
468 *first = unicode_toupper(*first);
469 first++;
470 }
471}
472
473
474// checks to see if it is a number (i.e. contains only 0-9)
475bool is_number (const text_t &text) {
476
477 text_t::const_iterator here = text.begin();
478 text_t::const_iterator end = text.end();
479
480 while (here != end) {
481 if ((*here!='0') && (*here!='1') && (*here!='2') &&
482 (*here!='3') && (*here!='4') && (*here!='5') &&
483 (*here!='6') && (*here!='7') && (*here!='8') &&
484 (*here!='9')) return false;
485 here ++;
486 }
487 return true;
488}
489
490
491// checks to see if the text has any letters or digits
492bool has_unicode_letdig (const text_t &text) {
493 if (text.empty()) return false;
494
495 text_t::const_iterator here = text.begin();
496 text_t::const_iterator end = text.end();
497 while (here != end) {
498 if (is_unicode_letdig (*here)) return true;
499 here++;
500 }
501
502 return false;
503}
504
505
506
507////////////////////////////////////
508// convertclass methods
509////////////////////////////////////
510
511// conversion classes used for getting information in to and out of
512// the text_t class.
513
514convertclass::convertclass ()
515{
516 // nothing to do
517}
518
519void convertclass::reset ()
520{
521 // nothing to do
522}
523
524
525////////////////////////////////////
526// inconvertclass methods
527////////////////////////////////////
528
529// convert from a char stream to the text_t class
530// the default version assumes the input is a ascii
531// character array
532
533inconvertclass::inconvertclass ()
534{
535 start = NULL;
536 len = 0;
537}
538
539
540void inconvertclass::reset ()
541{
542 start = NULL;
543 len = 0;
544}
545
546void inconvertclass::setinput (char *thestart, size_t thelen)
547{
548 start = thestart;
549 len = thelen;
550}
551
552void inconvertclass::convert (text_t &output, status_t &status)
553{
554 output.clear();
555
556 if (start == NULL || len == 0)
557 {
558 status = finished;
559 return;
560 }
561
562 // don't want any funny sign conversions happening
563 unsigned char *here = (unsigned char *)start;
564 while (len > 0)
565 {
566 output.push_back (*here); // append this character
567 ++here;
568 --len;
569 }
570
571 start = (char *)here; // save current position
572 status = finished;
573}
574
575// will treat the text_t as a 8-bit string and convert
576// it to a 16-bit string using the about convert method.
577text_t inconvertclass::convert (const text_t &t) {
578 text_t out;
579 text_t tmpout;
580 status_t status;
581 text_t::const_iterator here = t.begin();
582 text_t::const_iterator end = t.end();
583 unsigned char cbuf[256];
584 size_t cbuflen = 0;
585
586 while (here != end) {
587 while (here != end && cbuflen < 256) {
588 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
589 here++;
590 }
591
592 if (cbuflen > 0) {
593 setinput ((char *)cbuf, cbuflen);
594 status = unfinished;
595 while (status == unfinished) {
596 convert (tmpout, status);
597 out += tmpout;
598 }
599 cbuflen = 0;
600 }
601 }
602
603 out.setencoding (0); // unicode
604
605 return out;
606}
607
608// an instance of the default inconvertclass to do simple
609// conversions. Note that any functions that use this are
610// not reentrant. If a function needs to be reentrant it
611// should declare its own instance.
612inconvertclass ascii2text_t;
613
614
615////////////////////////////////////
616// outconvertclass methods
617////////////////////////////////////
618
619// Convert from a text_t class to a char stream
620// This default version assumes the output is a ascii
621// character array. If you set the output stream you
622// can use this class to output to a stream using the
623// << operator. The << operator can also be conveniently
624// used to set the output stream by doing something like
625//
626// cout << text_t2ascii << text_tstr << anothertext_tstr;
627//
628outconvertclass::outconvertclass ()
629{
630 input = NULL;
631 outs = NULL;
632}
633
634void outconvertclass::reset ()
635{
636 input = NULL;
637 outs = NULL;
638}
639
640void outconvertclass::setinput (text_t *theinput)
641{
642 input = theinput;
643 if (input != NULL) texthere = input->begin();
644}
645
646void outconvertclass::convert (char *output, size_t maxlen,
647 size_t &len, status_t &status)
648{
649 if (input == NULL || output == NULL)
650 {
651 status = finished;
652 return;
653 }
654
655 // don't want any funny sign conversions happening
656 unsigned char *uoutput = (unsigned char *)output;
657 text_t::iterator textend = input->end();
658 len = 0;
659 while ((len < maxlen) && (texthere != textend))
660 {
661 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
662 else {
663 // put a space or a question mark depending on what
664 // the character is. Question marks tell the user that
665 // they are missing some information.
666 if (is_unicode_space (*texthere)) *uoutput = ' ';
667 else *uoutput = '?';
668 }
669 ++uoutput;
670 ++len;
671 ++texthere;
672 }
673
674 if (texthere == textend) status = finished;
675 else status = unfinished;
676}
677
678// will convert the 16-bit string to a 8-bit stream
679// and place the result in a text_t. This method uses
680// the above convert function.
681text_t outconvertclass::convert (const text_t &t) {
682 text_t out;
683 unsigned char cbuf[256];
684 size_t cbuflen = 0;
685 status_t status = unfinished;
686
687 setinput ((text_t *)&t); // discard constant
688 while (status == unfinished) {
689 convert ((char *)cbuf, 256, cbuflen, status);
690 out.appendcarr ((char *)cbuf, cbuflen);
691 }
692
693 out.setencoding (1); // other encoding
694
695 return out;
696}
697
698
699void outconvertclass::setostream (ostream *theouts)
700{
701 outs = theouts;
702}
703
704ostream *outconvertclass::getostream ()
705{
706 return outs;
707}
708
709
710
711
712// an instance of the default outconvertclass to do simple
713// conversions
714outconvertclass text_t2ascii;
715
716
717
718// stream operators for the output class
719
720outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
721{
722 outconverter.setostream(&theouts);
723 return outconverter;
724}
725
726
727#define STREAMBUFSIZE 256
728outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
729{
730 ostream *outstream = outconverter.getostream();
731
732 if (outstream == NULL) return outconverter;
733
734 char outbuf[STREAMBUFSIZE];
735 size_t len;
736 outconvertclass::status_t status = outconvertclass::unfinished;
737
738 // assume that there is no data needing converting
739 // left in the converter
740 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
741
742 while (status == outconvertclass::unfinished)
743 {
744 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
745 if (len > 0) outstream->write(outbuf, len);
746 }
747
748 return outconverter;
749}
Note: See TracBrowser for help on using the repository browser.