source: tags/gsdl-2_21-distribution/gsdl/lib/text_t.cpp@ 1186

Last change on this file since 1186 was 1088, checked in by sjboddie, 24 years ago

added text_t versions of joinchar to work with sets and lists

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.5 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 1088 2000-04-14 02:50:12Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.18 2000/04/14 02:50:12 sjboddie
31 added text_t versions of joinchar to work with sets and lists
32
33 Revision 1.17 2000/04/06 19:58:03 cs025
34 Correcting a correction - reinstated all lib files due to silly
35 CVS confusion.
36
37 Revision 1.15 1999/10/14 22:52:39 sjboddie
38 joinchar can join using text_t string now too
39
40 Revision 1.14 1999/09/24 02:30:03 rjmcnab
41 added function has_unicode_letdig
42
43 Revision 1.13 1999/09/07 04:57:43 sjboddie
44 added gpl notice
45
46 Revision 1.12 1999/08/31 08:04:41 rjmcnab
47 Fixed a small but hard to find bug in getcarr
48
49 Revision 1.11 1999/07/01 04:05:09 rjmcnab
50 Optimised append functions slightly and added a reserve function.
51
52 Revision 1.10 1999/04/26 03:58:03 sjboddie
53 added is_number function
54
55 Revision 1.9 1999/04/06 22:17:24 rjmcnab
56 Added splits and joins using text_tset.
57
58 Revision 1.8 1999/02/28 23:14:41 rjmcnab
59
60 Added uc and lc to convert to uppercase and lowercase.
61
62 Revision 1.7 1999/02/21 22:26:39 rjmcnab
63
64 Made getint() a constant function.
65
66 Revision 1.6 1999/02/03 01:13:26 sjboddie
67
68 Got interface to handle subcollections and language subcollections -
69 committed changes made to some of the collections
70
71 Revision 1.5 1999/01/19 01:38:14 rjmcnab
72
73 Made the source more portable.
74
75 Revision 1.4 1999/01/12 01:51:00 rjmcnab
76
77 Standard header.
78
79 Revision 1.3 1999/01/08 02:33:16 rjmcnab
80
81 Added standard header to source files.
82
83 */
84
85
86#include "text_t.h"
87
88#if defined(GSDL_USE_OBJECTSPACE)
89# include <ospace\std\algorithm>
90#elif defined(GSDL_USE_STL_H)
91# if defined(GSDL_USE_ALGO_H)
92# include <algo.h>
93# else
94# include <algorithm.h>
95# endif
96#else
97# include <algorithm>
98#endif
99
100
101#include "unitool.h"
102
103////////////////////////////////////
104// text_t methods
105////////////////////////////////////
106
107text_t::text_t ()
108{
109 setencoding(0);
110 clear ();
111}
112
113text_t::text_t (int i)
114{
115 setencoding(0);
116 clear ();
117 appendint (i);
118}
119
120text_t::text_t (char *s)
121{
122 setencoding(0);
123 clear ();
124 appendcstr (s);
125}
126
127void text_t::append (const text_t &t)
128{
129 text.insert(text.end(), t.begin(), t.end());
130 // const_iterator here, end=t.end();
131 // for (here=t.begin(); here!=end;here++)
132 // {
133 // text.push_back(*here);
134 // }
135}
136
137void text_t::appendrange (iterator first, iterator last)
138{
139 text.insert(text.end(), first, last);
140 // while (first != last)
141 // {
142 // text.push_back (*first);
143 // first++;
144 // }
145}
146
147void text_t::appendrange (const_iterator first, const_iterator last)
148{
149 text.insert(text.end(), first, last);
150 // while (first != last)
151 // {
152 // text.push_back (*first);
153 // first++;
154 // }
155}
156
157void text_t::appendint (int i)
158{
159 // deal with zeros and negatives
160 if (i == 0)
161 {
162 text.push_back('0');
163 return;
164 }
165 else if (i < 0)
166 {
167 text.push_back('-');
168 i *= -1;
169 }
170
171 // get a buffer for the conversion
172 int maxbuflen = sizeof(int)*3;
173 char *buf = new char[maxbuflen];
174 int len = 0;
175
176 // get the number in reverse
177 while (i > 0)
178 {
179 buf[len++] = '0'+ (i%10);
180 i = i/10;
181 }
182
183 // reverse the number
184 while (len > 0)
185 {
186 text.push_back(buf[--len]);
187 }
188
189 delete buf;
190}
191
192int text_t::getint () const
193{
194 int i = 0;
195 int mult = 1; // become -1 for negative numbers
196
197 const_iterator here = text.begin();
198 const_iterator end = text.end();
199
200 // do plus and minus signs
201 if (here != end)
202 {
203 if (*here == '-')
204 {
205 mult = -1;
206 here++;
207 }
208 else if (*here == '+')
209 {
210 mult = 1;
211 here++;
212 }
213 }
214
215 // deal with the number
216 while ((here != end) && (*here >= '0') && (*here <= '9'))
217 {
218 i = 10*i + (*here - '0');
219 here++;
220 }
221
222 i *= mult;
223 return i;
224}
225
226
227
228void text_t::appendcarr (char *s, size_type len)
229{
230 unsigned char *us = (unsigned char *)s;
231 while (len > 0)
232 {
233 text.push_back (*us); // append this character
234 us++;
235 len--;
236 }
237}
238
239void text_t::appendcstr (char *s)
240{
241 unsigned char *us = (unsigned char *)s;
242 while (*us != '\0')
243 {
244 text.push_back (*us); // append this character
245 us++;
246 }
247}
248
249
// strings returned from getcarr and getcstr become the caller's
// responsibility; they are allocated with "new []" and so should be
// deallocated with "delete []"
252
253char *text_t::getcarr(size_type &len) const
254{
255 unsigned char *cstr = new unsigned char[size()];
256 len = 0;
257
258 const_iterator ithere = begin();
259 const_iterator itend = end();
260 while (ithere != itend)
261 {
262 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
263 else {
264 // put a space or a question mark depending on what
265 // the character is. Question marks tell the user that
266 // they are missing some information.
267 if (is_unicode_space (*ithere)) cstr[len] = ' ';
268 else cstr[len] = '?';
269 }
270 len++;
271 ithere++;
272 }
273
274 return (char *)cstr;
275}
276
277char *text_t::getcstr() const
278{
279 unsigned char *cstr = new unsigned char[size() + 1];
280 const_iterator ithere = begin();
281 const_iterator itend = end();
282 int len = 0;
283
284 while (ithere != itend)
285 {
286 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
287 else {
288 // put a space or a question mark depending on what
289 // the character is. Question marks tell the user that
290 // they are missing some information.
291 if (is_unicode_space (*ithere)) cstr[len] = ' ';
292 else cstr[len] = '?';
293 }
294 len++;
295 ithere++;
296 }
297
298 cstr[len] = '\0';
299
300 return (char *)cstr;
301}
302
303
304// general functions which work on text_ts
305
306// find a character within a range
307text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
308 unsigned short c)
309{
310 while (first != last)
311 {
312 if (*first == c) break;
313 first++;
314 }
315 return first;
316}
317
318text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
319 unsigned short c)
320{
321 while (first != last)
322 {
323 if (*first == c) break;
324 first++;
325 }
326 return first;
327}
328
329// get a string up to the next delimiter (which is skipped)
330text_t::const_iterator getdelimitstr (text_t::const_iterator first,
331 text_t::const_iterator last,
332 unsigned short c, text_t &outstr)
333{
334 text_t::const_iterator here = first;
335 here = findchar (first, last, c);
336 outstr.clear();
337 outstr.appendrange (first, here);
338 if (here != last) here++; // skip c
339 return here;
340}
341
342text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
343 unsigned short c, text_t &outstr)
344{
345 text_t::iterator here = first;
346 here = findchar (first, last, c);
347 outstr.clear();
348 outstr.appendrange (first, here);
349 if (here != last) here++; // skip c
350 return here;
351}
352
353// split a string with a character
354void splitchar (text_t::const_iterator first, text_t::const_iterator last,
355 unsigned short c, text_tset &outlist)
356{
357 outlist.erase(outlist.begin(), outlist.end());
358
359 text_t t;
360
361 while (first != last)
362 {
363 first = getdelimitstr (first, last, c, t);
364 outlist.insert (t);
365 }
366}
367
368void splitchar (text_t::const_iterator first, text_t::const_iterator last,
369 unsigned short c, text_tlist &outlist)
370{
371 outlist.erase(outlist.begin(), outlist.end());
372
373 text_t t;
374
375 while (first != last)
376 {
377 first = getdelimitstr (first, last, c, t);
378 outlist.push_back (t);
379 }
380}
381
382void splitchar (text_t::const_iterator first, text_t::const_iterator last,
383 unsigned short c, text_tarray &outlist)
384{
385 outlist.erase(outlist.begin(), outlist.end());
386
387 text_t t;
388
389 while (first != last)
390 {
391 first = getdelimitstr (first, last, c, t);
392 outlist.push_back (t);
393 }
394}
395
396// join a string using a character
397void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
398{
399 outtext.clear ();
400
401 text_tset::const_iterator here = inlist.begin ();
402 text_tset::const_iterator end = inlist.end ();
403 bool first = true;
404 while (here != end)
405 {
406 if (!first) outtext.push_back (c);
407 first = false;
408 outtext += *here;
409 here++;
410 }
411}
412
413void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
414{
415 outtext.clear ();
416
417 text_tlist::const_iterator here = inlist.begin ();
418 text_tlist::const_iterator end = inlist.end ();
419 bool first = true;
420 while (here != end)
421 {
422 if (!first) outtext.push_back (c);
423 first = false;
424 outtext += *here;
425 here++;
426 }
427}
428
429void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
430{
431 outtext.clear ();
432
433 text_tarray::const_iterator here = inlist.begin ();
434 text_tarray::const_iterator end = inlist.end ();
435 bool first = true;
436 while (here != end)
437 {
438 if (!first) outtext.push_back (c);
439 first = false;
440 outtext += *here;
441 here++;
442 }
443}
444
445void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
446{
447 outtext.clear ();
448
449 text_tlist::const_iterator here = inlist.begin ();
450 text_tlist::const_iterator end = inlist.end ();
451 bool first = true;
452 while (here != end)
453 {
454 if (!first) outtext += c;
455 first = false;
456 outtext += *here;
457 here++;
458 }
459}
460
461void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
462{
463 outtext.clear ();
464
465 text_tset::const_iterator here = inlist.begin ();
466 text_tset::const_iterator end = inlist.end ();
467 bool first = true;
468 while (here != end)
469 {
470 if (!first) outtext += c;
471 first = false;
472 outtext += *here;
473 here++;
474 }
475}
476
477void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
478{
479 outtext.clear ();
480
481 text_tarray::const_iterator here = inlist.begin ();
482 text_tarray::const_iterator end = inlist.end ();
483 bool first = true;
484 while (here != end)
485 {
486 if (!first) outtext += c;
487 first = false;
488 outtext += *here;
489 here++;
490 }
491}
492
493// count the occurances of a character within a range
494int countchar (text_t::const_iterator first, text_t::const_iterator last,
495 unsigned short c)
496{
497 int count = 0;
498 while (first != last) {
499 if (*first == c) count ++;
500 first ++;
501 }
502 return count;
503}
504
505// return a substring of string from first up to but not including last
506text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
507
508 text_t substr;
509 while (first != last) {
510 substr.push_back(*first);
511 first ++;
512 }
513 return substr;
514}
515
516
517// convert to lowercase
518void lc (text_t::iterator first, text_t::iterator last) {
519 while (first != last) {
520 *first = unicode_tolower(*first);
521 first++;
522 }
523}
524
525// convert to uppercase
526void uc (text_t::iterator first, text_t::iterator last) {
527 while (first != last) {
528 *first = unicode_toupper(*first);
529 first++;
530 }
531}
532
533
534// checks to see if it is a number (i.e. contains only 0-9)
535bool is_number (const text_t &text) {
536
537 text_t::const_iterator here = text.begin();
538 text_t::const_iterator end = text.end();
539
540 while (here != end) {
541 if ((*here!='0') && (*here!='1') && (*here!='2') &&
542 (*here!='3') && (*here!='4') && (*here!='5') &&
543 (*here!='6') && (*here!='7') && (*here!='8') &&
544 (*here!='9')) return false;
545 here ++;
546 }
547 return true;
548}
549
550
551// checks to see if the text has any letters or digits
552bool has_unicode_letdig (const text_t &text) {
553 if (text.empty()) return false;
554
555 text_t::const_iterator here = text.begin();
556 text_t::const_iterator end = text.end();
557 while (here != end) {
558 if (is_unicode_letdig (*here)) return true;
559 here++;
560 }
561
562 return false;
563}
564
565
566
567////////////////////////////////////
568// convertclass methods
569////////////////////////////////////
570
571// conversion classes used for getting information in to and out of
572// the text_t class.
573
574convertclass::convertclass ()
575{
576 // nothing to do
577}
578
579void convertclass::reset ()
580{
581 // nothing to do
582}
583
584
585////////////////////////////////////
586// inconvertclass methods
587////////////////////////////////////
588
589// convert from a char stream to the text_t class
590// the default version assumes the input is a ascii
591// character array
592
593inconvertclass::inconvertclass ()
594{
595 start = NULL;
596 len = 0;
597}
598
599
600void inconvertclass::reset ()
601{
602 start = NULL;
603 len = 0;
604}
605
606void inconvertclass::setinput (char *thestart, size_t thelen)
607{
608 start = thestart;
609 len = thelen;
610}
611
612void inconvertclass::convert (text_t &output, status_t &status)
613{
614 output.clear();
615
616 if (start == NULL || len == 0)
617 {
618 status = finished;
619 return;
620 }
621
622 // don't want any funny sign conversions happening
623 unsigned char *here = (unsigned char *)start;
624 while (len > 0)
625 {
626 output.push_back (*here); // append this character
627 ++here;
628 --len;
629 }
630
631 start = (char *)here; // save current position
632 status = finished;
633}
634
635// will treat the text_t as a 8-bit string and convert
636// it to a 16-bit string using the about convert method.
637text_t inconvertclass::convert (const text_t &t) {
638 text_t out;
639 text_t tmpout;
640 status_t status;
641 text_t::const_iterator here = t.begin();
642 text_t::const_iterator end = t.end();
643 unsigned char cbuf[256];
644 size_t cbuflen = 0;
645
646 while (here != end) {
647 while (here != end && cbuflen < 256) {
648 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
649 here++;
650 }
651
652 if (cbuflen > 0) {
653 setinput ((char *)cbuf, cbuflen);
654 status = unfinished;
655 while (status == unfinished) {
656 convert (tmpout, status);
657 out += tmpout;
658 }
659 cbuflen = 0;
660 }
661 }
662
663 out.setencoding (0); // unicode
664
665 return out;
666}
667
668// an instance of the default inconvertclass to do simple
669// conversions. Note that any functions that use this are
670// not reentrant. If a function needs to be reentrant it
671// should declare its own instance.
672inconvertclass ascii2text_t;
673
674
675////////////////////////////////////
676// outconvertclass methods
677////////////////////////////////////
678
679// Convert from a text_t class to a char stream
680// This default version assumes the output is a ascii
681// character array. If you set the output stream you
682// can use this class to output to a stream using the
683// << operator. The << operator can also be conveniently
684// used to set the output stream by doing something like
685//
686// cout << text_t2ascii << text_tstr << anothertext_tstr;
687//
688outconvertclass::outconvertclass ()
689{
690 input = NULL;
691 outs = NULL;
692}
693
694void outconvertclass::reset ()
695{
696 input = NULL;
697 outs = NULL;
698}
699
700void outconvertclass::setinput (text_t *theinput)
701{
702 input = theinput;
703 if (input != NULL) texthere = input->begin();
704}
705
706void outconvertclass::convert (char *output, size_t maxlen,
707 size_t &len, status_t &status)
708{
709 if (input == NULL || output == NULL)
710 {
711 status = finished;
712 return;
713 }
714
715 // don't want any funny sign conversions happening
716 unsigned char *uoutput = (unsigned char *)output;
717 text_t::iterator textend = input->end();
718 len = 0;
719 while ((len < maxlen) && (texthere != textend))
720 {
721 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
722 else {
723 // put a space or a question mark depending on what
724 // the character is. Question marks tell the user that
725 // they are missing some information.
726 if (is_unicode_space (*texthere)) *uoutput = ' ';
727 else *uoutput = '?';
728 }
729 ++uoutput;
730 ++len;
731 ++texthere;
732 }
733
734 if (texthere == textend) status = finished;
735 else status = unfinished;
736}
737
738// will convert the 16-bit string to a 8-bit stream
739// and place the result in a text_t. This method uses
740// the above convert function.
741text_t outconvertclass::convert (const text_t &t) {
742 text_t out;
743 unsigned char cbuf[256];
744 size_t cbuflen = 0;
745 status_t status = unfinished;
746
747 setinput ((text_t *)&t); // discard constant
748 while (status == unfinished) {
749 convert ((char *)cbuf, 256, cbuflen, status);
750 out.appendcarr ((char *)cbuf, cbuflen);
751 }
752
753 out.setencoding (1); // other encoding
754
755 return out;
756}
757
758
759void outconvertclass::setostream (ostream *theouts)
760{
761 outs = theouts;
762}
763
764ostream *outconvertclass::getostream ()
765{
766 return outs;
767}
768
769
770
771
772// an instance of the default outconvertclass to do simple
773// conversions
774outconvertclass text_t2ascii;
775
776
777
778// stream operators for the output class
779
780outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
781{
782 outconverter.setostream(&theouts);
783 return outconverter;
784}
785
786
787#define STREAMBUFSIZE 256
788outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
789{
790 ostream *outstream = outconverter.getostream();
791
792 if (outstream == NULL) return outconverter;
793
794 char outbuf[STREAMBUFSIZE];
795 size_t len;
796 outconvertclass::status_t status = outconvertclass::unfinished;
797
798 // assume that there is no data needing converting
799 // left in the converter
800 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
801
802 while (status == outconvertclass::unfinished)
803 {
804 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
805 if (len > 0) outstream->write(outbuf, len);
806 }
807
808 return outconverter;
809}
Note: See TracBrowser for help on using the repository browser.