source: main/tags/2.13/gsdl/lib/text_t.cpp@ 24552

Last change on this file since 24552 was 665, checked in by sjboddie, 25 years ago

joinchar can join using text_t string now too

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.6 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 665 1999-10-14 22:52:39Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.15 1999/10/14 22:52:39 sjboddie
31 joinchar can join using text_t string now too
32
33 Revision 1.14 1999/09/24 02:30:03 rjmcnab
34 added function has_unicode_letdig
35
36 Revision 1.13 1999/09/07 04:57:43 sjboddie
37 added gpl notice
38
39 Revision 1.12 1999/08/31 08:04:41 rjmcnab
40 Fixed a small but hard to find bug in getcarr
41
42 Revision 1.11 1999/07/01 04:05:09 rjmcnab
43 Optimised append functions slightly and added a reserve function.
44
45 Revision 1.10 1999/04/26 03:58:03 sjboddie
46 added is_number function
47
48 Revision 1.9 1999/04/06 22:17:24 rjmcnab
49 Added splits and joins using text_tset.
50
51 Revision 1.8 1999/02/28 23:14:41 rjmcnab
52
53 Added uc and lc to convert to uppercase and lowercase.
54
55 Revision 1.7 1999/02/21 22:26:39 rjmcnab
56
57 Made getint() a constant function.
58
59 Revision 1.6 1999/02/03 01:13:26 sjboddie
60
61 Got interface to handle subcollections and language subcollections -
62 committed changes made to some of the collections
63
64 Revision 1.5 1999/01/19 01:38:14 rjmcnab
65
66 Made the source more portable.
67
68 Revision 1.4 1999/01/12 01:51:00 rjmcnab
69
70 Standard header.
71
72 Revision 1.3 1999/01/08 02:33:16 rjmcnab
73
74 Added standard header to source files.
75
76 */
77
78
79#include "text_t.h"
80
81#if defined(GSDL_USE_OBJECTSPACE)
82# include <ospace\std\algorithm>
83#elif defined(GSDL_USE_STL_H)
84# if defined(GSDL_USE_ALGO_H)
85# include <algo.h>
86# else
87# include <algorithm.h>
88# endif
89#else
90# include <algorithm>
91#endif
92
93
94#include "unitool.h"
95
96////////////////////////////////////
97// text_t methods
98////////////////////////////////////
99
100text_t::text_t ()
101{
102 setencoding(0);
103 clear ();
104}
105
106text_t::text_t (int i)
107{
108 setencoding(0);
109 clear ();
110 appendint (i);
111}
112
113text_t::text_t (char *s)
114{
115 setencoding(0);
116 clear ();
117 appendcstr (s);
118}
119
120void text_t::append (const text_t &t)
121{
122 text.insert(text.end(), t.begin(), t.end());
123 // const_iterator here, end=t.end();
124 // for (here=t.begin(); here!=end;here++)
125 // {
126 // text.push_back(*here);
127 // }
128}
129
130void text_t::appendrange (iterator first, iterator last)
131{
132 text.insert(text.end(), first, last);
133 // while (first != last)
134 // {
135 // text.push_back (*first);
136 // first++;
137 // }
138}
139
140void text_t::appendrange (const_iterator first, const_iterator last)
141{
142 text.insert(text.end(), first, last);
143 // while (first != last)
144 // {
145 // text.push_back (*first);
146 // first++;
147 // }
148}
149
150void text_t::appendint (int i)
151{
152 // deal with zeros and negatives
153 if (i == 0)
154 {
155 text.push_back('0');
156 return;
157 }
158 else if (i < 0)
159 {
160 text.push_back('-');
161 i *= -1;
162 }
163
164 // get a buffer for the conversion
165 int maxbuflen = sizeof(int)*3;
166 char *buf = new char[maxbuflen];
167 int len = 0;
168
169 // get the number in reverse
170 while (i > 0)
171 {
172 buf[len++] = '0'+ (i%10);
173 i = i/10;
174 }
175
176 // reverse the number
177 while (len > 0)
178 {
179 text.push_back(buf[--len]);
180 }
181
182 delete buf;
183}
184
185int text_t::getint () const
186{
187 int i = 0;
188 int mult = 1; // become -1 for negative numbers
189
190 const_iterator here = text.begin();
191 const_iterator end = text.end();
192
193 // do plus and minus signs
194 if (here != end)
195 {
196 if (*here == '-')
197 {
198 mult = -1;
199 here++;
200 }
201 else if (*here == '+')
202 {
203 mult = 1;
204 here++;
205 }
206 }
207
208 // deal with the number
209 while ((here != end) && (*here >= '0') && (*here <= '9'))
210 {
211 i = 10*i + (*here - '0');
212 here++;
213 }
214
215 i *= mult;
216 return i;
217}
218
219
220
221void text_t::appendcarr (char *s, size_type len)
222{
223 unsigned char *us = (unsigned char *)s;
224 while (len > 0)
225 {
226 text.push_back (*us); // append this character
227 us++;
228 len--;
229 }
230}
231
232void text_t::appendcstr (char *s)
233{
234 unsigned char *us = (unsigned char *)s;
235 while (*us != '\0')
236 {
237 text.push_back (*us); // append this character
238 us++;
239 }
240}
241
242
// strings returned from getcarr and getcstr become the caller's
// responsibility; they are allocated with "new []" and so should be
// deallocated with "delete []"
245
246char *text_t::getcarr(size_type &len) const
247{
248 unsigned char *cstr = new unsigned char[size()];
249 len = 0;
250
251 const_iterator ithere = begin();
252 const_iterator itend = end();
253 while (ithere != itend)
254 {
255 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
256 else {
257 // put a space or a question mark depending on what
258 // the character is. Question marks tell the user that
259 // they are missing some information.
260 if (is_unicode_space (*ithere)) cstr[len] = ' ';
261 else cstr[len] = '?';
262 }
263 len++;
264 ithere++;
265 }
266
267 return (char *)cstr;
268}
269
270char *text_t::getcstr() const
271{
272 unsigned char *cstr = new unsigned char[size() + 1];
273 const_iterator ithere = begin();
274 const_iterator itend = end();
275 int len = 0;
276
277 while (ithere != itend)
278 {
279 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
280 else {
281 // put a space or a question mark depending on what
282 // the character is. Question marks tell the user that
283 // they are missing some information.
284 if (is_unicode_space (*ithere)) cstr[len] = ' ';
285 else cstr[len] = '?';
286 }
287 len++;
288 ithere++;
289 }
290
291 cstr[len] = '\0';
292
293 return (char *)cstr;
294}
295
296
297// general functions which work on text_ts
298
299// find a character within a range
300text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
301 unsigned short c)
302{
303 while (first != last)
304 {
305 if (*first == c) break;
306 first++;
307 }
308 return first;
309}
310
311text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
312 unsigned short c)
313{
314 while (first != last)
315 {
316 if (*first == c) break;
317 first++;
318 }
319 return first;
320}
321
322// get a string up to the next delimiter (which is skipped)
323text_t::const_iterator getdelimitstr (text_t::const_iterator first,
324 text_t::const_iterator last,
325 unsigned short c, text_t &outstr)
326{
327 text_t::const_iterator here = first;
328 here = findchar (first, last, c);
329 outstr.clear();
330 outstr.appendrange (first, here);
331 if (here != last) here++; // skip c
332 return here;
333}
334
335text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
336 unsigned short c, text_t &outstr)
337{
338 text_t::iterator here = first;
339 here = findchar (first, last, c);
340 outstr.clear();
341 outstr.appendrange (first, here);
342 if (here != last) here++; // skip c
343 return here;
344}
345
346// split a string with a character
347void splitchar (text_t::const_iterator first, text_t::const_iterator last,
348 unsigned short c, text_tset &outlist)
349{
350 outlist.erase(outlist.begin(), outlist.end());
351
352 text_t t;
353
354 while (first != last)
355 {
356 first = getdelimitstr (first, last, c, t);
357 outlist.insert (t);
358 }
359}
360
361void splitchar (text_t::const_iterator first, text_t::const_iterator last,
362 unsigned short c, text_tlist &outlist)
363{
364 outlist.erase(outlist.begin(), outlist.end());
365
366 text_t t;
367
368 while (first != last)
369 {
370 first = getdelimitstr (first, last, c, t);
371 outlist.push_back (t);
372 }
373}
374
375void splitchar (text_t::const_iterator first, text_t::const_iterator last,
376 unsigned short c, text_tarray &outlist)
377{
378 outlist.erase(outlist.begin(), outlist.end());
379
380 text_t t;
381
382 while (first != last)
383 {
384 first = getdelimitstr (first, last, c, t);
385 outlist.push_back (t);
386 }
387}
388
389// join a string using a character
390void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
391{
392 outtext.clear ();
393
394 text_tset::const_iterator here = inlist.begin ();
395 text_tset::const_iterator end = inlist.end ();
396 bool first = true;
397 while (here != end)
398 {
399 if (!first) outtext.push_back (c);
400 first = false;
401 outtext += *here;
402 here++;
403 }
404}
405
406void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
407{
408 outtext.clear ();
409
410 text_tlist::const_iterator here = inlist.begin ();
411 text_tlist::const_iterator end = inlist.end ();
412 bool first = true;
413 while (here != end)
414 {
415 if (!first) outtext.push_back (c);
416 first = false;
417 outtext += *here;
418 here++;
419 }
420}
421
422void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
423{
424 outtext.clear ();
425
426 text_tarray::const_iterator here = inlist.begin ();
427 text_tarray::const_iterator end = inlist.end ();
428 bool first = true;
429 while (here != end)
430 {
431 if (!first) outtext.push_back (c);
432 first = false;
433 outtext += *here;
434 here++;
435 }
436}
437
438void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
439{
440 outtext.clear ();
441
442 text_tarray::const_iterator here = inlist.begin ();
443 text_tarray::const_iterator end = inlist.end ();
444 bool first = true;
445 while (here != end)
446 {
447 if (!first) outtext += c;
448 first = false;
449 outtext += *here;
450 here++;
451 }
452}
453
454// count the occurances of a character within a range
455int countchar (text_t::const_iterator first, text_t::const_iterator last,
456 unsigned short c)
457{
458 int count = 0;
459 while (first != last) {
460 if (*first == c) count ++;
461 first ++;
462 }
463 return count;
464}
465
466// return a substring of string from first up to but not including last
467text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
468
469 text_t substr;
470 while (first != last) {
471 substr.push_back(*first);
472 first ++;
473 }
474 return substr;
475}
476
477
478// convert to lowercase
479void lc (text_t::iterator first, text_t::iterator last) {
480 while (first != last) {
481 *first = unicode_tolower(*first);
482 first++;
483 }
484}
485
486// convert to uppercase
487void uc (text_t::iterator first, text_t::iterator last) {
488 while (first != last) {
489 *first = unicode_toupper(*first);
490 first++;
491 }
492}
493
494
495// checks to see if it is a number (i.e. contains only 0-9)
496bool is_number (const text_t &text) {
497
498 text_t::const_iterator here = text.begin();
499 text_t::const_iterator end = text.end();
500
501 while (here != end) {
502 if ((*here!='0') && (*here!='1') && (*here!='2') &&
503 (*here!='3') && (*here!='4') && (*here!='5') &&
504 (*here!='6') && (*here!='7') && (*here!='8') &&
505 (*here!='9')) return false;
506 here ++;
507 }
508 return true;
509}
510
511
512// checks to see if the text has any letters or digits
513bool has_unicode_letdig (const text_t &text) {
514 if (text.empty()) return false;
515
516 text_t::const_iterator here = text.begin();
517 text_t::const_iterator end = text.end();
518 while (here != end) {
519 if (is_unicode_letdig (*here)) return true;
520 here++;
521 }
522
523 return false;
524}
525
526
527
528////////////////////////////////////
529// convertclass methods
530////////////////////////////////////
531
532// conversion classes used for getting information in to and out of
533// the text_t class.
534
535convertclass::convertclass ()
536{
537 // nothing to do
538}
539
540void convertclass::reset ()
541{
542 // nothing to do
543}
544
545
546////////////////////////////////////
547// inconvertclass methods
548////////////////////////////////////
549
// Convert from a char stream to the text_t class.
// The default version assumes the input is an ASCII
// character array.
553
554inconvertclass::inconvertclass ()
555{
556 start = NULL;
557 len = 0;
558}
559
560
561void inconvertclass::reset ()
562{
563 start = NULL;
564 len = 0;
565}
566
567void inconvertclass::setinput (char *thestart, size_t thelen)
568{
569 start = thestart;
570 len = thelen;
571}
572
573void inconvertclass::convert (text_t &output, status_t &status)
574{
575 output.clear();
576
577 if (start == NULL || len == 0)
578 {
579 status = finished;
580 return;
581 }
582
583 // don't want any funny sign conversions happening
584 unsigned char *here = (unsigned char *)start;
585 while (len > 0)
586 {
587 output.push_back (*here); // append this character
588 ++here;
589 --len;
590 }
591
592 start = (char *)here; // save current position
593 status = finished;
594}
595
596// will treat the text_t as a 8-bit string and convert
597// it to a 16-bit string using the about convert method.
598text_t inconvertclass::convert (const text_t &t) {
599 text_t out;
600 text_t tmpout;
601 status_t status;
602 text_t::const_iterator here = t.begin();
603 text_t::const_iterator end = t.end();
604 unsigned char cbuf[256];
605 size_t cbuflen = 0;
606
607 while (here != end) {
608 while (here != end && cbuflen < 256) {
609 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
610 here++;
611 }
612
613 if (cbuflen > 0) {
614 setinput ((char *)cbuf, cbuflen);
615 status = unfinished;
616 while (status == unfinished) {
617 convert (tmpout, status);
618 out += tmpout;
619 }
620 cbuflen = 0;
621 }
622 }
623
624 out.setencoding (0); // unicode
625
626 return out;
627}
628
629// an instance of the default inconvertclass to do simple
630// conversions. Note that any functions that use this are
631// not reentrant. If a function needs to be reentrant it
632// should declare its own instance.
// (The instance keeps the input pointer and remaining length given to
// setinput, and convert updates them, so its state is shared by every
// user of this object.)
633inconvertclass ascii2text_t;
634
635
636////////////////////////////////////
637// outconvertclass methods
638////////////////////////////////////
639
// Convert from a text_t class to a char stream.
// This default version assumes the output is an ASCII
// character array. If you set the output stream you
// can use this class to output to a stream using the
// << operator. The << operator can also be conveniently
// used to set the output stream by doing something like
//
// cout << text_t2ascii << text_tstr << anothertext_tstr;
//
649outconvertclass::outconvertclass ()
650{
651 input = NULL;
652 outs = NULL;
653}
654
655void outconvertclass::reset ()
656{
657 input = NULL;
658 outs = NULL;
659}
660
661void outconvertclass::setinput (text_t *theinput)
662{
663 input = theinput;
664 if (input != NULL) texthere = input->begin();
665}
666
667void outconvertclass::convert (char *output, size_t maxlen,
668 size_t &len, status_t &status)
669{
670 if (input == NULL || output == NULL)
671 {
672 status = finished;
673 return;
674 }
675
676 // don't want any funny sign conversions happening
677 unsigned char *uoutput = (unsigned char *)output;
678 text_t::iterator textend = input->end();
679 len = 0;
680 while ((len < maxlen) && (texthere != textend))
681 {
682 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
683 else {
684 // put a space or a question mark depending on what
685 // the character is. Question marks tell the user that
686 // they are missing some information.
687 if (is_unicode_space (*texthere)) *uoutput = ' ';
688 else *uoutput = '?';
689 }
690 ++uoutput;
691 ++len;
692 ++texthere;
693 }
694
695 if (texthere == textend) status = finished;
696 else status = unfinished;
697}
698
699// will convert the 16-bit string to a 8-bit stream
700// and place the result in a text_t. This method uses
701// the above convert function.
702text_t outconvertclass::convert (const text_t &t) {
703 text_t out;
704 unsigned char cbuf[256];
705 size_t cbuflen = 0;
706 status_t status = unfinished;
707
708 setinput ((text_t *)&t); // discard constant
709 while (status == unfinished) {
710 convert ((char *)cbuf, 256, cbuflen, status);
711 out.appendcarr ((char *)cbuf, cbuflen);
712 }
713
714 out.setencoding (1); // other encoding
715
716 return out;
717}
718
719
720void outconvertclass::setostream (ostream *theouts)
721{
722 outs = theouts;
723}
724
725ostream *outconvertclass::getostream ()
726{
727 return outs;
728}
729
730
731
732
733// an instance of the default outconvertclass to do simple
734// conversions
// (It remembers the stream given to setostream and the text_t given
// to setinput, so its state is shared by every user of this object.)
735outconvertclass text_t2ascii;
736
737
738
739// stream operators for the output class
740
741outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
742{
743 outconverter.setostream(&theouts);
744 return outconverter;
745}
746
747
748#define STREAMBUFSIZE 256
749outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
750{
751 ostream *outstream = outconverter.getostream();
752
753 if (outstream == NULL) return outconverter;
754
755 char outbuf[STREAMBUFSIZE];
756 size_t len;
757 outconvertclass::status_t status = outconvertclass::unfinished;
758
759 // assume that there is no data needing converting
760 // left in the converter
761 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
762
763 while (status == outconvertclass::unfinished)
764 {
765 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
766 if (len > 0) outstream->write(outbuf, len);
767 }
768
769 return outconverter;
770}
Note: See TracBrowser for help on using the repository browser.