source: trunk/gsdl/lib/text_t.cpp@ 481

Last change on this file since 481 was 480, checked in by rjmcnab, 25 years ago

Fixed a small but hard to find bug in getcarr

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.9 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: text_t.cpp 480 1999-08-31 08:04:41Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.12 1999/08/31 08:04:41 rjmcnab
15 Fixed a small but hard to find bug in getcarr
16
17 Revision 1.11 1999/07/01 04:05:09 rjmcnab
18 Optimised append functions slightly and added a reserve function.
19
20 Revision 1.10 1999/04/26 03:58:03 sjboddie
21 added is_number function
22
23 Revision 1.9 1999/04/06 22:17:24 rjmcnab
24 Added splits and joins using text_tset.
25
26 Revision 1.8 1999/02/28 23:14:41 rjmcnab
27
28 Added uc and lc to convert to uppercase and lowercase.
29
30 Revision 1.7 1999/02/21 22:26:39 rjmcnab
31
32 Made getint() a constant function.
33
34 Revision 1.6 1999/02/03 01:13:26 sjboddie
35
36 Got interface to handle subcollections and language subcollections -
37 committed changes made to some of the collections
38
39 Revision 1.5 1999/01/19 01:38:14 rjmcnab
40
41 Made the source more portable.
42
43 Revision 1.4 1999/01/12 01:51:00 rjmcnab
44
45 Standard header.
46
47 Revision 1.3 1999/01/08 02:33:16 rjmcnab
48
49 Added standard header to source files.
50
51 */
52
53
54#include "text_t.h"
55
56#if defined(GSDL_USE_OBJECTSPACE)
57# include <ospace\std\algorithm>
58#elif defined(GSDL_USE_STL_H)
59# if defined(GSDL_USE_ALGO_H)
60# include <algo.h>
61# else
62# include <algorithm.h>
63# endif
64#else
65# include <algorithm>
66#endif
67
68
69#include "unitool.h"
70
71////////////////////////////////////
72// text_t methods
73////////////////////////////////////
74
75text_t::text_t ()
76{
77 setencoding(0);
78 clear ();
79}
80
81text_t::text_t (int i)
82{
83 setencoding(0);
84 clear ();
85 appendint (i);
86}
87
88text_t::text_t (char *s)
89{
90 setencoding(0);
91 clear ();
92 appendcstr (s);
93}
94
95void text_t::append (const text_t &t)
96{
97 text.insert(text.end(), t.begin(), t.end());
98 // const_iterator here, end=t.end();
99 // for (here=t.begin(); here!=end;here++)
100 // {
101 // text.push_back(*here);
102 // }
103}
104
105void text_t::appendrange (iterator first, iterator last)
106{
107 text.insert(text.end(), first, last);
108 // while (first != last)
109 // {
110 // text.push_back (*first);
111 // first++;
112 // }
113}
114
115void text_t::appendrange (const_iterator first, const_iterator last)
116{
117 text.insert(text.end(), first, last);
118 // while (first != last)
119 // {
120 // text.push_back (*first);
121 // first++;
122 // }
123}
124
125void text_t::appendint (int i)
126{
127 // deal with zeros and negatives
128 if (i == 0)
129 {
130 text.push_back('0');
131 return;
132 }
133 else if (i < 0)
134 {
135 text.push_back('-');
136 i *= -1;
137 }
138
139 // get a buffer for the conversion
140 int maxbuflen = sizeof(int)*3;
141 char *buf = new char[maxbuflen];
142 int len = 0;
143
144 // get the number in reverse
145 while (i > 0)
146 {
147 buf[len++] = '0'+ (i%10);
148 i = i/10;
149 }
150
151 // reverse the number
152 while (len > 0)
153 {
154 text.push_back(buf[--len]);
155 }
156
157 delete buf;
158}
159
160int text_t::getint () const
161{
162 int i = 0;
163 int mult = 1; // become -1 for negative numbers
164
165 const_iterator here = text.begin();
166 const_iterator end = text.end();
167
168 // do plus and minus signs
169 if (here != end)
170 {
171 if (*here == '-')
172 {
173 mult = -1;
174 here++;
175 }
176 else if (*here == '+')
177 {
178 mult = 1;
179 here++;
180 }
181 }
182
183 // deal with the number
184 while ((here != end) && (*here >= '0') && (*here <= '9'))
185 {
186 i = 10*i + (*here - '0');
187 here++;
188 }
189
190 i *= mult;
191 return i;
192}
193
194
195
196void text_t::appendcarr (char *s, size_type len)
197{
198 unsigned char *us = (unsigned char *)s;
199 while (len > 0)
200 {
201 text.push_back (*us); // append this character
202 us++;
203 len--;
204 }
205}
206
207void text_t::appendcstr (char *s)
208{
209 unsigned char *us = (unsigned char *)s;
210 while (*us != '\0')
211 {
212 text.push_back (*us); // append this character
213 us++;
214 }
215}
216
217
218// strings returned from getcarr and getcstr become the caller's
219// responsibility and should be deallocated with "delete []"
220
221char *text_t::getcarr(size_type &len) const
222{
223 unsigned char *cstr = new unsigned char[size()];
224 len = 0;
225
226 const_iterator ithere = begin();
227 const_iterator itend = end();
228 while (ithere != itend)
229 {
230 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
231 else {
232 // put a space or a question mark depending on what
233 // the character is. Question marks tell the user that
234 // they are missing some information.
235 if (is_unicode_space (*ithere)) cstr[len] = ' ';
236 else cstr[len] = '?';
237 }
238 len++;
239 ithere++;
240 }
241
242 return (char *)cstr;
243}
244
245char *text_t::getcstr() const
246{
247 unsigned char *cstr = new unsigned char[size() + 1];
248 const_iterator ithere = begin();
249 const_iterator itend = end();
250 int len = 0;
251
252 while (ithere != itend)
253 {
254 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
255 else {
256 // put a space or a question mark depending on what
257 // the character is. Question marks tell the user that
258 // they are missing some information.
259 if (is_unicode_space (*ithere)) cstr[len] = ' ';
260 else cstr[len] = '?';
261 }
262 len++;
263 ithere++;
264 }
265
266 cstr[len] = '\0';
267
268 return (char *)cstr;
269}
270
271
272// general functions which work on text_ts
273
274// find a character within a range
275text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
276 unsigned short c)
277{
278 while (first != last)
279 {
280 if (*first == c) break;
281 first++;
282 }
283 return first;
284}
285
286text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
287 unsigned short c)
288{
289 while (first != last)
290 {
291 if (*first == c) break;
292 first++;
293 }
294 return first;
295}
296
297// get a string up to the next delimiter (which is skipped)
298text_t::const_iterator getdelimitstr (text_t::const_iterator first,
299 text_t::const_iterator last,
300 unsigned short c, text_t &outstr)
301{
302 text_t::const_iterator here = first;
303 here = findchar (first, last, c);
304 outstr.clear();
305 outstr.appendrange (first, here);
306 if (here != last) here++; // skip c
307 return here;
308}
309
310text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
311 unsigned short c, text_t &outstr)
312{
313 text_t::iterator here = first;
314 here = findchar (first, last, c);
315 outstr.clear();
316 outstr.appendrange (first, here);
317 if (here != last) here++; // skip c
318 return here;
319}
320
321// split a string with a character
322void splitchar (text_t::const_iterator first, text_t::const_iterator last,
323 unsigned short c, text_tset &outlist)
324{
325 outlist.erase(outlist.begin(), outlist.end());
326
327 text_t t;
328
329 while (first != last)
330 {
331 first = getdelimitstr (first, last, c, t);
332 outlist.insert (t);
333 }
334}
335
336void splitchar (text_t::const_iterator first, text_t::const_iterator last,
337 unsigned short c, text_tlist &outlist)
338{
339 outlist.erase(outlist.begin(), outlist.end());
340
341 text_t t;
342
343 while (first != last)
344 {
345 first = getdelimitstr (first, last, c, t);
346 outlist.push_back (t);
347 }
348}
349
350void splitchar (text_t::const_iterator first, text_t::const_iterator last,
351 unsigned short c, text_tarray &outlist)
352{
353 outlist.erase(outlist.begin(), outlist.end());
354
355 text_t t;
356
357 while (first != last)
358 {
359 first = getdelimitstr (first, last, c, t);
360 outlist.push_back (t);
361 }
362}
363
364// join a string using a character
365void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
366{
367 outtext.clear ();
368
369 text_tset::const_iterator here = inlist.begin ();
370 text_tset::const_iterator end = inlist.end ();
371 bool first = true;
372 while (here != end)
373 {
374 if (!first) outtext.push_back (c);
375 first = false;
376 outtext += *here;
377 here++;
378 }
379}
380
381void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
382{
383 outtext.clear ();
384
385 text_tlist::const_iterator here = inlist.begin ();
386 text_tlist::const_iterator end = inlist.end ();
387 bool first = true;
388 while (here != end)
389 {
390 if (!first) outtext.push_back (c);
391 first = false;
392 outtext += *here;
393 here++;
394 }
395}
396
397void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
398{
399 outtext.clear ();
400
401 text_tarray::const_iterator here = inlist.begin ();
402 text_tarray::const_iterator end = inlist.end ();
403 bool first = true;
404 while (here != end)
405 {
406 if (!first) outtext.push_back (c);
407 first = false;
408 outtext += *here;
409 here++;
410 }
411}
412
413// count the occurances of a character within a range
414int countchar (text_t::const_iterator first, text_t::const_iterator last,
415 unsigned short c)
416{
417 int count = 0;
418 while (first != last) {
419 if (*first == c) count ++;
420 first ++;
421 }
422 return count;
423}
424
425// return a substring of string from first up to but not including last
426text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
427
428 text_t substr;
429 while (first != last) {
430 substr.push_back(*first);
431 first ++;
432 }
433 return substr;
434}
435
436
437// convert to lowercase
438void lc (text_t::iterator first, text_t::iterator last) {
439 while (first != last) {
440 *first = unicode_tolower(*first);
441 first++;
442 }
443}
444
445// convert to uppercase
446void uc (text_t::iterator first, text_t::iterator last) {
447 while (first != last) {
448 *first = unicode_toupper(*first);
449 first++;
450 }
451}
452
453
454// checks to see if it is a number (i.e. contains only 0-9)
455bool is_number (const text_t &text) {
456
457 text_t::const_iterator here = text.begin();
458 text_t::const_iterator end = text.end();
459
460 while (here != end) {
461 if ((*here!='0') && (*here!='1') && (*here!='2') &&
462 (*here!='3') && (*here!='4') && (*here!='5') &&
463 (*here!='6') && (*here!='7') && (*here!='8') &&
464 (*here!='9')) return false;
465 here ++;
466 }
467 return true;
468}
469
470
471
472////////////////////////////////////
473// convertclass methods
474////////////////////////////////////
475
476// conversion classes used for getting information in to and out of
477// the text_t class.
478
479convertclass::convertclass ()
480{
481 // nothing to do
482}
483
484void convertclass::reset ()
485{
486 // nothing to do
487}
488
489
490////////////////////////////////////
491// inconvertclass methods
492////////////////////////////////////
493
494// convert from a char stream to the text_t class
495// the default version assumes the input is an ascii
496// character array
497
498inconvertclass::inconvertclass ()
499{
500 start = NULL;
501 len = 0;
502}
503
504
505void inconvertclass::reset ()
506{
507 start = NULL;
508 len = 0;
509}
510
511void inconvertclass::setinput (char *thestart, size_t thelen)
512{
513 start = thestart;
514 len = thelen;
515}
516
517void inconvertclass::convert (text_t &output, status_t &status)
518{
519 output.clear();
520
521 if (start == NULL || len == 0)
522 {
523 status = finished;
524 return;
525 }
526
527 // don't want any funny sign conversions happening
528 unsigned char *here = (unsigned char *)start;
529 while (len > 0)
530 {
531 output.push_back (*here); // append this character
532 ++here;
533 --len;
534 }
535
536 start = (char *)here; // save current position
537 status = finished;
538}
539
540// will treat the text_t as a 8-bit string and convert
541// it to a 16-bit string using the about convert method.
542text_t inconvertclass::convert (const text_t &t) {
543 text_t out;
544 text_t tmpout;
545 status_t status;
546 text_t::const_iterator here = t.begin();
547 text_t::const_iterator end = t.end();
548 unsigned char cbuf[256];
549 size_t cbuflen = 0;
550
551 while (here != end) {
552 while (here != end && cbuflen < 256) {
553 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
554 here++;
555 }
556
557 if (cbuflen > 0) {
558 setinput ((char *)cbuf, cbuflen);
559 status = unfinished;
560 while (status == unfinished) {
561 convert (tmpout, status);
562 out += tmpout;
563 }
564 cbuflen = 0;
565 }
566 }
567
568 out.setencoding (0); // unicode
569
570 return out;
571}
572
573// an instance of the default inconvertclass to do simple
574// conversions. setinput() and convert() keep the current input position
575// in the object itself, so any function that uses this shared instance is
576// not reentrant. If a function needs to be reentrant it should declare its own instance.
577inconvertclass ascii2text_t;
578
579
580////////////////////////////////////
581// outconvertclass methods
582////////////////////////////////////
583
584// Convert from a text_t class to a char stream
585// This default version assumes the output is an ascii
586// character array. If you set the output stream you
587// can use this class to output to a stream using the
588// << operator. The << operator can also be conveniently
589// used to set the output stream by doing something like
590//
591// cout << text_t2ascii << text_tstr << anothertext_tstr;
592//
593outconvertclass::outconvertclass ()
594{
595 input = NULL;
596 outs = NULL;
597}
598
599void outconvertclass::reset ()
600{
601 input = NULL;
602 outs = NULL;
603}
604
605void outconvertclass::setinput (text_t *theinput)
606{
607 input = theinput;
608 if (input != NULL) texthere = input->begin();
609}
610
611void outconvertclass::convert (char *output, size_t maxlen,
612 size_t &len, status_t &status)
613{
614 if (input == NULL || output == NULL)
615 {
616 status = finished;
617 return;
618 }
619
620 // don't want any funny sign conversions happening
621 unsigned char *uoutput = (unsigned char *)output;
622 text_t::iterator textend = input->end();
623 len = 0;
624 while ((len < maxlen) && (texthere != textend))
625 {
626 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
627 else {
628 // put a space or a question mark depending on what
629 // the character is. Question marks tell the user that
630 // they are missing some information.
631 if (is_unicode_space (*texthere)) *uoutput = ' ';
632 else *uoutput = '?';
633 }
634 ++uoutput;
635 ++len;
636 ++texthere;
637 }
638
639 if (texthere == textend) status = finished;
640 else status = unfinished;
641}
642
643// will convert the 16-bit string to a 8-bit stream
644// and place the result in a text_t. This method uses
645// the above convert function.
646text_t outconvertclass::convert (const text_t &t) {
647 text_t out;
648 unsigned char cbuf[256];
649 size_t cbuflen = 0;
650 status_t status = unfinished;
651
652 setinput ((text_t *)&t); // discard constant
653 while (status == unfinished) {
654 convert ((char *)cbuf, 256, cbuflen, status);
655 out.appendcarr ((char *)cbuf, cbuflen);
656 }
657
658 out.setencoding (1); // other encoding
659
660 return out;
661}
662
663
664void outconvertclass::setostream (ostream *theouts)
665{
666 outs = theouts;
667}
668
669ostream *outconvertclass::getostream ()
670{
671 return outs;
672}
673
674
675
676
677// an instance of the default outconvertclass to do simple
678// conversions; it stores the current input text and output stream, so it is shared state
679outconvertclass text_t2ascii;
680
681
682
683// stream operators for the output class
684
685outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
686{
687 outconverter.setostream(&theouts);
688 return outconverter;
689}
690
691
692#define STREAMBUFSIZE 256
693outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
694{
695 ostream *outstream = outconverter.getostream();
696
697 if (outstream == NULL) return outconverter;
698
699 char outbuf[STREAMBUFSIZE];
700 size_t len;
701 outconvertclass::status_t status = outconvertclass::unfinished;
702
703 // assume that there is no data needing converting
704 // left in the converter
705 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
706
707 while (status == outconvertclass::unfinished)
708 {
709 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
710 if (len > 0) outstream->write(outbuf, len);
711 }
712
713 return outconverter;
714}
Note: See TracBrowser for help on using the repository browser.