source: trunk/gsdl/lib/text_t.cpp@ 595

Last change on this file since 595 was 534, checked in by sjboddie, 25 years ago

added gpl notice

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.8 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 534 1999-09-07 04:57:43Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.13 1999/09/07 04:57:43 sjboddie
31 added gpl notice
32
33 Revision 1.12 1999/08/31 08:04:41 rjmcnab
34 Fixed a small but hard to find bug in getcarr
35
36 Revision 1.11 1999/07/01 04:05:09 rjmcnab
37 Optimised append functions slightly and added a reserve function.
38
39 Revision 1.10 1999/04/26 03:58:03 sjboddie
40 added is_number function
41
42 Revision 1.9 1999/04/06 22:17:24 rjmcnab
43 Added splits and joins using text_tset.
44
45 Revision 1.8 1999/02/28 23:14:41 rjmcnab
46
47 Added uc and lc to convert to uppercase and lowercase.
48
49 Revision 1.7 1999/02/21 22:26:39 rjmcnab
50
51 Made getint() a constant function.
52
53 Revision 1.6 1999/02/03 01:13:26 sjboddie
54
55 Got interface to handle subcollections and language subcollections -
56 committed changes made to some of the collections
57
58 Revision 1.5 1999/01/19 01:38:14 rjmcnab
59
60 Made the source more portable.
61
62 Revision 1.4 1999/01/12 01:51:00 rjmcnab
63
64 Standard header.
65
66 Revision 1.3 1999/01/08 02:33:16 rjmcnab
67
68 Added standard header to source files.
69
70 */
71
72
73#include "text_t.h"
74
75#if defined(GSDL_USE_OBJECTSPACE)
76# include <ospace\std\algorithm>
77#elif defined(GSDL_USE_STL_H)
78# if defined(GSDL_USE_ALGO_H)
79# include <algo.h>
80# else
81# include <algorithm.h>
82# endif
83#else
84# include <algorithm>
85#endif
86
87
88#include "unitool.h"
89
90////////////////////////////////////
91// text_t methods
92////////////////////////////////////
93
94text_t::text_t ()
95{
96 setencoding(0);
97 clear ();
98}
99
100text_t::text_t (int i)
101{
102 setencoding(0);
103 clear ();
104 appendint (i);
105}
106
107text_t::text_t (char *s)
108{
109 setencoding(0);
110 clear ();
111 appendcstr (s);
112}
113
114void text_t::append (const text_t &t)
115{
116 text.insert(text.end(), t.begin(), t.end());
117 // const_iterator here, end=t.end();
118 // for (here=t.begin(); here!=end;here++)
119 // {
120 // text.push_back(*here);
121 // }
122}
123
124void text_t::appendrange (iterator first, iterator last)
125{
126 text.insert(text.end(), first, last);
127 // while (first != last)
128 // {
129 // text.push_back (*first);
130 // first++;
131 // }
132}
133
134void text_t::appendrange (const_iterator first, const_iterator last)
135{
136 text.insert(text.end(), first, last);
137 // while (first != last)
138 // {
139 // text.push_back (*first);
140 // first++;
141 // }
142}
143
144void text_t::appendint (int i)
145{
146 // deal with zeros and negatives
147 if (i == 0)
148 {
149 text.push_back('0');
150 return;
151 }
152 else if (i < 0)
153 {
154 text.push_back('-');
155 i *= -1;
156 }
157
158 // get a buffer for the conversion
159 int maxbuflen = sizeof(int)*3;
160 char *buf = new char[maxbuflen];
161 int len = 0;
162
163 // get the number in reverse
164 while (i > 0)
165 {
166 buf[len++] = '0'+ (i%10);
167 i = i/10;
168 }
169
170 // reverse the number
171 while (len > 0)
172 {
173 text.push_back(buf[--len]);
174 }
175
176 delete buf;
177}
178
179int text_t::getint () const
180{
181 int i = 0;
182 int mult = 1; // become -1 for negative numbers
183
184 const_iterator here = text.begin();
185 const_iterator end = text.end();
186
187 // do plus and minus signs
188 if (here != end)
189 {
190 if (*here == '-')
191 {
192 mult = -1;
193 here++;
194 }
195 else if (*here == '+')
196 {
197 mult = 1;
198 here++;
199 }
200 }
201
202 // deal with the number
203 while ((here != end) && (*here >= '0') && (*here <= '9'))
204 {
205 i = 10*i + (*here - '0');
206 here++;
207 }
208
209 i *= mult;
210 return i;
211}
212
213
214
215void text_t::appendcarr (char *s, size_type len)
216{
217 unsigned char *us = (unsigned char *)s;
218 while (len > 0)
219 {
220 text.push_back (*us); // append this character
221 us++;
222 len--;
223 }
224}
225
226void text_t::appendcstr (char *s)
227{
228 unsigned char *us = (unsigned char *)s;
229 while (*us != '\0')
230 {
231 text.push_back (*us); // append this character
232 us++;
233 }
234}
235
236
// strings returned from getcarr and getcstr become the caller's
// responsibility and should be deallocated with "delete []"
// (both are allocated with array new)
239
240char *text_t::getcarr(size_type &len) const
241{
242 unsigned char *cstr = new unsigned char[size()];
243 len = 0;
244
245 const_iterator ithere = begin();
246 const_iterator itend = end();
247 while (ithere != itend)
248 {
249 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
250 else {
251 // put a space or a question mark depending on what
252 // the character is. Question marks tell the user that
253 // they are missing some information.
254 if (is_unicode_space (*ithere)) cstr[len] = ' ';
255 else cstr[len] = '?';
256 }
257 len++;
258 ithere++;
259 }
260
261 return (char *)cstr;
262}
263
264char *text_t::getcstr() const
265{
266 unsigned char *cstr = new unsigned char[size() + 1];
267 const_iterator ithere = begin();
268 const_iterator itend = end();
269 int len = 0;
270
271 while (ithere != itend)
272 {
273 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
274 else {
275 // put a space or a question mark depending on what
276 // the character is. Question marks tell the user that
277 // they are missing some information.
278 if (is_unicode_space (*ithere)) cstr[len] = ' ';
279 else cstr[len] = '?';
280 }
281 len++;
282 ithere++;
283 }
284
285 cstr[len] = '\0';
286
287 return (char *)cstr;
288}
289
290
291// general functions which work on text_ts
292
293// find a character within a range
294text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
295 unsigned short c)
296{
297 while (first != last)
298 {
299 if (*first == c) break;
300 first++;
301 }
302 return first;
303}
304
305text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
306 unsigned short c)
307{
308 while (first != last)
309 {
310 if (*first == c) break;
311 first++;
312 }
313 return first;
314}
315
316// get a string up to the next delimiter (which is skipped)
317text_t::const_iterator getdelimitstr (text_t::const_iterator first,
318 text_t::const_iterator last,
319 unsigned short c, text_t &outstr)
320{
321 text_t::const_iterator here = first;
322 here = findchar (first, last, c);
323 outstr.clear();
324 outstr.appendrange (first, here);
325 if (here != last) here++; // skip c
326 return here;
327}
328
329text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
330 unsigned short c, text_t &outstr)
331{
332 text_t::iterator here = first;
333 here = findchar (first, last, c);
334 outstr.clear();
335 outstr.appendrange (first, here);
336 if (here != last) here++; // skip c
337 return here;
338}
339
340// split a string with a character
341void splitchar (text_t::const_iterator first, text_t::const_iterator last,
342 unsigned short c, text_tset &outlist)
343{
344 outlist.erase(outlist.begin(), outlist.end());
345
346 text_t t;
347
348 while (first != last)
349 {
350 first = getdelimitstr (first, last, c, t);
351 outlist.insert (t);
352 }
353}
354
355void splitchar (text_t::const_iterator first, text_t::const_iterator last,
356 unsigned short c, text_tlist &outlist)
357{
358 outlist.erase(outlist.begin(), outlist.end());
359
360 text_t t;
361
362 while (first != last)
363 {
364 first = getdelimitstr (first, last, c, t);
365 outlist.push_back (t);
366 }
367}
368
369void splitchar (text_t::const_iterator first, text_t::const_iterator last,
370 unsigned short c, text_tarray &outlist)
371{
372 outlist.erase(outlist.begin(), outlist.end());
373
374 text_t t;
375
376 while (first != last)
377 {
378 first = getdelimitstr (first, last, c, t);
379 outlist.push_back (t);
380 }
381}
382
383// join a string using a character
384void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
385{
386 outtext.clear ();
387
388 text_tset::const_iterator here = inlist.begin ();
389 text_tset::const_iterator end = inlist.end ();
390 bool first = true;
391 while (here != end)
392 {
393 if (!first) outtext.push_back (c);
394 first = false;
395 outtext += *here;
396 here++;
397 }
398}
399
400void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
401{
402 outtext.clear ();
403
404 text_tlist::const_iterator here = inlist.begin ();
405 text_tlist::const_iterator end = inlist.end ();
406 bool first = true;
407 while (here != end)
408 {
409 if (!first) outtext.push_back (c);
410 first = false;
411 outtext += *here;
412 here++;
413 }
414}
415
416void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
417{
418 outtext.clear ();
419
420 text_tarray::const_iterator here = inlist.begin ();
421 text_tarray::const_iterator end = inlist.end ();
422 bool first = true;
423 while (here != end)
424 {
425 if (!first) outtext.push_back (c);
426 first = false;
427 outtext += *here;
428 here++;
429 }
430}
431
432// count the occurances of a character within a range
433int countchar (text_t::const_iterator first, text_t::const_iterator last,
434 unsigned short c)
435{
436 int count = 0;
437 while (first != last) {
438 if (*first == c) count ++;
439 first ++;
440 }
441 return count;
442}
443
444// return a substring of string from first up to but not including last
445text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
446
447 text_t substr;
448 while (first != last) {
449 substr.push_back(*first);
450 first ++;
451 }
452 return substr;
453}
454
455
456// convert to lowercase
457void lc (text_t::iterator first, text_t::iterator last) {
458 while (first != last) {
459 *first = unicode_tolower(*first);
460 first++;
461 }
462}
463
464// convert to uppercase
465void uc (text_t::iterator first, text_t::iterator last) {
466 while (first != last) {
467 *first = unicode_toupper(*first);
468 first++;
469 }
470}
471
472
473// checks to see if it is a number (i.e. contains only 0-9)
474bool is_number (const text_t &text) {
475
476 text_t::const_iterator here = text.begin();
477 text_t::const_iterator end = text.end();
478
479 while (here != end) {
480 if ((*here!='0') && (*here!='1') && (*here!='2') &&
481 (*here!='3') && (*here!='4') && (*here!='5') &&
482 (*here!='6') && (*here!='7') && (*here!='8') &&
483 (*here!='9')) return false;
484 here ++;
485 }
486 return true;
487}
488
489
490
491////////////////////////////////////
492// convertclass methods
493////////////////////////////////////
494
495// conversion classes used for getting information in to and out of
496// the text_t class.
497
498convertclass::convertclass ()
499{
500 // nothing to do
501}
502
503void convertclass::reset ()
504{
505 // nothing to do
506}
507
508
509////////////////////////////////////
510// inconvertclass methods
511////////////////////////////////////
512
// convert from a char stream to the text_t class
// the default version assumes the input is an ascii
// character array
516
517inconvertclass::inconvertclass ()
518{
519 start = NULL;
520 len = 0;
521}
522
523
524void inconvertclass::reset ()
525{
526 start = NULL;
527 len = 0;
528}
529
530void inconvertclass::setinput (char *thestart, size_t thelen)
531{
532 start = thestart;
533 len = thelen;
534}
535
536void inconvertclass::convert (text_t &output, status_t &status)
537{
538 output.clear();
539
540 if (start == NULL || len == 0)
541 {
542 status = finished;
543 return;
544 }
545
546 // don't want any funny sign conversions happening
547 unsigned char *here = (unsigned char *)start;
548 while (len > 0)
549 {
550 output.push_back (*here); // append this character
551 ++here;
552 --len;
553 }
554
555 start = (char *)here; // save current position
556 status = finished;
557}
558
559// will treat the text_t as a 8-bit string and convert
560// it to a 16-bit string using the about convert method.
561text_t inconvertclass::convert (const text_t &t) {
562 text_t out;
563 text_t tmpout;
564 status_t status;
565 text_t::const_iterator here = t.begin();
566 text_t::const_iterator end = t.end();
567 unsigned char cbuf[256];
568 size_t cbuflen = 0;
569
570 while (here != end) {
571 while (here != end && cbuflen < 256) {
572 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
573 here++;
574 }
575
576 if (cbuflen > 0) {
577 setinput ((char *)cbuf, cbuflen);
578 status = unfinished;
579 while (status == unfinished) {
580 convert (tmpout, status);
581 out += tmpout;
582 }
583 cbuflen = 0;
584 }
585 }
586
587 out.setencoding (0); // unicode
588
589 return out;
590}
591
592// an instance of the default inconvertclass to do simple
593// conversions. Note that any functions that use this are
594// not reentrant. If a function needs to be reentrant it
595// should declare its own instance.
596inconvertclass ascii2text_t;
597
598
599////////////////////////////////////
600// outconvertclass methods
601////////////////////////////////////
602
603// Convert from a text_t class to a char stream
// This default version assumes the output is an ascii
605// character array. If you set the output stream you
606// can use this class to output to a stream using the
607// << operator. The << operator can also be conveniently
608// used to set the output stream by doing something like
609//
610// cout << text_t2ascii << text_tstr << anothertext_tstr;
611//
612outconvertclass::outconvertclass ()
613{
614 input = NULL;
615 outs = NULL;
616}
617
618void outconvertclass::reset ()
619{
620 input = NULL;
621 outs = NULL;
622}
623
624void outconvertclass::setinput (text_t *theinput)
625{
626 input = theinput;
627 if (input != NULL) texthere = input->begin();
628}
629
630void outconvertclass::convert (char *output, size_t maxlen,
631 size_t &len, status_t &status)
632{
633 if (input == NULL || output == NULL)
634 {
635 status = finished;
636 return;
637 }
638
639 // don't want any funny sign conversions happening
640 unsigned char *uoutput = (unsigned char *)output;
641 text_t::iterator textend = input->end();
642 len = 0;
643 while ((len < maxlen) && (texthere != textend))
644 {
645 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
646 else {
647 // put a space or a question mark depending on what
648 // the character is. Question marks tell the user that
649 // they are missing some information.
650 if (is_unicode_space (*texthere)) *uoutput = ' ';
651 else *uoutput = '?';
652 }
653 ++uoutput;
654 ++len;
655 ++texthere;
656 }
657
658 if (texthere == textend) status = finished;
659 else status = unfinished;
660}
661
662// will convert the 16-bit string to a 8-bit stream
663// and place the result in a text_t. This method uses
664// the above convert function.
665text_t outconvertclass::convert (const text_t &t) {
666 text_t out;
667 unsigned char cbuf[256];
668 size_t cbuflen = 0;
669 status_t status = unfinished;
670
671 setinput ((text_t *)&t); // discard constant
672 while (status == unfinished) {
673 convert ((char *)cbuf, 256, cbuflen, status);
674 out.appendcarr ((char *)cbuf, cbuflen);
675 }
676
677 out.setencoding (1); // other encoding
678
679 return out;
680}
681
682
683void outconvertclass::setostream (ostream *theouts)
684{
685 outs = theouts;
686}
687
688ostream *outconvertclass::getostream ()
689{
690 return outs;
691}
692
693
694
695
696// an instance of the default outconvertclass to do simple
697// conversions
698outconvertclass text_t2ascii;
699
700
701
702// stream operators for the output class
703
704outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
705{
706 outconverter.setostream(&theouts);
707 return outconverter;
708}
709
710
711#define STREAMBUFSIZE 256
712outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
713{
714 ostream *outstream = outconverter.getostream();
715
716 if (outstream == NULL) return outconverter;
717
718 char outbuf[STREAMBUFSIZE];
719 size_t len;
720 outconvertclass::status_t status = outconvertclass::unfinished;
721
722 // assume that there is no data needing converting
723 // left in the converter
724 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
725
726 while (status == outconvertclass::unfinished)
727 {
728 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
729 if (len > 0) outstream->write(outbuf, len);
730 }
731
732 return outconverter;
733}
Note: See TracBrowser for help on using the repository browser.