source: trunk/gsdl/lib/text_t.cpp@ 1076

Last change on this file since 1076 was 1076, checked in by cs025, 24 years ago

Correcting a correction - reinstated all lib files due to silly
CVS confusion.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 1076 2000-04-06 19:58:04Z cs025 $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.17 2000/04/06 19:58:03 cs025
31 Correcting a correction - reinstated all lib files due to silly
32 CVS confusion.
33
34 Revision 1.15 1999/10/14 22:52:39 sjboddie
35 joinchar can join using text_t string now too
36
37 Revision 1.14 1999/09/24 02:30:03 rjmcnab
38 added function has_unicode_letdig
39
40 Revision 1.13 1999/09/07 04:57:43 sjboddie
41 added gpl notice
42
43 Revision 1.12 1999/08/31 08:04:41 rjmcnab
44 Fixed a small but hard to find bug in getcarr
45
46 Revision 1.11 1999/07/01 04:05:09 rjmcnab
47 Optimised append functions slightly and added a reserve function.
48
49 Revision 1.10 1999/04/26 03:58:03 sjboddie
50 added is_number function
51
52 Revision 1.9 1999/04/06 22:17:24 rjmcnab
53 Added splits and joins using text_tset.
54
55 Revision 1.8 1999/02/28 23:14:41 rjmcnab
56
57 Added uc and lc to convert to uppercase and lowercase.
58
59 Revision 1.7 1999/02/21 22:26:39 rjmcnab
60
61 Made getint() a constant function.
62
63 Revision 1.6 1999/02/03 01:13:26 sjboddie
64
65 Got interface to handle subcollections and language subcollections -
66 committed changes made to some of the collections
67
68 Revision 1.5 1999/01/19 01:38:14 rjmcnab
69
70 Made the source more portable.
71
72 Revision 1.4 1999/01/12 01:51:00 rjmcnab
73
74 Standard header.
75
76 Revision 1.3 1999/01/08 02:33:16 rjmcnab
77
78 Added standard header to source files.
79
80 */
81
82
83#include "text_t.h"
84
85#if defined(GSDL_USE_OBJECTSPACE)
86# include <ospace\std\algorithm>
87#elif defined(GSDL_USE_STL_H)
88# if defined(GSDL_USE_ALGO_H)
89# include <algo.h>
90# else
91# include <algorithm.h>
92# endif
93#else
94# include <algorithm>
95#endif
96
97
98#include "unitool.h"
99
100////////////////////////////////////
101// text_t methods
102////////////////////////////////////
103
104text_t::text_t ()
105{
106 setencoding(0);
107 clear ();
108}
109
110text_t::text_t (int i)
111{
112 setencoding(0);
113 clear ();
114 appendint (i);
115}
116
117text_t::text_t (char *s)
118{
119 setencoding(0);
120 clear ();
121 appendcstr (s);
122}
123
124void text_t::append (const text_t &t)
125{
126 text.insert(text.end(), t.begin(), t.end());
127 // const_iterator here, end=t.end();
128 // for (here=t.begin(); here!=end;here++)
129 // {
130 // text.push_back(*here);
131 // }
132}
133
134void text_t::appendrange (iterator first, iterator last)
135{
136 text.insert(text.end(), first, last);
137 // while (first != last)
138 // {
139 // text.push_back (*first);
140 // first++;
141 // }
142}
143
144void text_t::appendrange (const_iterator first, const_iterator last)
145{
146 text.insert(text.end(), first, last);
147 // while (first != last)
148 // {
149 // text.push_back (*first);
150 // first++;
151 // }
152}
153
154void text_t::appendint (int i)
155{
156 // deal with zeros and negatives
157 if (i == 0)
158 {
159 text.push_back('0');
160 return;
161 }
162 else if (i < 0)
163 {
164 text.push_back('-');
165 i *= -1;
166 }
167
168 // get a buffer for the conversion
169 int maxbuflen = sizeof(int)*3;
170 char *buf = new char[maxbuflen];
171 int len = 0;
172
173 // get the number in reverse
174 while (i > 0)
175 {
176 buf[len++] = '0'+ (i%10);
177 i = i/10;
178 }
179
180 // reverse the number
181 while (len > 0)
182 {
183 text.push_back(buf[--len]);
184 }
185
186 delete buf;
187}
188
189int text_t::getint () const
190{
191 int i = 0;
192 int mult = 1; // become -1 for negative numbers
193
194 const_iterator here = text.begin();
195 const_iterator end = text.end();
196
197 // do plus and minus signs
198 if (here != end)
199 {
200 if (*here == '-')
201 {
202 mult = -1;
203 here++;
204 }
205 else if (*here == '+')
206 {
207 mult = 1;
208 here++;
209 }
210 }
211
212 // deal with the number
213 while ((here != end) && (*here >= '0') && (*here <= '9'))
214 {
215 i = 10*i + (*here - '0');
216 here++;
217 }
218
219 i *= mult;
220 return i;
221}
222
223
224
225void text_t::appendcarr (char *s, size_type len)
226{
227 unsigned char *us = (unsigned char *)s;
228 while (len > 0)
229 {
230 text.push_back (*us); // append this character
231 us++;
232 len--;
233 }
234}
235
236void text_t::appendcstr (char *s)
237{
238 unsigned char *us = (unsigned char *)s;
239 while (*us != '\0')
240 {
241 text.push_back (*us); // append this character
242 us++;
243 }
244}
245
246
// strings returned from getcarr and getcstr become the caller's
// responsibility; they are allocated with "new []" and should be
// deallocated with "delete []"
249
250char *text_t::getcarr(size_type &len) const
251{
252 unsigned char *cstr = new unsigned char[size()];
253 len = 0;
254
255 const_iterator ithere = begin();
256 const_iterator itend = end();
257 while (ithere != itend)
258 {
259 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
260 else {
261 // put a space or a question mark depending on what
262 // the character is. Question marks tell the user that
263 // they are missing some information.
264 if (is_unicode_space (*ithere)) cstr[len] = ' ';
265 else cstr[len] = '?';
266 }
267 len++;
268 ithere++;
269 }
270
271 return (char *)cstr;
272}
273
274char *text_t::getcstr() const
275{
276 unsigned char *cstr = new unsigned char[size() + 1];
277 const_iterator ithere = begin();
278 const_iterator itend = end();
279 int len = 0;
280
281 while (ithere != itend)
282 {
283 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
284 else {
285 // put a space or a question mark depending on what
286 // the character is. Question marks tell the user that
287 // they are missing some information.
288 if (is_unicode_space (*ithere)) cstr[len] = ' ';
289 else cstr[len] = '?';
290 }
291 len++;
292 ithere++;
293 }
294
295 cstr[len] = '\0';
296
297 return (char *)cstr;
298}
299
300
301// general functions which work on text_ts
302
303// find a character within a range
304text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
305 unsigned short c)
306{
307 while (first != last)
308 {
309 if (*first == c) break;
310 first++;
311 }
312 return first;
313}
314
315text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
316 unsigned short c)
317{
318 while (first != last)
319 {
320 if (*first == c) break;
321 first++;
322 }
323 return first;
324}
325
326// get a string up to the next delimiter (which is skipped)
327text_t::const_iterator getdelimitstr (text_t::const_iterator first,
328 text_t::const_iterator last,
329 unsigned short c, text_t &outstr)
330{
331 text_t::const_iterator here = first;
332 here = findchar (first, last, c);
333 outstr.clear();
334 outstr.appendrange (first, here);
335 if (here != last) here++; // skip c
336 return here;
337}
338
339text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
340 unsigned short c, text_t &outstr)
341{
342 text_t::iterator here = first;
343 here = findchar (first, last, c);
344 outstr.clear();
345 outstr.appendrange (first, here);
346 if (here != last) here++; // skip c
347 return here;
348}
349
350// split a string with a character
351void splitchar (text_t::const_iterator first, text_t::const_iterator last,
352 unsigned short c, text_tset &outlist)
353{
354 outlist.erase(outlist.begin(), outlist.end());
355
356 text_t t;
357
358 while (first != last)
359 {
360 first = getdelimitstr (first, last, c, t);
361 outlist.insert (t);
362 }
363}
364
365void splitchar (text_t::const_iterator first, text_t::const_iterator last,
366 unsigned short c, text_tlist &outlist)
367{
368 outlist.erase(outlist.begin(), outlist.end());
369
370 text_t t;
371
372 while (first != last)
373 {
374 first = getdelimitstr (first, last, c, t);
375 outlist.push_back (t);
376 }
377}
378
379void splitchar (text_t::const_iterator first, text_t::const_iterator last,
380 unsigned short c, text_tarray &outlist)
381{
382 outlist.erase(outlist.begin(), outlist.end());
383
384 text_t t;
385
386 while (first != last)
387 {
388 first = getdelimitstr (first, last, c, t);
389 outlist.push_back (t);
390 }
391}
392
393// join a string using a character
394void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
395{
396 outtext.clear ();
397
398 text_tset::const_iterator here = inlist.begin ();
399 text_tset::const_iterator end = inlist.end ();
400 bool first = true;
401 while (here != end)
402 {
403 if (!first) outtext.push_back (c);
404 first = false;
405 outtext += *here;
406 here++;
407 }
408}
409
410void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
411{
412 outtext.clear ();
413
414 text_tlist::const_iterator here = inlist.begin ();
415 text_tlist::const_iterator end = inlist.end ();
416 bool first = true;
417 while (here != end)
418 {
419 if (!first) outtext.push_back (c);
420 first = false;
421 outtext += *here;
422 here++;
423 }
424}
425
426void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
427{
428 outtext.clear ();
429
430 text_tarray::const_iterator here = inlist.begin ();
431 text_tarray::const_iterator end = inlist.end ();
432 bool first = true;
433 while (here != end)
434 {
435 if (!first) outtext.push_back (c);
436 first = false;
437 outtext += *here;
438 here++;
439 }
440}
441
442void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
443{
444 outtext.clear ();
445
446 text_tarray::const_iterator here = inlist.begin ();
447 text_tarray::const_iterator end = inlist.end ();
448 bool first = true;
449 while (here != end)
450 {
451 if (!first) outtext += c;
452 first = false;
453 outtext += *here;
454 here++;
455 }
456}
457
458// count the occurances of a character within a range
459int countchar (text_t::const_iterator first, text_t::const_iterator last,
460 unsigned short c)
461{
462 int count = 0;
463 while (first != last) {
464 if (*first == c) count ++;
465 first ++;
466 }
467 return count;
468}
469
470// return a substring of string from first up to but not including last
471text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
472
473 text_t substr;
474 while (first != last) {
475 substr.push_back(*first);
476 first ++;
477 }
478 return substr;
479}
480
481
482// convert to lowercase
483void lc (text_t::iterator first, text_t::iterator last) {
484 while (first != last) {
485 *first = unicode_tolower(*first);
486 first++;
487 }
488}
489
490// convert to uppercase
491void uc (text_t::iterator first, text_t::iterator last) {
492 while (first != last) {
493 *first = unicode_toupper(*first);
494 first++;
495 }
496}
497
498
499// checks to see if it is a number (i.e. contains only 0-9)
500bool is_number (const text_t &text) {
501
502 text_t::const_iterator here = text.begin();
503 text_t::const_iterator end = text.end();
504
505 while (here != end) {
506 if ((*here!='0') && (*here!='1') && (*here!='2') &&
507 (*here!='3') && (*here!='4') && (*here!='5') &&
508 (*here!='6') && (*here!='7') && (*here!='8') &&
509 (*here!='9')) return false;
510 here ++;
511 }
512 return true;
513}
514
515
516// checks to see if the text has any letters or digits
517bool has_unicode_letdig (const text_t &text) {
518 if (text.empty()) return false;
519
520 text_t::const_iterator here = text.begin();
521 text_t::const_iterator end = text.end();
522 while (here != end) {
523 if (is_unicode_letdig (*here)) return true;
524 here++;
525 }
526
527 return false;
528}
529
530
531
532////////////////////////////////////
533// convertclass methods
534////////////////////////////////////
535
536// conversion classes used for getting information in to and out of
537// the text_t class.
538
539convertclass::convertclass ()
540{
541 // nothing to do
542}
543
544void convertclass::reset ()
545{
546 // nothing to do
547}
548
549
550////////////////////////////////////
551// inconvertclass methods
552////////////////////////////////////
553
554// convert from a char stream to the text_t class
555// the default version assumes the input is a ascii
556// character array
557
558inconvertclass::inconvertclass ()
559{
560 start = NULL;
561 len = 0;
562}
563
564
565void inconvertclass::reset ()
566{
567 start = NULL;
568 len = 0;
569}
570
571void inconvertclass::setinput (char *thestart, size_t thelen)
572{
573 start = thestart;
574 len = thelen;
575}
576
577void inconvertclass::convert (text_t &output, status_t &status)
578{
579 output.clear();
580
581 if (start == NULL || len == 0)
582 {
583 status = finished;
584 return;
585 }
586
587 // don't want any funny sign conversions happening
588 unsigned char *here = (unsigned char *)start;
589 while (len > 0)
590 {
591 output.push_back (*here); // append this character
592 ++here;
593 --len;
594 }
595
596 start = (char *)here; // save current position
597 status = finished;
598}
599
600// will treat the text_t as a 8-bit string and convert
601// it to a 16-bit string using the about convert method.
602text_t inconvertclass::convert (const text_t &t) {
603 text_t out;
604 text_t tmpout;
605 status_t status;
606 text_t::const_iterator here = t.begin();
607 text_t::const_iterator end = t.end();
608 unsigned char cbuf[256];
609 size_t cbuflen = 0;
610
611 while (here != end) {
612 while (here != end && cbuflen < 256) {
613 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
614 here++;
615 }
616
617 if (cbuflen > 0) {
618 setinput ((char *)cbuf, cbuflen);
619 status = unfinished;
620 while (status == unfinished) {
621 convert (tmpout, status);
622 out += tmpout;
623 }
624 cbuflen = 0;
625 }
626 }
627
628 out.setencoding (0); // unicode
629
630 return out;
631}
632
633// an instance of the default inconvertclass to do simple
634// conversions. Note that any functions that use this are
635// not reentrant. If a function needs to be reentrant it
636// should declare its own instance.
637inconvertclass ascii2text_t;
638
639
640////////////////////////////////////
641// outconvertclass methods
642////////////////////////////////////
643
644// Convert from a text_t class to a char stream
645// This default version assumes the output is a ascii
646// character array. If you set the output stream you
647// can use this class to output to a stream using the
648// << operator. The << operator can also be conveniently
649// used to set the output stream by doing something like
650//
651// cout << text_t2ascii << text_tstr << anothertext_tstr;
652//
653outconvertclass::outconvertclass ()
654{
655 input = NULL;
656 outs = NULL;
657}
658
659void outconvertclass::reset ()
660{
661 input = NULL;
662 outs = NULL;
663}
664
665void outconvertclass::setinput (text_t *theinput)
666{
667 input = theinput;
668 if (input != NULL) texthere = input->begin();
669}
670
671void outconvertclass::convert (char *output, size_t maxlen,
672 size_t &len, status_t &status)
673{
674 if (input == NULL || output == NULL)
675 {
676 status = finished;
677 return;
678 }
679
680 // don't want any funny sign conversions happening
681 unsigned char *uoutput = (unsigned char *)output;
682 text_t::iterator textend = input->end();
683 len = 0;
684 while ((len < maxlen) && (texthere != textend))
685 {
686 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
687 else {
688 // put a space or a question mark depending on what
689 // the character is. Question marks tell the user that
690 // they are missing some information.
691 if (is_unicode_space (*texthere)) *uoutput = ' ';
692 else *uoutput = '?';
693 }
694 ++uoutput;
695 ++len;
696 ++texthere;
697 }
698
699 if (texthere == textend) status = finished;
700 else status = unfinished;
701}
702
703// will convert the 16-bit string to a 8-bit stream
704// and place the result in a text_t. This method uses
705// the above convert function.
706text_t outconvertclass::convert (const text_t &t) {
707 text_t out;
708 unsigned char cbuf[256];
709 size_t cbuflen = 0;
710 status_t status = unfinished;
711
712 setinput ((text_t *)&t); // discard constant
713 while (status == unfinished) {
714 convert ((char *)cbuf, 256, cbuflen, status);
715 out.appendcarr ((char *)cbuf, cbuflen);
716 }
717
718 out.setencoding (1); // other encoding
719
720 return out;
721}
722
723
724void outconvertclass::setostream (ostream *theouts)
725{
726 outs = theouts;
727}
728
729ostream *outconvertclass::getostream ()
730{
731 return outs;
732}
733
734
735
736
737// an instance of the default outconvertclass to do simple
738// conversions
739outconvertclass text_t2ascii;
740
741
742
743// stream operators for the output class
744
745outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
746{
747 outconverter.setostream(&theouts);
748 return outconverter;
749}
750
751
752#define STREAMBUFSIZE 256
753outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
754{
755 ostream *outstream = outconverter.getostream();
756
757 if (outstream == NULL) return outconverter;
758
759 char outbuf[STREAMBUFSIZE];
760 size_t len;
761 outconvertclass::status_t status = outconvertclass::unfinished;
762
763 // assume that there is no data needing converting
764 // left in the converter
765 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
766
767 while (status == outconvertclass::unfinished)
768 {
769 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
770 if (len > 0) outstream->write(outbuf, len);
771 }
772
773 return outconverter;
774}
Note: See TracBrowser for help on using the repository browser.