source: branches/New_Config_Format-branch/gsdl/lib/text_t.cpp@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.6 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 1279 2000-07-12 22:21:53Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.17.2.1 2000/07/12 22:20:56 sjboddie
31 merged changes to trunk into New_Config_Format branch
32
33 Revision 1.18 2000/04/14 02:50:12 sjboddie
34 added text_t versions of joinchar to work with sets and lists
35
36 Revision 1.17 2000/04/06 19:58:03 cs025
37 Correcting a correction - reinstated all lib files due to silly
38 CVS confusion.
39
40 Revision 1.15 1999/10/14 22:52:39 sjboddie
41 joinchar can join using text_t string now too
42
43 Revision 1.14 1999/09/24 02:30:03 rjmcnab
44 added function has_unicode_letdig
45
46 Revision 1.13 1999/09/07 04:57:43 sjboddie
47 added gpl notice
48
49 Revision 1.12 1999/08/31 08:04:41 rjmcnab
50 Fixed a small but hard to find bug in getcarr
51
52 Revision 1.11 1999/07/01 04:05:09 rjmcnab
53 Optimised append functions slightly and added a reserve function.
54
55 Revision 1.10 1999/04/26 03:58:03 sjboddie
56 added is_number function
57
58 Revision 1.9 1999/04/06 22:17:24 rjmcnab
59 Added splits and joins using text_tset.
60
61 Revision 1.8 1999/02/28 23:14:41 rjmcnab
62
63 Added uc and lc to convert to uppercase and lowercase.
64
65 Revision 1.7 1999/02/21 22:26:39 rjmcnab
66
67 Made getint() a constant function.
68
69 Revision 1.6 1999/02/03 01:13:26 sjboddie
70
71 Got interface to handle subcollections and language subcollections -
72 committed changes made to some of the collections
73
74 Revision 1.5 1999/01/19 01:38:14 rjmcnab
75
76 Made the source more portable.
77
78 Revision 1.4 1999/01/12 01:51:00 rjmcnab
79
80 Standard header.
81
82 Revision 1.3 1999/01/08 02:33:16 rjmcnab
83
84 Added standard header to source files.
85
86 */
87
88
89#include "text_t.h"
90
91#if defined(GSDL_USE_OBJECTSPACE)
92# include <ospace\std\algorithm>
93#elif defined(GSDL_USE_STL_H)
94# if defined(GSDL_USE_ALGO_H)
95# include <algo.h>
96# else
97# include <algorithm.h>
98# endif
99#else
100# include <algorithm>
101#endif
102
103
104#include "unitool.h"
105
106////////////////////////////////////
107// text_t methods
108////////////////////////////////////
109
110text_t::text_t ()
111{
112 setencoding(0);
113 clear ();
114}
115
116text_t::text_t (int i)
117{
118 setencoding(0);
119 clear ();
120 appendint (i);
121}
122
123text_t::text_t (char *s)
124{
125 setencoding(0);
126 clear ();
127 appendcstr (s);
128}
129
130void text_t::append (const text_t &t)
131{
132 text.insert(text.end(), t.begin(), t.end());
133 // const_iterator here, end=t.end();
134 // for (here=t.begin(); here!=end;here++)
135 // {
136 // text.push_back(*here);
137 // }
138}
139
140void text_t::appendrange (iterator first, iterator last)
141{
142 text.insert(text.end(), first, last);
143 // while (first != last)
144 // {
145 // text.push_back (*first);
146 // first++;
147 // }
148}
149
150void text_t::appendrange (const_iterator first, const_iterator last)
151{
152 text.insert(text.end(), first, last);
153 // while (first != last)
154 // {
155 // text.push_back (*first);
156 // first++;
157 // }
158}
159
160void text_t::appendint (int i)
161{
162 // deal with zeros and negatives
163 if (i == 0)
164 {
165 text.push_back('0');
166 return;
167 }
168 else if (i < 0)
169 {
170 text.push_back('-');
171 i *= -1;
172 }
173
174 // get a buffer for the conversion
175 int maxbuflen = sizeof(int)*3;
176 char *buf = new char[maxbuflen];
177 int len = 0;
178
179 // get the number in reverse
180 while (i > 0)
181 {
182 buf[len++] = '0'+ (i%10);
183 i = i/10;
184 }
185
186 // reverse the number
187 while (len > 0)
188 {
189 text.push_back(buf[--len]);
190 }
191
192 delete buf;
193}
194
195int text_t::getint () const
196{
197 int i = 0;
198 int mult = 1; // become -1 for negative numbers
199
200 const_iterator here = text.begin();
201 const_iterator end = text.end();
202
203 // do plus and minus signs
204 if (here != end)
205 {
206 if (*here == '-')
207 {
208 mult = -1;
209 here++;
210 }
211 else if (*here == '+')
212 {
213 mult = 1;
214 here++;
215 }
216 }
217
218 // deal with the number
219 while ((here != end) && (*here >= '0') && (*here <= '9'))
220 {
221 i = 10*i + (*here - '0');
222 here++;
223 }
224
225 i *= mult;
226 return i;
227}
228
229
230
231void text_t::appendcarr (char *s, size_type len)
232{
233 unsigned char *us = (unsigned char *)s;
234 while (len > 0)
235 {
236 text.push_back (*us); // append this character
237 us++;
238 len--;
239 }
240}
241
242void text_t::appendcstr (char *s)
243{
244 unsigned char *us = (unsigned char *)s;
245 while (*us != '\0')
246 {
247 text.push_back (*us); // append this character
248 us++;
249 }
250}
251
252
253// strings returned from getcarr and getcstr become the caller's
254// responsibility and should be deallocated with "delete []"
255
256char *text_t::getcarr(size_type &len) const
257{
258 unsigned char *cstr = new unsigned char[size()];
259 len = 0;
260
261 const_iterator ithere = begin();
262 const_iterator itend = end();
263 while (ithere != itend)
264 {
265 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
266 else {
267 // put a space or a question mark depending on what
268 // the character is. Question marks tell the user that
269 // they are missing some information.
270 if (is_unicode_space (*ithere)) cstr[len] = ' ';
271 else cstr[len] = '?';
272 }
273 len++;
274 ithere++;
275 }
276
277 return (char *)cstr;
278}
279
280char *text_t::getcstr() const
281{
282 unsigned char *cstr = new unsigned char[size() + 1];
283 const_iterator ithere = begin();
284 const_iterator itend = end();
285 int len = 0;
286
287 while (ithere != itend)
288 {
289 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
290 else {
291 // put a space or a question mark depending on what
292 // the character is. Question marks tell the user that
293 // they are missing some information.
294 if (is_unicode_space (*ithere)) cstr[len] = ' ';
295 else cstr[len] = '?';
296 }
297 len++;
298 ithere++;
299 }
300
301 cstr[len] = '\0';
302
303 return (char *)cstr;
304}
305
306
307// general functions which work on text_ts
308
309// find a character within a range
310text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
311 unsigned short c)
312{
313 while (first != last)
314 {
315 if (*first == c) break;
316 first++;
317 }
318 return first;
319}
320
321text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
322 unsigned short c)
323{
324 while (first != last)
325 {
326 if (*first == c) break;
327 first++;
328 }
329 return first;
330}
331
332// get a string up to the next delimiter (which is skipped)
333text_t::const_iterator getdelimitstr (text_t::const_iterator first,
334 text_t::const_iterator last,
335 unsigned short c, text_t &outstr)
336{
337 text_t::const_iterator here = first;
338 here = findchar (first, last, c);
339 outstr.clear();
340 outstr.appendrange (first, here);
341 if (here != last) here++; // skip c
342 return here;
343}
344
345text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
346 unsigned short c, text_t &outstr)
347{
348 text_t::iterator here = first;
349 here = findchar (first, last, c);
350 outstr.clear();
351 outstr.appendrange (first, here);
352 if (here != last) here++; // skip c
353 return here;
354}
355
356// split a string with a character
357void splitchar (text_t::const_iterator first, text_t::const_iterator last,
358 unsigned short c, text_tset &outlist)
359{
360 outlist.erase(outlist.begin(), outlist.end());
361
362 text_t t;
363
364 while (first != last)
365 {
366 first = getdelimitstr (first, last, c, t);
367 outlist.insert (t);
368 }
369}
370
371void splitchar (text_t::const_iterator first, text_t::const_iterator last,
372 unsigned short c, text_tlist &outlist)
373{
374 outlist.erase(outlist.begin(), outlist.end());
375
376 text_t t;
377
378 while (first != last)
379 {
380 first = getdelimitstr (first, last, c, t);
381 outlist.push_back (t);
382 }
383}
384
385void splitchar (text_t::const_iterator first, text_t::const_iterator last,
386 unsigned short c, text_tarray &outlist)
387{
388 outlist.erase(outlist.begin(), outlist.end());
389
390 text_t t;
391
392 while (first != last)
393 {
394 first = getdelimitstr (first, last, c, t);
395 outlist.push_back (t);
396 }
397}
398
399// join a string using a character
400void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
401{
402 outtext.clear ();
403
404 text_tset::const_iterator here = inlist.begin ();
405 text_tset::const_iterator end = inlist.end ();
406 bool first = true;
407 while (here != end)
408 {
409 if (!first) outtext.push_back (c);
410 first = false;
411 outtext += *here;
412 here++;
413 }
414}
415
416void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
417{
418 outtext.clear ();
419
420 text_tlist::const_iterator here = inlist.begin ();
421 text_tlist::const_iterator end = inlist.end ();
422 bool first = true;
423 while (here != end)
424 {
425 if (!first) outtext.push_back (c);
426 first = false;
427 outtext += *here;
428 here++;
429 }
430}
431
432void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
433{
434 outtext.clear ();
435
436 text_tarray::const_iterator here = inlist.begin ();
437 text_tarray::const_iterator end = inlist.end ();
438 bool first = true;
439 while (here != end)
440 {
441 if (!first) outtext.push_back (c);
442 first = false;
443 outtext += *here;
444 here++;
445 }
446}
447
448void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
449{
450 outtext.clear ();
451
452 text_tlist::const_iterator here = inlist.begin ();
453 text_tlist::const_iterator end = inlist.end ();
454 bool first = true;
455 while (here != end)
456 {
457 if (!first) outtext += c;
458 first = false;
459 outtext += *here;
460 here++;
461 }
462}
463
464void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
465{
466 outtext.clear ();
467
468 text_tset::const_iterator here = inlist.begin ();
469 text_tset::const_iterator end = inlist.end ();
470 bool first = true;
471 while (here != end)
472 {
473 if (!first) outtext += c;
474 first = false;
475 outtext += *here;
476 here++;
477 }
478}
479
480void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
481{
482 outtext.clear ();
483
484 text_tarray::const_iterator here = inlist.begin ();
485 text_tarray::const_iterator end = inlist.end ();
486 bool first = true;
487 while (here != end)
488 {
489 if (!first) outtext += c;
490 first = false;
491 outtext += *here;
492 here++;
493 }
494}
495
496// count the occurances of a character within a range
497int countchar (text_t::const_iterator first, text_t::const_iterator last,
498 unsigned short c)
499{
500 int count = 0;
501 while (first != last) {
502 if (*first == c) count ++;
503 first ++;
504 }
505 return count;
506}
507
508// return a substring of string from first up to but not including last
509text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
510
511 text_t substr;
512 while (first != last) {
513 substr.push_back(*first);
514 first ++;
515 }
516 return substr;
517}
518
519
520// convert to lowercase
521void lc (text_t::iterator first, text_t::iterator last) {
522 while (first != last) {
523 *first = unicode_tolower(*first);
524 first++;
525 }
526}
527
528// convert to uppercase
529void uc (text_t::iterator first, text_t::iterator last) {
530 while (first != last) {
531 *first = unicode_toupper(*first);
532 first++;
533 }
534}
535
536
537// checks to see if it is a number (i.e. contains only 0-9)
538bool is_number (const text_t &text) {
539
540 text_t::const_iterator here = text.begin();
541 text_t::const_iterator end = text.end();
542
543 while (here != end) {
544 if ((*here!='0') && (*here!='1') && (*here!='2') &&
545 (*here!='3') && (*here!='4') && (*here!='5') &&
546 (*here!='6') && (*here!='7') && (*here!='8') &&
547 (*here!='9')) return false;
548 here ++;
549 }
550 return true;
551}
552
553
554// checks to see if the text has any letters or digits
555bool has_unicode_letdig (const text_t &text) {
556 if (text.empty()) return false;
557
558 text_t::const_iterator here = text.begin();
559 text_t::const_iterator end = text.end();
560 while (here != end) {
561 if (is_unicode_letdig (*here)) return true;
562 here++;
563 }
564
565 return false;
566}
567
568
569
570////////////////////////////////////
571// convertclass methods
572////////////////////////////////////
573
574// conversion classes used for getting information in to and out of
575// the text_t class.
576
577convertclass::convertclass ()
578{
579 // nothing to do
580}
581
582void convertclass::reset ()
583{
584 // nothing to do
585}
586
587
588////////////////////////////////////
589// inconvertclass methods
590////////////////////////////////////
591
592// Convert from a char stream to the text_t class.
593// The default version assumes the input is an ASCII
594// character array.
595
596inconvertclass::inconvertclass ()
597{
598 start = NULL;
599 len = 0;
600}
601
602
603void inconvertclass::reset ()
604{
605 start = NULL;
606 len = 0;
607}
608
609void inconvertclass::setinput (char *thestart, size_t thelen)
610{
611 start = thestart;
612 len = thelen;
613}
614
615void inconvertclass::convert (text_t &output, status_t &status)
616{
617 output.clear();
618
619 if (start == NULL || len == 0)
620 {
621 status = finished;
622 return;
623 }
624
625 // don't want any funny sign conversions happening
626 unsigned char *here = (unsigned char *)start;
627 while (len > 0)
628 {
629 output.push_back (*here); // append this character
630 ++here;
631 --len;
632 }
633
634 start = (char *)here; // save current position
635 status = finished;
636}
637
638// will treat the text_t as a 8-bit string and convert
639// it to a 16-bit string using the about convert method.
640text_t inconvertclass::convert (const text_t &t) {
641 text_t out;
642 text_t tmpout;
643 status_t status;
644 text_t::const_iterator here = t.begin();
645 text_t::const_iterator end = t.end();
646 unsigned char cbuf[256];
647 size_t cbuflen = 0;
648
649 while (here != end) {
650 while (here != end && cbuflen < 256) {
651 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
652 here++;
653 }
654
655 if (cbuflen > 0) {
656 setinput ((char *)cbuf, cbuflen);
657 status = unfinished;
658 while (status == unfinished) {
659 convert (tmpout, status);
660 out += tmpout;
661 }
662 cbuflen = 0;
663 }
664 }
665
666 out.setencoding (0); // unicode
667
668 return out;
669}
670
671// an instance of the default inconvertclass to do simple
672// conversions. Note that any functions that use this are
673// not reentrant. If a function needs to be reentrant it
674// should declare its own instance.
675inconvertclass ascii2text_t;
676
677
678////////////////////////////////////
679// outconvertclass methods
680////////////////////////////////////
681
682// Convert from a text_t class to a char stream
683// This default version assumes the output is an ASCII
684// character array. If you set the output stream you
685// can use this class to output to a stream using the
686// << operator. The << operator can also be conveniently
687// used to set the output stream by doing something like
688//
689// cout << text_t2ascii << text_tstr << anothertext_tstr;
690//
691outconvertclass::outconvertclass ()
692{
693 input = NULL;
694 outs = NULL;
695}
696
697void outconvertclass::reset ()
698{
699 input = NULL;
700 outs = NULL;
701}
702
703void outconvertclass::setinput (text_t *theinput)
704{
705 input = theinput;
706 if (input != NULL) texthere = input->begin();
707}
708
709void outconvertclass::convert (char *output, size_t maxlen,
710 size_t &len, status_t &status)
711{
712 if (input == NULL || output == NULL)
713 {
714 status = finished;
715 return;
716 }
717
718 // don't want any funny sign conversions happening
719 unsigned char *uoutput = (unsigned char *)output;
720 text_t::iterator textend = input->end();
721 len = 0;
722 while ((len < maxlen) && (texthere != textend))
723 {
724 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
725 else {
726 // put a space or a question mark depending on what
727 // the character is. Question marks tell the user that
728 // they are missing some information.
729 if (is_unicode_space (*texthere)) *uoutput = ' ';
730 else *uoutput = '?';
731 }
732 ++uoutput;
733 ++len;
734 ++texthere;
735 }
736
737 if (texthere == textend) status = finished;
738 else status = unfinished;
739}
740
741// will convert the 16-bit string to a 8-bit stream
742// and place the result in a text_t. This method uses
743// the above convert function.
744text_t outconvertclass::convert (const text_t &t) {
745 text_t out;
746 unsigned char cbuf[256];
747 size_t cbuflen = 0;
748 status_t status = unfinished;
749
750 setinput ((text_t *)&t); // discard constant
751 while (status == unfinished) {
752 convert ((char *)cbuf, 256, cbuflen, status);
753 out.appendcarr ((char *)cbuf, cbuflen);
754 }
755
756 out.setencoding (1); // other encoding
757
758 return out;
759}
760
761
762void outconvertclass::setostream (ostream *theouts)
763{
764 outs = theouts;
765}
766
767ostream *outconvertclass::getostream ()
768{
769 return outs;
770}
771
772
773
774
775// an instance of the default outconvertclass to do simple
776// conversions
777outconvertclass text_t2ascii;
778
779
780
781// stream operators for the output class
782
783outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
784{
785 outconverter.setostream(&theouts);
786 return outconverter;
787}
788
789
790#define STREAMBUFSIZE 256
791outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
792{
793 ostream *outstream = outconverter.getostream();
794
795 if (outstream == NULL) return outconverter;
796
797 char outbuf[STREAMBUFSIZE];
798 size_t len;
799 outconvertclass::status_t status = outconvertclass::unfinished;
800
801 // assume that there is no data needing converting
802 // left in the converter
803 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
804
805 while (status == outconvertclass::unfinished)
806 {
807 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
808 if (len > 0) outstream->write(outbuf, len);
809 }
810
811 return outconverter;
812}
Note: See TracBrowser for help on using the repository browser.