source: trunk/gsdl/lib/text_t.cpp@ 1860

Last change on this file since 1860 was 1860, checked in by cs025, 23 years ago

Included CORBA branch for first time

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.8 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 1860 2001-01-25 18:26:45Z cs025 $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.20 2001/01/25 18:26:44 cs025
31 Included CORBA branch for first time
32
33 Revision 1.15.2.2 2000/04/05 10:19:38 syeates
34 added automatic conversion to allow text_t's to be <<'ed to ostreams
35
36 Revision 1.15.2.1 2000/04/04 15:02:29 cs025
37 Corba first commit
38
39 Revision 1.15 1999/10/14 22:52:39 sjboddie
40 joinchar can join using text_t string now too
41
42 Revision 1.14 1999/09/24 02:30:03 rjmcnab
43 added function has_unicode_letdig
44
45 Revision 1.13 1999/09/07 04:57:43 sjboddie
46 added gpl notice
47
48 Revision 1.12 1999/08/31 08:04:41 rjmcnab
49 Fixed a small but hard to find bug in getcarr
50
51 Revision 1.11 1999/07/01 04:05:09 rjmcnab
52 Optimised append functions slightly and added a reserve function.
53
54 Revision 1.10 1999/04/26 03:58:03 sjboddie
55 added is_number function
56
57 Revision 1.9 1999/04/06 22:17:24 rjmcnab
58 Added splits and joins using text_tset.
59
60 Revision 1.8 1999/02/28 23:14:41 rjmcnab
61
62 Added uc and lc to convert to uppercase and lowercase.
63
64 Revision 1.7 1999/02/21 22:26:39 rjmcnab
65
66 Made getint() a constant function.
67
68 Revision 1.6 1999/02/03 01:13:26 sjboddie
69
70 Got interface to handle subcollections and language subcollections -
71 committed changes made to some of the collections
72
73 Revision 1.5 1999/01/19 01:38:14 rjmcnab
74
75 Made the source more portable.
76
77 Revision 1.4 1999/01/12 01:51:00 rjmcnab
78
79 Standard header.
80
81 Revision 1.3 1999/01/08 02:33:16 rjmcnab
82
83 Added standard header to source files.
84
85 */
86
87#include "text_t.h"
88
89#if defined(GSDL_USE_OBJECTSPACE)
90# include <ospace\std\algorithm>
91#elif defined(GSDL_USE_STL_H)
92# if defined(GSDL_USE_ALGO_H)
93# include <algo.h>
94# else
95# include <algorithm.h>
96# endif
97#else
98# include <algorithm>
99#endif
100
101#ifdef HAVE_CONFIG_H
102# ifdef __WIN32__
103# include "WIN32cfg.h"
104# else
105# include "config.h"
106# endif
107#endif
108
109
110#include "unitool.h"
111
112////////////////////////////////////
113// text_t methods
114////////////////////////////////////
115
116// new stream converter ...
117ostream& operator<< (ostream &o, const text_t text)
118{
119 text_t::const_iterator ithere = text.begin();
120 text_t::const_iterator itend = text.end();
121
122 while (ithere != itend)
123 {
124 if (*ithere < 256)
125 {
126 o << (unsigned char)(*ithere);
127 }
128 else
129 {
130 // put a space or a question mark depending on what
131 // the character is. Question marks tell the user that
132 // they are missing some information.
133 if (is_unicode_space (*ithere))
134 o << ' ';
135 else
136 o << '?';
137 }
138 ithere++;
139 }
140
141 return o;
142}
143
144text_t::text_t ()
145{
146 setencoding(0);
147 clear ();
148}
149
150text_t::text_t (int i)
151{
152 setencoding(0);
153 clear ();
154 appendint (i);
155}
156
157text_t::text_t (char *s)
158{
159 setencoding(0);
160 clear ();
161 appendcstr (s);
162}
163
164
165void text_t::append (const text_t &t)
166{
167 text.insert(text.end(), t.begin(), t.end());
168 // const_iterator here, end=t.end();
169 // for (here=t.begin(); here!=end;here++)
170 // {
171 // text.push_back(*here);
172 // }
173}
174
175void text_t::appendrange (iterator first, iterator last)
176{
177 text.insert(text.end(), first, last);
178 // while (first != last)
179 // {
180 // text.push_back (*first);
181 // first++;
182 // }
183}
184
185void text_t::appendrange (const_iterator first, const_iterator last)
186{
187 text.insert(text.end(), first, last);
188 // while (first != last)
189 // {
190 // text.push_back (*first);
191 // first++;
192 // }
193}
194
195void text_t::appendint (int i)
196{
197 // deal with zeros and negatives
198 if (i == 0)
199 {
200 text.push_back('0');
201 return;
202 }
203 else if (i < 0)
204 {
205 text.push_back('-');
206 i *= -1;
207 }
208
209 // get a buffer for the conversion
210 int maxbuflen = sizeof(int)*3;
211 char *buf = new char[maxbuflen];
212 int len = 0;
213
214 // get the number in reverse
215 while (i > 0)
216 {
217 buf[len++] = '0'+ (i%10);
218 i = i/10;
219 }
220
221 // reverse the number
222 while (len > 0)
223 {
224 text.push_back(buf[--len]);
225 }
226
227 delete buf;
228}
229
230int text_t::getint () const
231{
232 int i = 0;
233 int mult = 1; // become -1 for negative numbers
234
235 const_iterator here = text.begin();
236 const_iterator end = text.end();
237
238 // do plus and minus signs
239 if (here != end)
240 {
241 if (*here == '-')
242 {
243 mult = -1;
244 here++;
245 }
246 else if (*here == '+')
247 {
248 mult = 1;
249 here++;
250 }
251 }
252
253 // deal with the number
254 while ((here != end) && (*here >= '0') && (*here <= '9'))
255 {
256 i = 10*i + (*here - '0');
257 here++;
258 }
259
260 i *= mult;
261 return i;
262}
263
264
265
266void text_t::appendcarr (char *s, size_type len)
267{
268 unsigned char *us = (unsigned char *)s;
269 while (len > 0)
270 {
271 text.push_back (*us); // append this character
272 us++;
273 len--;
274 }
275}
276
277void text_t::appendcstr (char *s)
278{
279 unsigned char *us = (unsigned char *)s;
280 while (*us != '\0')
281 {
282 text.push_back (*us); // append this character
283 us++;
284 }
285}
286
287
// strings returned from getcarr and getcstr become the caller's
// responsibility and should be deallocated with "delete []"
290
291char *text_t::getcarr(size_type &len) const
292{
293 unsigned char *cstr = new unsigned char[size()];
294 len = 0;
295
296 const_iterator ithere = begin();
297 const_iterator itend = end();
298 while (ithere != itend)
299 {
300 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
301 else {
302 // put a space or a question mark depending on what
303 // the character is. Question marks tell the user that
304 // they are missing some information.
305 if (is_unicode_space (*ithere)) cstr[len] = ' ';
306 else cstr[len] = '?';
307 }
308 len++;
309 ithere++;
310 }
311
312 return (char *)cstr;
313}
314
315char *text_t::getcstr() const
316{
317 unsigned char *cstr = new unsigned char[size() + 1];
318 const_iterator ithere = begin();
319 const_iterator itend = end();
320 int len = 0;
321
322 while (ithere != itend)
323 {
324 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
325 else {
326 // put a space or a question mark depending on what
327 // the character is. Question marks tell the user that
328 // they are missing some information.
329 if (is_unicode_space (*ithere)) cstr[len] = ' ';
330 else cstr[len] = '?';
331 }
332 len++;
333 ithere++;
334 }
335
336 cstr[len] = '\0';
337
338 return (char *)cstr;
339}
340
341
342// general functions which work on text_ts
343
344// find a character within a range
345text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
346 unsigned short c)
347{
348 while (first != last)
349 {
350 if (*first == c) break;
351 first++;
352 }
353 return first;
354}
355
356text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
357 unsigned short c)
358{
359 while (first != last)
360 {
361 if (*first == c) break;
362 first++;
363 }
364 return first;
365}
366
367text_t::iterator findword (text_t::iterator first, text_t::iterator last,
368 const text_t& word)
369{
370 text_t::const_iterator word_begin = word.begin();
371 text_t::const_iterator word_end = word.end();
372
373 while (first != last)
374 {
375 text_t::iterator char_match = first;
376 text_t::const_iterator word_here = word_begin;
377 while (word_here!=word_end)
378 {
379 if (*char_match != *word_here)
380 {
381 break;
382 }
383 char_match++;
384 word_here++;
385 }
386 if (word_here==word_end)
387 {
388 return first;
389 }
390 first++;
391 }
392 return last; // get to here only if there is no match
393}
394
395// get a string up to the next delimiter (which is skipped)
396text_t::const_iterator getdelimitstr (text_t::const_iterator first,
397 text_t::const_iterator last,
398 unsigned short c, text_t &outstr)
399{
400 text_t::const_iterator here = first;
401 here = findchar (first, last, c);
402 outstr.clear();
403 outstr.appendrange (first, here);
404 if (here != last) here++; // skip c
405 return here;
406}
407
408text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
409 unsigned short c, text_t &outstr)
410{
411 text_t::iterator here = first;
412 here = findchar (first, last, c);
413 outstr.clear();
414 outstr.appendrange (first, here);
415 if (here != last) here++; // skip c
416 return here;
417}
418
419// split a string with a character
420void splitchar (text_t::const_iterator first, text_t::const_iterator last,
421 unsigned short c, text_tset &outlist)
422{
423 outlist.erase(outlist.begin(), outlist.end());
424
425 text_t t;
426
427 while (first != last)
428 {
429 first = getdelimitstr (first, last, c, t);
430 outlist.insert (t);
431 }
432}
433
434void splitchar (text_t::const_iterator first, text_t::const_iterator last,
435 unsigned short c, text_tlist &outlist)
436{
437 outlist.erase(outlist.begin(), outlist.end());
438
439 text_t t;
440
441 while (first != last)
442 {
443 first = getdelimitstr (first, last, c, t);
444 outlist.push_back (t);
445 }
446}
447
448void splitchar (text_t::const_iterator first, text_t::const_iterator last,
449 unsigned short c, text_tarray &outlist)
450{
451 outlist.erase(outlist.begin(), outlist.end());
452
453 text_t t;
454
455 while (first != last)
456 {
457 first = getdelimitstr (first, last, c, t);
458 outlist.push_back (t);
459 }
460}
461
462// join a string using a character
463void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
464{
465 outtext.clear ();
466
467 text_tset::const_iterator here = inlist.begin ();
468 text_tset::const_iterator end = inlist.end ();
469 bool first = true;
470 while (here != end)
471 {
472 if (!first) outtext.push_back (c);
473 first = false;
474 outtext += *here;
475 here++;
476 }
477}
478
479void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
480{
481 outtext.clear ();
482
483 text_tlist::const_iterator here = inlist.begin ();
484 text_tlist::const_iterator end = inlist.end ();
485 bool first = true;
486 while (here != end)
487 {
488 if (!first) outtext.push_back (c);
489 first = false;
490 outtext += *here;
491 here++;
492 }
493}
494
495void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
496{
497 outtext.clear ();
498
499 text_tarray::const_iterator here = inlist.begin ();
500 text_tarray::const_iterator end = inlist.end ();
501 bool first = true;
502 while (here != end)
503 {
504 if (!first) outtext.push_back (c);
505 first = false;
506 outtext += *here;
507 here++;
508 }
509}
510
511void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
512{
513 outtext.clear ();
514
515 text_tlist::const_iterator here = inlist.begin ();
516 text_tlist::const_iterator end = inlist.end ();
517 bool first = true;
518 while (here != end)
519 {
520 if (!first) outtext += c;
521 first = false;
522 outtext += *here;
523 here++;
524 }
525}
526
527void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
528{
529 outtext.clear ();
530
531 text_tset::const_iterator here = inlist.begin ();
532 text_tset::const_iterator end = inlist.end ();
533 bool first = true;
534 while (here != end)
535 {
536 if (!first) outtext += c;
537 first = false;
538 outtext += *here;
539 here++;
540 }
541}
542
543void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
544{
545 outtext.clear ();
546
547 text_tarray::const_iterator here = inlist.begin ();
548 text_tarray::const_iterator end = inlist.end ();
549 bool first = true;
550 while (here != end)
551 {
552 if (!first) outtext += c;
553 first = false;
554 outtext += *here;
555 here++;
556 }
557}
558
559// count the occurances of a character within a range
560int countchar (text_t::const_iterator first, text_t::const_iterator last,
561 unsigned short c)
562{
563 int count = 0;
564 while (first != last) {
565 if (*first == c) count ++;
566 first ++;
567 }
568 return count;
569}
570
571// return a substring of string from first up to but not including last
572text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
573
574 text_t substr;
575 while (first != last) {
576 substr.push_back(*first);
577 first ++;
578 }
579 return substr;
580}
581
582
583// convert to lowercase
584void lc (text_t::iterator first, text_t::iterator last) {
585 while (first != last) {
586 *first = unicode_tolower(*first);
587 first++;
588 }
589}
590
591// convert to uppercase
592void uc (text_t::iterator first, text_t::iterator last) {
593 while (first != last) {
594 *first = unicode_toupper(*first);
595 first++;
596 }
597}
598
599
600// checks to see if it is a number (i.e. contains only 0-9)
601bool is_number (const text_t &text) {
602
603 text_t::const_iterator here = text.begin();
604 text_t::const_iterator end = text.end();
605
606 while (here != end) {
607 if ((*here!='0') && (*here!='1') && (*here!='2') &&
608 (*here!='3') && (*here!='4') && (*here!='5') &&
609 (*here!='6') && (*here!='7') && (*here!='8') &&
610 (*here!='9')) return false;
611 here ++;
612 }
613 return true;
614}
615
616
617// checks to see if the text has any letters or digits
618bool has_unicode_letdig (const text_t &text) {
619 if (text.empty()) return false;
620
621 text_t::const_iterator here = text.begin();
622 text_t::const_iterator end = text.end();
623 while (here != end) {
624 if (is_unicode_letdig (*here)) return true;
625 here++;
626 }
627
628 return false;
629}
630
631
632
633////////////////////////////////////
634// convertclass methods
635////////////////////////////////////
636
637// conversion classes used for getting information in to and out of
638// the text_t class.
639
640convertclass::convertclass ()
641{
642 // nothing to do
643}
644
645void convertclass::reset ()
646{
647 // nothing to do
648}
649
650
651////////////////////////////////////
652// inconvertclass methods
653////////////////////////////////////
654
655// convert from a char stream to the text_t class
656// the default version assumes the input is a ascii
657// character array
658
659inconvertclass::inconvertclass ()
660{
661 start = NULL;
662 len = 0;
663}
664
665
666void inconvertclass::reset ()
667{
668 start = NULL;
669 len = 0;
670}
671
672void inconvertclass::setinput (char *thestart, size_t thelen)
673{
674 start = thestart;
675 len = thelen;
676}
677
678void inconvertclass::convert (text_t &output, status_t &status)
679{
680 output.clear();
681
682 if (start == NULL || len == 0)
683 {
684 status = finished;
685 return;
686 }
687
688 // don't want any funny sign conversions happening
689 unsigned char *here = (unsigned char *)start;
690 while (len > 0)
691 {
692 output.push_back (*here); // append this character
693 ++here;
694 --len;
695 }
696
697 start = (char *)here; // save current position
698 status = finished;
699}
700
701// will treat the text_t as a 8-bit string and convert
702// it to a 16-bit string using the about convert method.
703text_t inconvertclass::convert (const text_t &t) {
704 text_t out;
705 text_t tmpout;
706 status_t status;
707 text_t::const_iterator here = t.begin();
708 text_t::const_iterator end = t.end();
709 unsigned char cbuf[256];
710 size_t cbuflen = 0;
711
712 while (here != end) {
713 while (here != end && cbuflen < 256) {
714 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
715 here++;
716 }
717
718 if (cbuflen > 0) {
719 setinput ((char *)cbuf, cbuflen);
720 status = unfinished;
721 while (status == unfinished) {
722 convert (tmpout, status);
723 out += tmpout;
724 }
725 cbuflen = 0;
726 }
727 }
728
729 out.setencoding (0); // unicode
730
731 return out;
732}
733
734// A shared instance of the default inconvertclass, for simple
735// conversions. Note that any function that uses this instance is
736// not reentrant. If a function needs to be reentrant it
737// should declare its own instance.
738inconvertclass ascii2text_t;
739
740
741////////////////////////////////////
742// outconvertclass methods
743////////////////////////////////////
744
745// Convert from a text_t class to a char stream
746// This default version assumes the output is a ascii
747// character array. If you set the output stream you
748// can use this class to output to a stream using the
749// << operator. The << operator can also be conveniently
750// used to set the output stream by doing something like
751//
752// cout << text_t2ascii << text_tstr << anothertext_tstr;
753//
754outconvertclass::outconvertclass ()
755{
756 input = NULL;
757 outs = NULL;
758}
759
760void outconvertclass::reset ()
761{
762 input = NULL;
763 outs = NULL;
764}
765
766void outconvertclass::setinput (text_t *theinput)
767{
768 input = theinput;
769 if (input != NULL) texthere = input->begin();
770}
771
772void outconvertclass::convert (char *output, size_t maxlen,
773 size_t &len, status_t &status)
774{
775 if (input == NULL || output == NULL)
776 {
777 status = finished;
778 return;
779 }
780
781 // don't want any funny sign conversions happening
782 unsigned char *uoutput = (unsigned char *)output;
783 text_t::iterator textend = input->end();
784 len = 0;
785 while ((len < maxlen) && (texthere != textend))
786 {
787 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
788 else {
789 // put a space or a question mark depending on what
790 // the character is. Question marks tell the user that
791 // they are missing some information.
792 if (is_unicode_space (*texthere)) *uoutput = ' ';
793 else *uoutput = '?';
794 }
795 ++uoutput;
796 ++len;
797 ++texthere;
798 }
799
800 if (texthere == textend) status = finished;
801 else status = unfinished;
802}
803
804// will convert the 16-bit string to a 8-bit stream
805// and place the result in a text_t. This method uses
806// the above convert function.
807text_t outconvertclass::convert (const text_t &t) {
808 text_t out;
809 unsigned char cbuf[256];
810 size_t cbuflen = 0;
811 status_t status = unfinished;
812
813 setinput ((text_t *)&t); // discard constant
814 while (status == unfinished) {
815 convert ((char *)cbuf, 256, cbuflen, status);
816 out.appendcarr ((char *)cbuf, cbuflen);
817 }
818
819 out.setencoding (1); // other encoding
820
821 return out;
822}
823
824
825void outconvertclass::setostream (ostream *theouts)
826{
827 outs = theouts;
828}
829
830ostream *outconvertclass::getostream ()
831{
832 return outs;
833}
834
835
836
837
838// A shared instance of the default outconvertclass, for simple
839// conversions; its input and output stream pointers are shared by all users.
840outconvertclass text_t2ascii;
841
842
843
844// stream operators for the output class
845
846outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
847{
848 outconverter.setostream(&theouts);
849 return outconverter;
850}
851
852
853#define STREAMBUFSIZE 256
854outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
855{
856 ostream *outstream = outconverter.getostream();
857
858 if (outstream == NULL) return outconverter;
859
860 char outbuf[STREAMBUFSIZE];
861 size_t len;
862 outconvertclass::status_t status = outconvertclass::unfinished;
863
864 // assume that there is no data needing converting
865 // left in the converter
866 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
867
868 while (status == outconvertclass::unfinished)
869 {
870 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
871 if (len > 0) outstream->write(outbuf, len);
872 }
873
874 return outconverter;
875}
Note: See TracBrowser for help on using the repository browser.