source: trunk/greenorg/lib/text_t.cpp@ 13640

Last change on this file since 13640 was 5503, checked in by sjboddie, 21 years ago

* empty log message *

  • Property svn:keywords set to Author Date Id Revision
File size: 19.2 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 5503 2003-09-12 04:53:13Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.1 2003/09/12 04:52:19 sjboddie
31 *** empty log message ***
32
33 Revision 1.21 2001/06/01 02:51:28 sjboddie
34 Changes to get phind working under windows
35
36 Revision 1.20 2001/01/25 18:26:44 cs025
37 Included CORBA branch for first time
38
39 Revision 1.15.2.2 2000/04/05 10:19:38 syeates
40 added automatic conversion to allow text_t's to be <<'ed to ostreams
41
42 Revision 1.15.2.1 2000/04/04 15:02:29 cs025
43 Corba first commit
44
45 Revision 1.15 1999/10/14 22:52:39 sjboddie
46 joinchar can join using text_t string now too
47
48 Revision 1.14 1999/09/24 02:30:03 rjmcnab
49 added function has_unicode_letdig
50
51 Revision 1.13 1999/09/07 04:57:43 sjboddie
52 added gpl notice
53
54 Revision 1.12 1999/08/31 08:04:41 rjmcnab
55 Fixed a small but hard to find bug in getcarr
56
57 Revision 1.11 1999/07/01 04:05:09 rjmcnab
58 Optimised append functions slightly and added a reserve function.
59
60 Revision 1.10 1999/04/26 03:58:03 sjboddie
61 added is_number function
62
63 Revision 1.9 1999/04/06 22:17:24 rjmcnab
64 Added splits and joins using text_tset.
65
66 Revision 1.8 1999/02/28 23:14:41 rjmcnab
67
68 Added uc and lc to convert to uppercase and lowercase.
69
70 Revision 1.7 1999/02/21 22:26:39 rjmcnab
71
72 Made getint() a constant function.
73
74 Revision 1.6 1999/02/03 01:13:26 sjboddie
75
76 Got interface to handle subcollections and language subcollections -
77 committed changes made to some of the collections
78
79 Revision 1.5 1999/01/19 01:38:14 rjmcnab
80
81 Made the source more portable.
82
83 Revision 1.4 1999/01/12 01:51:00 rjmcnab
84
85 Standard header.
86
87 Revision 1.3 1999/01/08 02:33:16 rjmcnab
88
89 Added standard header to source files.
90
91 */
92
93#include "text_t.h"
94
95#if defined(GSDL_USE_OBJECTSPACE)
96# include <ospace\std\algorithm>
97#elif defined(GSDL_USE_STL_H)
98# if defined(GSDL_USE_ALGO_H)
99# include <algo.h>
100# else
101# include <algorithm.h>
102# endif
103#else
104# include <algorithm>
105#endif
106
107#ifdef HAVE_CONFIG_H
108# ifdef __WIN32__
109# include "WIN32cfg.h"
110# else
111# include "config.h"
112# endif
113#endif
114
115
116#include "unitool.h"
117
118////////////////////////////////////
119// text_t methods
120////////////////////////////////////
121
122// new stream converter ...
123ostream& operator<< (ostream &o, const text_t text)
124{
125 text_t::const_iterator ithere = text.begin();
126 text_t::const_iterator itend = text.end();
127
128 while (ithere != itend)
129 {
130 if (*ithere < 256)
131 {
132 o << (unsigned char)(*ithere);
133 }
134 else
135 {
136 // put a space or a question mark depending on what
137 // the character is. Question marks tell the user that
138 // they are missing some information.
139 if (is_unicode_space (*ithere))
140 o << ' ';
141 else
142 o << '?';
143 }
144 ithere++;
145 }
146
147 return o;
148}
149
150text_t::text_t ()
151{
152 setencoding(0);
153 clear ();
154}
155
156text_t::text_t (int i)
157{
158 setencoding(0);
159 clear ();
160 appendint (i);
161}
162
163text_t::text_t (char *s)
164{
165 setencoding(0);
166 clear ();
167 appendcstr (s);
168}
169
170
171void text_t::append (const text_t &t)
172{
173 text.insert(text.end(), t.begin(), t.end());
174 // const_iterator here, end=t.end();
175 // for (here=t.begin(); here!=end;here++)
176 // {
177 // text.push_back(*here);
178 // }
179}
180
181void text_t::appendrange (iterator first, iterator last)
182{
183 text.insert(text.end(), first, last);
184 // while (first != last)
185 // {
186 // text.push_back (*first);
187 // first++;
188 // }
189}
190
191void text_t::appendrange (const_iterator first, const_iterator last)
192{
193 text.insert(text.end(), first, last);
194 // while (first != last)
195 // {
196 // text.push_back (*first);
197 // first++;
198 // }
199}
200
201void text_t::appendint (int i)
202{
203 // deal with zeros and negatives
204 if (i == 0)
205 {
206 text.push_back('0');
207 return;
208 }
209 else if (i < 0)
210 {
211 text.push_back('-');
212 i *= -1;
213 }
214
215 // get a buffer for the conversion
216 int maxbuflen = sizeof(int)*3;
217 char *buf = new char[maxbuflen];
218 int len = 0;
219
220 // get the number in reverse
221 while (i > 0)
222 {
223 buf[len++] = '0'+ (i%10);
224 i = i/10;
225 }
226
227 // reverse the number
228 while (len > 0)
229 {
230 text.push_back(buf[--len]);
231 }
232
233 delete buf;
234}
235
236int text_t::getint () const
237{
238 int i = 0;
239 int mult = 1; // become -1 for negative numbers
240
241 const_iterator here = text.begin();
242 const_iterator end = text.end();
243
244 // do plus and minus signs
245 if (here != end)
246 {
247 if (*here == '-')
248 {
249 mult = -1;
250 here++;
251 }
252 else if (*here == '+')
253 {
254 mult = 1;
255 here++;
256 }
257 }
258
259 // deal with the number
260 while ((here != end) && (*here >= '0') && (*here <= '9'))
261 {
262 i = 10*i + (*here - '0');
263 here++;
264 }
265
266 i *= mult;
267 return i;
268}
269
270unsigned long text_t::getulong () const
271{
272 unsigned long i = 0;
273
274 const_iterator here = text.begin();
275 const_iterator end = text.end();
276
277 while ((here != end) && (*here >= '0') && (*here <= '9'))
278 {
279 i = 10*i + (*here - '0');
280 here++;
281 }
282
283 return i;
284}
285
286void text_t::appendcarr (char *s, size_type len)
287{
288 unsigned char *us = (unsigned char *)s;
289 while (len > 0)
290 {
291 text.push_back (*us); // append this character
292 us++;
293 len--;
294 }
295}
296
297void text_t::appendcstr (char *s)
298{
299 unsigned char *us = (unsigned char *)s;
300 while (*us != '\0')
301 {
302 text.push_back (*us); // append this character
303 us++;
304 }
305}
306
307
// strings returned from getcarr and getcstr become the caller's
// responsibility and should be deallocated with "delete []"
// (they are allocated with "new []")
310
311char *text_t::getcarr(size_type &len) const
312{
313 unsigned char *cstr = new unsigned char[size()];
314 len = 0;
315
316 const_iterator ithere = begin();
317 const_iterator itend = end();
318 while (ithere != itend)
319 {
320 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
321 else {
322 // put a space or a question mark depending on what
323 // the character is. Question marks tell the user that
324 // they are missing some information.
325 if (is_unicode_space (*ithere)) cstr[len] = ' ';
326 else cstr[len] = '?';
327 }
328 len++;
329 ithere++;
330 }
331
332 return (char *)cstr;
333}
334
335char *text_t::getcstr() const
336{
337 unsigned char *cstr = new unsigned char[size() + 1];
338 const_iterator ithere = begin();
339 const_iterator itend = end();
340 int len = 0;
341
342 while (ithere != itend)
343 {
344 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
345 else {
346 // put a space or a question mark depending on what
347 // the character is. Question marks tell the user that
348 // they are missing some information.
349 if (is_unicode_space (*ithere)) cstr[len] = ' ';
350 else cstr[len] = '?';
351 }
352 len++;
353 ithere++;
354 }
355
356 cstr[len] = '\0';
357
358 return (char *)cstr;
359}
360
361
362// general functions which work on text_ts
363
364// find a character within a range
365text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
366 unsigned short c)
367{
368 while (first != last)
369 {
370 if (*first == c) break;
371 first++;
372 }
373 return first;
374}
375
376text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
377 unsigned short c)
378{
379 while (first != last)
380 {
381 if (*first == c) break;
382 first++;
383 }
384 return first;
385}
386
387text_t::iterator findword (text_t::iterator first, text_t::iterator last,
388 const text_t& word)
389{
390 text_t::const_iterator word_begin = word.begin();
391 text_t::const_iterator word_end = word.end();
392
393 while (first != last)
394 {
395 text_t::iterator char_match = first;
396 text_t::const_iterator word_here = word_begin;
397 while (word_here!=word_end)
398 {
399 if (*char_match != *word_here)
400 {
401 break;
402 }
403 char_match++;
404 word_here++;
405 }
406 if (word_here==word_end)
407 {
408 return first;
409 }
410 first++;
411 }
412 return last; // get to here only if there is no match
413}
414
415// get a string up to the next delimiter (which is skipped)
416text_t::const_iterator getdelimitstr (text_t::const_iterator first,
417 text_t::const_iterator last,
418 unsigned short c, text_t &outstr)
419{
420 text_t::const_iterator here = first;
421 here = findchar (first, last, c);
422 outstr.clear();
423 outstr.appendrange (first, here);
424 if (here != last) here++; // skip c
425 return here;
426}
427
428text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
429 unsigned short c, text_t &outstr)
430{
431 text_t::iterator here = first;
432 here = findchar (first, last, c);
433 outstr.clear();
434 outstr.appendrange (first, here);
435 if (here != last) here++; // skip c
436 return here;
437}
438
439// split a string with a character
440void splitchar (text_t::const_iterator first, text_t::const_iterator last,
441 unsigned short c, text_tset &outlist)
442{
443 outlist.erase(outlist.begin(), outlist.end());
444
445 text_t t;
446
447 while (first != last)
448 {
449 first = getdelimitstr (first, last, c, t);
450 outlist.insert (t);
451 }
452}
453
454void splitchar (text_t::const_iterator first, text_t::const_iterator last,
455 unsigned short c, text_tlist &outlist)
456{
457 outlist.erase(outlist.begin(), outlist.end());
458
459 text_t t;
460
461 while (first != last)
462 {
463 first = getdelimitstr (first, last, c, t);
464 outlist.push_back (t);
465 }
466}
467
468void splitchar (text_t::const_iterator first, text_t::const_iterator last,
469 unsigned short c, text_tarray &outlist)
470{
471 outlist.erase(outlist.begin(), outlist.end());
472
473 text_t t;
474
475 while (first != last)
476 {
477 first = getdelimitstr (first, last, c, t);
478 outlist.push_back (t);
479 }
480}
481
482// join a string using a character
483void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
484{
485 outtext.clear ();
486
487 text_tset::const_iterator here = inlist.begin ();
488 text_tset::const_iterator end = inlist.end ();
489 bool first = true;
490 while (here != end)
491 {
492 if (!first) outtext.push_back (c);
493 first = false;
494 outtext += *here;
495 here++;
496 }
497}
498
499void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
500{
501 outtext.clear ();
502
503 text_tlist::const_iterator here = inlist.begin ();
504 text_tlist::const_iterator end = inlist.end ();
505 bool first = true;
506 while (here != end)
507 {
508 if (!first) outtext.push_back (c);
509 first = false;
510 outtext += *here;
511 here++;
512 }
513}
514
515void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
516{
517 outtext.clear ();
518
519 text_tarray::const_iterator here = inlist.begin ();
520 text_tarray::const_iterator end = inlist.end ();
521 bool first = true;
522 while (here != end)
523 {
524 if (!first) outtext.push_back (c);
525 first = false;
526 outtext += *here;
527 here++;
528 }
529}
530
531void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
532{
533 outtext.clear ();
534
535 text_tlist::const_iterator here = inlist.begin ();
536 text_tlist::const_iterator end = inlist.end ();
537 bool first = true;
538 while (here != end)
539 {
540 if (!first) outtext += c;
541 first = false;
542 outtext += *here;
543 here++;
544 }
545}
546
547void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
548{
549 outtext.clear ();
550
551 text_tset::const_iterator here = inlist.begin ();
552 text_tset::const_iterator end = inlist.end ();
553 bool first = true;
554 while (here != end)
555 {
556 if (!first) outtext += c;
557 first = false;
558 outtext += *here;
559 here++;
560 }
561}
562
563void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
564{
565 outtext.clear ();
566
567 text_tarray::const_iterator here = inlist.begin ();
568 text_tarray::const_iterator end = inlist.end ();
569 bool first = true;
570 while (here != end)
571 {
572 if (!first) outtext += c;
573 first = false;
574 outtext += *here;
575 here++;
576 }
577}
578
579// count the occurances of a character within a range
580int countchar (text_t::const_iterator first, text_t::const_iterator last,
581 unsigned short c)
582{
583 int count = 0;
584 while (first != last) {
585 if (*first == c) count ++;
586 first ++;
587 }
588 return count;
589}
590
591// return a substring of string from first up to but not including last
592text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
593
594 text_t substr;
595 while (first != last) {
596 substr.push_back(*first);
597 first ++;
598 }
599 return substr;
600}
601
602
603// convert to lowercase
604void lc (text_t::iterator first, text_t::iterator last) {
605 while (first != last) {
606 *first = unicode_tolower(*first);
607 first++;
608 }
609}
610
611// convert to uppercase
612void uc (text_t::iterator first, text_t::iterator last) {
613 while (first != last) {
614 *first = unicode_toupper(*first);
615 first++;
616 }
617}
618
619
620// checks to see if it is a number (i.e. contains only 0-9)
621bool is_number (const text_t &text) {
622
623 text_t::const_iterator here = text.begin();
624 text_t::const_iterator end = text.end();
625
626 while (here != end) {
627 if ((*here!='0') && (*here!='1') && (*here!='2') &&
628 (*here!='3') && (*here!='4') && (*here!='5') &&
629 (*here!='6') && (*here!='7') && (*here!='8') &&
630 (*here!='9')) return false;
631 here ++;
632 }
633 return true;
634}
635
636
637// checks to see if the text has any letters or digits
638bool has_unicode_letdig (const text_t &text) {
639 if (text.empty()) return false;
640
641 text_t::const_iterator here = text.begin();
642 text_t::const_iterator end = text.end();
643 while (here != end) {
644 if (is_unicode_letdig (*here)) return true;
645 here++;
646 }
647
648 return false;
649}
650
651
652
653////////////////////////////////////
654// convertclass methods
655////////////////////////////////////
656
657// conversion classes used for getting information in to and out of
658// the text_t class.
659
// Default constructor. The constructor body performs no work.
convertclass::convertclass ()
{
  // nothing to do
}
664
// Reset the converter. This version performs no work; the input and
// output converter classes in this file define their own reset().
void convertclass::reset ()
{
  // nothing to do
}
669
670
671////////////////////////////////////
672// inconvertclass methods
673////////////////////////////////////
674
// convert from a char stream to the text_t class
// the default version assumes the input is an ASCII
// character array
678
679inconvertclass::inconvertclass ()
680{
681 start = NULL;
682 len = 0;
683}
684
685
686void inconvertclass::reset ()
687{
688 start = NULL;
689 len = 0;
690}
691
692void inconvertclass::setinput (char *thestart, size_t thelen)
693{
694 start = thestart;
695 len = thelen;
696}
697
698void inconvertclass::convert (text_t &output, status_t &status)
699{
700 output.clear();
701
702 if (start == NULL || len == 0)
703 {
704 status = finished;
705 return;
706 }
707
708 // don't want any funny sign conversions happening
709 unsigned char *here = (unsigned char *)start;
710 while (len > 0)
711 {
712 output.push_back (*here); // append this character
713 ++here;
714 --len;
715 }
716
717 start = (char *)here; // save current position
718 status = finished;
719}
720
721// will treat the text_t as a 8-bit string and convert
722// it to a 16-bit string using the about convert method.
723text_t inconvertclass::convert (const text_t &t) {
724 text_t out;
725 text_t tmpout;
726 status_t status;
727 text_t::const_iterator here = t.begin();
728 text_t::const_iterator end = t.end();
729 unsigned char cbuf[256];
730 size_t cbuflen = 0;
731
732 while (here != end) {
733 while (here != end && cbuflen < 256) {
734 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
735 here++;
736 }
737
738 if (cbuflen > 0) {
739 setinput ((char *)cbuf, cbuflen);
740 status = unfinished;
741 while (status == unfinished) {
742 convert (tmpout, status);
743 out += tmpout;
744 }
745 cbuflen = 0;
746 }
747 }
748
749 out.setencoding (0); // unicode
750
751 return out;
752}
753
// A shared instance of the default inconvertclass for doing simple
// conversions. Because this single instance holds the converter's
// input state, any functions that use it are not reentrant. If a
// function needs to be reentrant it should declare its own instance.
inconvertclass ascii2text_t;
759
760
761////////////////////////////////////
762// outconvertclass methods
763////////////////////////////////////
764
// Convert from a text_t class to a char stream.
// This default version assumes the output is an ASCII
// character array. If you set the output stream you
768// can use this class to output to a stream using the
769// << operator. The << operator can also be conveniently
770// used to set the output stream by doing something like
771//
772// cout << text_t2ascii << text_tstr << anothertext_tstr;
773//
774outconvertclass::outconvertclass ()
775{
776 input = NULL;
777 outs = NULL;
778}
779
780void outconvertclass::reset ()
781{
782 input = NULL;
783 outs = NULL;
784}
785
786void outconvertclass::setinput (text_t *theinput)
787{
788 input = theinput;
789 if (input != NULL) texthere = input->begin();
790}
791
792void outconvertclass::convert (char *output, size_t maxlen,
793 size_t &len, status_t &status)
794{
795 if (input == NULL || output == NULL)
796 {
797 status = finished;
798 return;
799 }
800
801 // don't want any funny sign conversions happening
802 unsigned char *uoutput = (unsigned char *)output;
803 text_t::iterator textend = input->end();
804 len = 0;
805 while ((len < maxlen) && (texthere != textend))
806 {
807 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
808 else {
809 // put a space or a question mark depending on what
810 // the character is. Question marks tell the user that
811 // they are missing some information.
812 if (is_unicode_space (*texthere)) *uoutput = ' ';
813 else *uoutput = '?';
814 }
815 ++uoutput;
816 ++len;
817 ++texthere;
818 }
819
820 if (texthere == textend) status = finished;
821 else status = unfinished;
822}
823
824// will convert the 16-bit string to a 8-bit stream
825// and place the result in a text_t. This method uses
826// the above convert function.
827text_t outconvertclass::convert (const text_t &t) {
828 text_t out;
829 unsigned char cbuf[256];
830 size_t cbuflen = 0;
831 status_t status = unfinished;
832
833 setinput ((text_t *)&t); // discard constant
834 while (status == unfinished) {
835 convert ((char *)cbuf, 256, cbuflen, status);
836 out.appendcarr ((char *)cbuf, cbuflen);
837 }
838
839 out.setencoding (1); // other encoding
840
841 return out;
842}
843
844
845void outconvertclass::setostream (ostream *theouts)
846{
847 outs = theouts;
848}
849
850ostream *outconvertclass::getostream ()
851{
852 return outs;
853}
854
855
856
857
// A shared instance of the default outconvertclass for doing simple
// conversions. It can be placed in a stream expression, e.g.
//   cout << text_t2ascii << text_tstr;
outconvertclass text_t2ascii;
861
862
863
864// stream operators for the output class
865
866outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
867{
868 outconverter.setostream(&theouts);
869 return outconverter;
870}
871
872
873#define STREAMBUFSIZE 256
874outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
875{
876 ostream *outstream = outconverter.getostream();
877
878 if (outstream == NULL) return outconverter;
879
880 char outbuf[STREAMBUFSIZE];
881 size_t len;
882 outconvertclass::status_t status = outconvertclass::unfinished;
883
884 // assume that there is no data needing converting
885 // left in the converter
886 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
887
888 while (status == outconvertclass::unfinished)
889 {
890 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
891 if (len > 0) outstream->write(outbuf, len);
892 }
893
894 return outconverter;
895}
Note: See TracBrowser for help on using the repository browser.