source: main/tags/2.35a/gsdl/lib/text_t.cpp@ 33178

Last change on this file since 33178 was 2487, checked in by sjboddie, 23 years ago

Changes to get phind working under windows

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1/**********************************************************************
2 *
3 * text_t.cpp -- a simple 16-bit character string class
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * $Id: text_t.cpp 2487 2001-06-01 02:51:29Z sjboddie $
25 *
26 *********************************************************************/
27
28/*
29 $Log$
30 Revision 1.21 2001/06/01 02:51:28 sjboddie
31 Changes to get phind working under windows
32
33 Revision 1.20 2001/01/25 18:26:44 cs025
34 Included CORBA branch for first time
35
36 Revision 1.15.2.2 2000/04/05 10:19:38 syeates
37 added automatic conversion to allow text_t's to be <<'ed to ostreams
38
39 Revision 1.15.2.1 2000/04/04 15:02:29 cs025
40 Corba first commit
41
42 Revision 1.15 1999/10/14 22:52:39 sjboddie
43 joinchar can join using text_t string now too
44
45 Revision 1.14 1999/09/24 02:30:03 rjmcnab
46 added function has_unicode_letdig
47
48 Revision 1.13 1999/09/07 04:57:43 sjboddie
49 added gpl notice
50
51 Revision 1.12 1999/08/31 08:04:41 rjmcnab
52 Fixed a small but hard to find bug in getcarr
53
54 Revision 1.11 1999/07/01 04:05:09 rjmcnab
55 Optimised append functions slightly and added a reserve function.
56
57 Revision 1.10 1999/04/26 03:58:03 sjboddie
58 added is_number function
59
60 Revision 1.9 1999/04/06 22:17:24 rjmcnab
61 Added splits and joins using text_tset.
62
63 Revision 1.8 1999/02/28 23:14:41 rjmcnab
64
65 Added uc and lc to convert to uppercase and lowercase.
66
67 Revision 1.7 1999/02/21 22:26:39 rjmcnab
68
69 Made getint() a constant function.
70
71 Revision 1.6 1999/02/03 01:13:26 sjboddie
72
73 Got interface to handle subcollections and language subcollections -
74 committed changes made to some of the collections
75
76 Revision 1.5 1999/01/19 01:38:14 rjmcnab
77
78 Made the source more portable.
79
80 Revision 1.4 1999/01/12 01:51:00 rjmcnab
81
82 Standard header.
83
84 Revision 1.3 1999/01/08 02:33:16 rjmcnab
85
86 Added standard header to source files.
87
88 */
89
90#include "text_t.h"
91
92#if defined(GSDL_USE_OBJECTSPACE)
93# include <ospace\std\algorithm>
94#elif defined(GSDL_USE_STL_H)
95# if defined(GSDL_USE_ALGO_H)
96# include <algo.h>
97# else
98# include <algorithm.h>
99# endif
100#else
101# include <algorithm>
102#endif
103
104#ifdef HAVE_CONFIG_H
105# ifdef __WIN32__
106# include "WIN32cfg.h"
107# else
108# include "config.h"
109# endif
110#endif
111
112
113#include "unitool.h"
114
115////////////////////////////////////
116// text_t methods
117////////////////////////////////////
118
119// new stream converter ...
120ostream& operator<< (ostream &o, const text_t text)
121{
122 text_t::const_iterator ithere = text.begin();
123 text_t::const_iterator itend = text.end();
124
125 while (ithere != itend)
126 {
127 if (*ithere < 256)
128 {
129 o << (unsigned char)(*ithere);
130 }
131 else
132 {
133 // put a space or a question mark depending on what
134 // the character is. Question marks tell the user that
135 // they are missing some information.
136 if (is_unicode_space (*ithere))
137 o << ' ';
138 else
139 o << '?';
140 }
141 ithere++;
142 }
143
144 return o;
145}
146
147text_t::text_t ()
148{
149 setencoding(0);
150 clear ();
151}
152
153text_t::text_t (int i)
154{
155 setencoding(0);
156 clear ();
157 appendint (i);
158}
159
160text_t::text_t (char *s)
161{
162 setencoding(0);
163 clear ();
164 appendcstr (s);
165}
166
167
168void text_t::append (const text_t &t)
169{
170 text.insert(text.end(), t.begin(), t.end());
171 // const_iterator here, end=t.end();
172 // for (here=t.begin(); here!=end;here++)
173 // {
174 // text.push_back(*here);
175 // }
176}
177
178void text_t::appendrange (iterator first, iterator last)
179{
180 text.insert(text.end(), first, last);
181 // while (first != last)
182 // {
183 // text.push_back (*first);
184 // first++;
185 // }
186}
187
188void text_t::appendrange (const_iterator first, const_iterator last)
189{
190 text.insert(text.end(), first, last);
191 // while (first != last)
192 // {
193 // text.push_back (*first);
194 // first++;
195 // }
196}
197
198void text_t::appendint (int i)
199{
200 // deal with zeros and negatives
201 if (i == 0)
202 {
203 text.push_back('0');
204 return;
205 }
206 else if (i < 0)
207 {
208 text.push_back('-');
209 i *= -1;
210 }
211
212 // get a buffer for the conversion
213 int maxbuflen = sizeof(int)*3;
214 char *buf = new char[maxbuflen];
215 int len = 0;
216
217 // get the number in reverse
218 while (i > 0)
219 {
220 buf[len++] = '0'+ (i%10);
221 i = i/10;
222 }
223
224 // reverse the number
225 while (len > 0)
226 {
227 text.push_back(buf[--len]);
228 }
229
230 delete buf;
231}
232
233int text_t::getint () const
234{
235 int i = 0;
236 int mult = 1; // become -1 for negative numbers
237
238 const_iterator here = text.begin();
239 const_iterator end = text.end();
240
241 // do plus and minus signs
242 if (here != end)
243 {
244 if (*here == '-')
245 {
246 mult = -1;
247 here++;
248 }
249 else if (*here == '+')
250 {
251 mult = 1;
252 here++;
253 }
254 }
255
256 // deal with the number
257 while ((here != end) && (*here >= '0') && (*here <= '9'))
258 {
259 i = 10*i + (*here - '0');
260 here++;
261 }
262
263 i *= mult;
264 return i;
265}
266
267unsigned long text_t::getulong () const
268{
269 unsigned long i = 0;
270
271 const_iterator here = text.begin();
272 const_iterator end = text.end();
273
274 while ((here != end) && (*here >= '0') && (*here <= '9'))
275 {
276 i = 10*i + (*here - '0');
277 here++;
278 }
279
280 return i;
281}
282
283void text_t::appendcarr (char *s, size_type len)
284{
285 unsigned char *us = (unsigned char *)s;
286 while (len > 0)
287 {
288 text.push_back (*us); // append this character
289 us++;
290 len--;
291 }
292}
293
294void text_t::appendcstr (char *s)
295{
296 unsigned char *us = (unsigned char *)s;
297 while (*us != '\0')
298 {
299 text.push_back (*us); // append this character
300 us++;
301 }
302}
303
304
// strings returned from getcarr and getcstr become the caller's
// responsibility and should be deallocated with "delete []"
307
308char *text_t::getcarr(size_type &len) const
309{
310 unsigned char *cstr = new unsigned char[size()];
311 len = 0;
312
313 const_iterator ithere = begin();
314 const_iterator itend = end();
315 while (ithere != itend)
316 {
317 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
318 else {
319 // put a space or a question mark depending on what
320 // the character is. Question marks tell the user that
321 // they are missing some information.
322 if (is_unicode_space (*ithere)) cstr[len] = ' ';
323 else cstr[len] = '?';
324 }
325 len++;
326 ithere++;
327 }
328
329 return (char *)cstr;
330}
331
332char *text_t::getcstr() const
333{
334 unsigned char *cstr = new unsigned char[size() + 1];
335 const_iterator ithere = begin();
336 const_iterator itend = end();
337 int len = 0;
338
339 while (ithere != itend)
340 {
341 if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
342 else {
343 // put a space or a question mark depending on what
344 // the character is. Question marks tell the user that
345 // they are missing some information.
346 if (is_unicode_space (*ithere)) cstr[len] = ' ';
347 else cstr[len] = '?';
348 }
349 len++;
350 ithere++;
351 }
352
353 cstr[len] = '\0';
354
355 return (char *)cstr;
356}
357
358
359// general functions which work on text_ts
360
361// find a character within a range
362text_t::const_iterator findchar (text_t::const_iterator first, text_t::const_iterator last,
363 unsigned short c)
364{
365 while (first != last)
366 {
367 if (*first == c) break;
368 first++;
369 }
370 return first;
371}
372
373text_t::iterator findchar (text_t::iterator first, text_t::iterator last,
374 unsigned short c)
375{
376 while (first != last)
377 {
378 if (*first == c) break;
379 first++;
380 }
381 return first;
382}
383
384text_t::iterator findword (text_t::iterator first, text_t::iterator last,
385 const text_t& word)
386{
387 text_t::const_iterator word_begin = word.begin();
388 text_t::const_iterator word_end = word.end();
389
390 while (first != last)
391 {
392 text_t::iterator char_match = first;
393 text_t::const_iterator word_here = word_begin;
394 while (word_here!=word_end)
395 {
396 if (*char_match != *word_here)
397 {
398 break;
399 }
400 char_match++;
401 word_here++;
402 }
403 if (word_here==word_end)
404 {
405 return first;
406 }
407 first++;
408 }
409 return last; // get to here only if there is no match
410}
411
412// get a string up to the next delimiter (which is skipped)
413text_t::const_iterator getdelimitstr (text_t::const_iterator first,
414 text_t::const_iterator last,
415 unsigned short c, text_t &outstr)
416{
417 text_t::const_iterator here = first;
418 here = findchar (first, last, c);
419 outstr.clear();
420 outstr.appendrange (first, here);
421 if (here != last) here++; // skip c
422 return here;
423}
424
425text_t::iterator getdelimitstr (text_t::iterator first, text_t::iterator last,
426 unsigned short c, text_t &outstr)
427{
428 text_t::iterator here = first;
429 here = findchar (first, last, c);
430 outstr.clear();
431 outstr.appendrange (first, here);
432 if (here != last) here++; // skip c
433 return here;
434}
435
436// split a string with a character
437void splitchar (text_t::const_iterator first, text_t::const_iterator last,
438 unsigned short c, text_tset &outlist)
439{
440 outlist.erase(outlist.begin(), outlist.end());
441
442 text_t t;
443
444 while (first != last)
445 {
446 first = getdelimitstr (first, last, c, t);
447 outlist.insert (t);
448 }
449}
450
451void splitchar (text_t::const_iterator first, text_t::const_iterator last,
452 unsigned short c, text_tlist &outlist)
453{
454 outlist.erase(outlist.begin(), outlist.end());
455
456 text_t t;
457
458 while (first != last)
459 {
460 first = getdelimitstr (first, last, c, t);
461 outlist.push_back (t);
462 }
463}
464
465void splitchar (text_t::const_iterator first, text_t::const_iterator last,
466 unsigned short c, text_tarray &outlist)
467{
468 outlist.erase(outlist.begin(), outlist.end());
469
470 text_t t;
471
472 while (first != last)
473 {
474 first = getdelimitstr (first, last, c, t);
475 outlist.push_back (t);
476 }
477}
478
479// join a string using a character
480void joinchar (const text_tset &inlist, unsigned short c, text_t &outtext)
481{
482 outtext.clear ();
483
484 text_tset::const_iterator here = inlist.begin ();
485 text_tset::const_iterator end = inlist.end ();
486 bool first = true;
487 while (here != end)
488 {
489 if (!first) outtext.push_back (c);
490 first = false;
491 outtext += *here;
492 here++;
493 }
494}
495
496void joinchar (const text_tlist &inlist, unsigned short c, text_t &outtext)
497{
498 outtext.clear ();
499
500 text_tlist::const_iterator here = inlist.begin ();
501 text_tlist::const_iterator end = inlist.end ();
502 bool first = true;
503 while (here != end)
504 {
505 if (!first) outtext.push_back (c);
506 first = false;
507 outtext += *here;
508 here++;
509 }
510}
511
512void joinchar (const text_tarray &inlist, unsigned short c, text_t &outtext)
513{
514 outtext.clear ();
515
516 text_tarray::const_iterator here = inlist.begin ();
517 text_tarray::const_iterator end = inlist.end ();
518 bool first = true;
519 while (here != end)
520 {
521 if (!first) outtext.push_back (c);
522 first = false;
523 outtext += *here;
524 here++;
525 }
526}
527
528void joinchar (const text_tlist &inlist, text_t c, text_t &outtext)
529{
530 outtext.clear ();
531
532 text_tlist::const_iterator here = inlist.begin ();
533 text_tlist::const_iterator end = inlist.end ();
534 bool first = true;
535 while (here != end)
536 {
537 if (!first) outtext += c;
538 first = false;
539 outtext += *here;
540 here++;
541 }
542}
543
544void joinchar (const text_tset &inlist, text_t c, text_t &outtext)
545{
546 outtext.clear ();
547
548 text_tset::const_iterator here = inlist.begin ();
549 text_tset::const_iterator end = inlist.end ();
550 bool first = true;
551 while (here != end)
552 {
553 if (!first) outtext += c;
554 first = false;
555 outtext += *here;
556 here++;
557 }
558}
559
560void joinchar (const text_tarray &inlist, text_t c, text_t &outtext)
561{
562 outtext.clear ();
563
564 text_tarray::const_iterator here = inlist.begin ();
565 text_tarray::const_iterator end = inlist.end ();
566 bool first = true;
567 while (here != end)
568 {
569 if (!first) outtext += c;
570 first = false;
571 outtext += *here;
572 here++;
573 }
574}
575
576// count the occurances of a character within a range
577int countchar (text_t::const_iterator first, text_t::const_iterator last,
578 unsigned short c)
579{
580 int count = 0;
581 while (first != last) {
582 if (*first == c) count ++;
583 first ++;
584 }
585 return count;
586}
587
588// return a substring of string from first up to but not including last
589text_t substr (text_t::const_iterator first, text_t::const_iterator last) {
590
591 text_t substr;
592 while (first != last) {
593 substr.push_back(*first);
594 first ++;
595 }
596 return substr;
597}
598
599
600// convert to lowercase
601void lc (text_t::iterator first, text_t::iterator last) {
602 while (first != last) {
603 *first = unicode_tolower(*first);
604 first++;
605 }
606}
607
608// convert to uppercase
609void uc (text_t::iterator first, text_t::iterator last) {
610 while (first != last) {
611 *first = unicode_toupper(*first);
612 first++;
613 }
614}
615
616
617// checks to see if it is a number (i.e. contains only 0-9)
618bool is_number (const text_t &text) {
619
620 text_t::const_iterator here = text.begin();
621 text_t::const_iterator end = text.end();
622
623 while (here != end) {
624 if ((*here!='0') && (*here!='1') && (*here!='2') &&
625 (*here!='3') && (*here!='4') && (*here!='5') &&
626 (*here!='6') && (*here!='7') && (*here!='8') &&
627 (*here!='9')) return false;
628 here ++;
629 }
630 return true;
631}
632
633
634// checks to see if the text has any letters or digits
635bool has_unicode_letdig (const text_t &text) {
636 if (text.empty()) return false;
637
638 text_t::const_iterator here = text.begin();
639 text_t::const_iterator end = text.end();
640 while (here != end) {
641 if (is_unicode_letdig (*here)) return true;
642 here++;
643 }
644
645 return false;
646}
647
648
649
650////////////////////////////////////
651// convertclass methods
652////////////////////////////////////
653
654// conversion classes used for getting information in to and out of
655// the text_t class.
656
657convertclass::convertclass ()
658{
659 // nothing to do
660}
661
662void convertclass::reset ()
663{
664 // nothing to do
665}
666
667
668////////////////////////////////////
669// inconvertclass methods
670////////////////////////////////////
671
// convert from a char stream to the text_t class
// the default version assumes the input is an ascii
// character array
675
676inconvertclass::inconvertclass ()
677{
678 start = NULL;
679 len = 0;
680}
681
682
683void inconvertclass::reset ()
684{
685 start = NULL;
686 len = 0;
687}
688
689void inconvertclass::setinput (char *thestart, size_t thelen)
690{
691 start = thestart;
692 len = thelen;
693}
694
695void inconvertclass::convert (text_t &output, status_t &status)
696{
697 output.clear();
698
699 if (start == NULL || len == 0)
700 {
701 status = finished;
702 return;
703 }
704
705 // don't want any funny sign conversions happening
706 unsigned char *here = (unsigned char *)start;
707 while (len > 0)
708 {
709 output.push_back (*here); // append this character
710 ++here;
711 --len;
712 }
713
714 start = (char *)here; // save current position
715 status = finished;
716}
717
718// will treat the text_t as a 8-bit string and convert
719// it to a 16-bit string using the about convert method.
720text_t inconvertclass::convert (const text_t &t) {
721 text_t out;
722 text_t tmpout;
723 status_t status;
724 text_t::const_iterator here = t.begin();
725 text_t::const_iterator end = t.end();
726 unsigned char cbuf[256];
727 size_t cbuflen = 0;
728
729 while (here != end) {
730 while (here != end && cbuflen < 256) {
731 cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
732 here++;
733 }
734
735 if (cbuflen > 0) {
736 setinput ((char *)cbuf, cbuflen);
737 status = unfinished;
738 while (status == unfinished) {
739 convert (tmpout, status);
740 out += tmpout;
741 }
742 cbuflen = 0;
743 }
744 }
745
746 out.setencoding (0); // unicode
747
748 return out;
749}
750
// An instance of the default inconvertclass to do simple
// conversions. This one object is shared, so any functions that
// use it are not reentrant. If a function needs to be reentrant
// it should declare its own instance.
inconvertclass ascii2text_t;
756
757
758////////////////////////////////////
759// outconvertclass methods
760////////////////////////////////////
761
// Convert from a text_t class to a char stream.
// This default version assumes the output is an ascii
// character array. If you set the output stream you
// can use this class to output to a stream using the
// << operator. The << operator can also be conveniently
// used to set the output stream by doing something like
//
// cout << text_t2ascii << text_tstr << anothertext_tstr;
//
771outconvertclass::outconvertclass ()
772{
773 input = NULL;
774 outs = NULL;
775}
776
777void outconvertclass::reset ()
778{
779 input = NULL;
780 outs = NULL;
781}
782
783void outconvertclass::setinput (text_t *theinput)
784{
785 input = theinput;
786 if (input != NULL) texthere = input->begin();
787}
788
789void outconvertclass::convert (char *output, size_t maxlen,
790 size_t &len, status_t &status)
791{
792 if (input == NULL || output == NULL)
793 {
794 status = finished;
795 return;
796 }
797
798 // don't want any funny sign conversions happening
799 unsigned char *uoutput = (unsigned char *)output;
800 text_t::iterator textend = input->end();
801 len = 0;
802 while ((len < maxlen) && (texthere != textend))
803 {
804 if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
805 else {
806 // put a space or a question mark depending on what
807 // the character is. Question marks tell the user that
808 // they are missing some information.
809 if (is_unicode_space (*texthere)) *uoutput = ' ';
810 else *uoutput = '?';
811 }
812 ++uoutput;
813 ++len;
814 ++texthere;
815 }
816
817 if (texthere == textend) status = finished;
818 else status = unfinished;
819}
820
821// will convert the 16-bit string to a 8-bit stream
822// and place the result in a text_t. This method uses
823// the above convert function.
824text_t outconvertclass::convert (const text_t &t) {
825 text_t out;
826 unsigned char cbuf[256];
827 size_t cbuflen = 0;
828 status_t status = unfinished;
829
830 setinput ((text_t *)&t); // discard constant
831 while (status == unfinished) {
832 convert ((char *)cbuf, 256, cbuflen, status);
833 out.appendcarr ((char *)cbuf, cbuflen);
834 }
835
836 out.setencoding (1); // other encoding
837
838 return out;
839}
840
841
842void outconvertclass::setostream (ostream *theouts)
843{
844 outs = theouts;
845}
846
847ostream *outconvertclass::getostream ()
848{
849 return outs;
850}
851
852
853
854
// An instance of the default outconvertclass to do simple
// conversions.
outconvertclass text_t2ascii;
858
859
860
861// stream operators for the output class
862
863outconvertclass &operator<< (ostream &theouts, outconvertclass &outconverter)
864{
865 outconverter.setostream(&theouts);
866 return outconverter;
867}
868
869
870#define STREAMBUFSIZE 256
871outconvertclass &operator<< (outconvertclass &outconverter, const text_t &t)
872{
873 ostream *outstream = outconverter.getostream();
874
875 if (outstream == NULL) return outconverter;
876
877 char outbuf[STREAMBUFSIZE];
878 size_t len;
879 outconvertclass::status_t status = outconvertclass::unfinished;
880
881 // assume that there is no data needing converting
882 // left in the converter
883 outconverter.setinput ((text_t *)(&t)); // note the const -> nonconst conversion
884
885 while (status == outconvertclass::unfinished)
886 {
887 outconverter.convert (outbuf, STREAMBUFSIZE, len, status);
888 if (len > 0) outstream->write(outbuf, len);
889 }
890
891 return outconverter;
892}
Note: See TracBrowser for help on using the repository browser.