/********************************************************************** * * gsdlunicode.h -- * Copyright (C) 1999 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #ifndef GSDLUNICODE_H #define GSDLUNICODE_H #include "text_t.h" // converts a unicode encode text_t string to a utf-8 // encoded text_t string text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end); inline text_t to_utf8 (const text_t &in) {return to_utf8 (in.begin(), in.end());} // converts a utf-8 encoded text_t string to a unicode // encoded text_t string text_t to_uni (const text_t &in); #define MAXUTF8CHARLEN 3 // convert from a utf-8 char stream to the text_t class class utf8inconvertclass : public inconvertclass { public: utf8inconvertclass(); void reset (); void convert (text_t &output, status_t &status); protected: // buffer to hold unconverted characters in a stream unsigned char utf8buf[MAXUTF8CHARLEN]; size_t utf8buflen; // returns the length that the current contents of the // utf8buf should be size_t getutf8charlen (); }; // This class provides the option of removing zero width // spaces (U+200B) during the output. By default this // option is turned off. The functionality is actually // implemented by the sub-classes, this class just provides // the framework for these classes. // // Note: by convention reset() should not reset the rzws flag. class rzwsoutconvertclass : public outconvertclass { public: rzwsoutconvertclass () {rzws = 0;}; void set_rzws (int new_rzws) {rzws = new_rzws;}; protected: int rzws; }; // Convert from a text_t class to a utf-8 char stream class utf8outconvertclass : public rzwsoutconvertclass { public: utf8outconvertclass () {utf8buflen=0; utf8bufhere=0;}; void reset (); // note that convert does not null-terminate the // output array of characters void convert (char *output, size_t maxlen, size_t &len, status_t &status); protected: unsigned char utf8buf[MAXUTF8CHARLEN]; size_t utf8buflen; size_t utf8bufhere; }; // mapdata_t is used by mapconvert to hold the map file data class mapdata_t { public: mapdata_t(); bool loaded; unsigned short *ptrs[256]; }; // mapconvert is used in situations where conversion is best // done using a map file. The mapfile should reside in // gsdlhome/unicode. class mapconvert { public: mapconvert (); ~mapconvert () {unloadmapfile();}; // setmapfile will cause loadmapfile to be called when conversion is // needed bool setmapfile (const text_t &themapfile, unsigned short theabsentc); // loadmapfile should be called before any conversion is done bool loadmapfile (const text_t &themapfile, unsigned short theabsentc); void unloadmapfile (); unsigned short convert (unsigned short c); // note that this version of convert has different semantics to // the convertclass version. text_t convert (const text_t &instr); protected: text_t mapfile; unsigned short absentc; mapdata_t mapdata; }; #define MAXMAPCHARLEN 2 // convert from a gb char stream to the unicode text_t class class mapinconvertclass : public inconvertclass { public: mapinconvertclass(); virtual ~mapinconvertclass() {}; // setmapfile will cause loadmapfile to be called when conversion is needed bool setmapfile (const text_t &themapfile, unsigned short theabsentc) { return converter.setmapfile (themapfile, theabsentc); }; // loadmapfile should be called before any conversion takes // place bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) { return converter.loadmapfile (themapfile, theabsentc); }; void set_multibyte (int new_multibyte) {multibyte = new_multibyte;}; void reset (); void convert (text_t &output, status_t &status); protected: // buffer to hold unconverted characters in a stream unsigned char mapbuf[MAXMAPCHARLEN]; size_t mapbuflen; int multibyte; // note: multiple instances of mapinconvert class are expensive // as each will have its own copy of the map file data. This // could be reduced by making map2unimap static, but then it // wouldn't be thread safe. mapconvert converter; // returns the length that the current contents of the // mapbuf should be inline size_t getmapcharlen () { if (mapbuflen == 0) return 0; if (mapbuf[0] < 0x80) return 1; if (!multibyte) return 1; return 2; } }; // Convert from a text_t class to a map char stream class mapoutconvertclass : public rzwsoutconvertclass { public: mapoutconvertclass (); virtual ~mapoutconvertclass() {}; // setmapfile will cause loadmapfile to be called when conversion is needed bool setmapfile (const text_t &themapfile, unsigned short theabsentc) { return converter.setmapfile (themapfile, theabsentc); }; // loadmapfile should be called before any conversion takes // place bool loadmapfile (const text_t &themapfile, unsigned short theabsentc) { return converter.loadmapfile (themapfile, theabsentc); }; void set_multibyte (int new_multibyte) {multibyte = new_multibyte;}; void reset (); void convert (char *output, size_t maxlen, size_t &len, status_t &status); protected: unsigned char mapbuf[MAXMAPCHARLEN]; size_t mapbuflen; size_t mapbufhere; int multibyte; mapconvert converter; }; // Simple input and output converter classes for use with 8 bit encodings // using simple textual map files. Map files should contain (at least) two // tab-separated fields. The first field is the mapped value and the second // field is the unicode value. struct ltus_t { bool operator()(const unsigned short &t1, const unsigned short &t2) const { return t1 < t2; } }; class simplemapconvert { public: simplemapconvert () {absentc=0; loaded=false;} unsigned short convert (unsigned short c, bool in); void setmapfile (const text_t &themapfile) {mapfile = themapfile;} protected: bool loadmapfile (bool in); map mapping; bool loaded; text_t mapfile; unsigned short absentc; }; class simplemapinconvertclass : public inconvertclass { public: virtual ~simplemapinconvertclass () {} void convert (text_t &output, status_t &status); void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);} protected: simplemapconvert converter; }; class simplemapoutconvertclass : public rzwsoutconvertclass { public: virtual ~simplemapoutconvertclass () {} void convert (char *output, size_t maxlen, size_t &len, status_t &status); void setmapfile (const text_t &themapfile) {converter.setmapfile(themapfile);} protected: simplemapconvert converter; }; #endif