/**********************************************************************
 *
 * gsdlunicode.cpp --
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "gsdlunicode.h"


// unitool is currently in mg, if mg is not being used it should
// be moved into GSDLHOME/lib
#include "unitool.h"

#include "fileutil.h"

// NOTE(review): the names of the system headers below were missing from
// the copy of this file that was reviewed (only the bare #include tokens
// survived). They have been filled in from what the code uses (FILE,
// fopen, sscanf, ifstream) -- confirm against version control.
#include <stdio.h>

#if defined(GSDL_USE_OBJECTSPACE)
#  include <ospace\std\iostream>
#  include <ospace\std\fstream>
#elif defined(GSDL_USE_IOS_H)
#  include <iostream.h>
#  include <fstream.h>
#else
#  include <iostream>
#  include <fstream>
#endif


// converts a unicode encode text_t string to a utf-8
// encoded text_t string
//
// NOTE(review): everything from the inner for-loop condition to the end
// of this function was missing from the reviewed copy and has been
// reconstructed -- confirm against version control.
text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
  text_t out;

  unsigned char thischar[MAXUTF8CHARLEN];
  int i, charlen;

  while (here != end) {
    charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
    for (i=0; i<charlen; i++) out.push_back(thischar[i]);
    here++;
  }

  return out;
}

// converts a utf-8 encoded text_t string to a unicode
// encoded text_t string
//
// NOTE(review): the signature and the first part of this function (up to
// the parse loop condition) were missing from the reviewed copy and have
// been reconstructed around the surviving tail -- confirm against
// version control.
text_t to_uni (const text_t &in) {
  text_t out;
  unsigned char *in_cstr = (unsigned char *)in.getcstr();
  unsigned char *here = in_cstr;
  unsigned char *end = in_cstr;

  unsigned short unichar;
  int charlen = 0;

  // get the last valid character in the string
  while (*end != '\0') end++;
  end--;

  while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
    out.push_back(unichar);
    here += charlen;
  }

  // the C string is a character array, so it needs the array form of delete
  delete [] (char *)in_cstr;

  return out;
}


// starts with an empty partial-character buffer
utf8inconvertclass::utf8inconvertclass () {
  utf8buflen = 0;
}

// forgets the current input and any partially read character
void utf8inconvertclass::reset () {
  start = NULL;
  len = 0;
  utf8buflen=0;
}

// converts the utf-8 bytes at start (len of them) into unicode
// characters appended to a cleared output. Bytes of a character that is
// split across two input buffers are kept in utf8buf between calls.
// status is set to finished when no partial character is pending and
// to stopped otherwise.
void utf8inconvertclass::convert (text_t &output, status_t &status) {
  output.clear();
  output.reserve (len/3);

  if (start == NULL || len == 0) {
    if (utf8buflen == 0) status = finished;
    else status = stopped;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *here = (unsigned char *)start;
  unsigned char *end = here+len-1;
  unsigned short c;
  size_t realcharlen;

  size_t charlen = getutf8charlen ();
  while (len > 0) {
    if (charlen == 0) {
      // start parsing a new character
      utf8buflen = 0;

      // fast common case
      while (len > 3) {
	realcharlen = parse_utf8_char (here, end, &c);
	output.push_back (c);
	here += realcharlen;
	len -= realcharlen;
      }

      utf8buf[utf8buflen++] = *here;
      ++here;
      --len;
      charlen = getutf8charlen ();

    } else if (utf8buflen < charlen) {
      // assumes charlen is always less than MAXUTF8CHARLEN
      utf8buf[utf8buflen++] = *here;
      ++here;
      --len;
    }

    // parse every complete character held in the buffer. This is a loop
    // using >= (not a single == test) because bytes left over after an
    // error can already be as long as, or longer than, the next
    // character; an == test would then never match and the outer loop
    // would spin without consuming input.
    while (charlen > 0 && utf8buflen >= charlen) {
      // got a complete character
      realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
      output.push_back (c);

      // defensive: always make progress and never claim more bytes than
      // are actually buffered
      if (realcharlen == 0 || realcharlen > utf8buflen) realcharlen = utf8buflen;

      // move any unparsed characters. If an error occurred some of
      // the characters might be unused. The unused bytes start right
      // after the realcharlen bytes that were consumed.
      int i;
      int diff = utf8buflen - realcharlen;
      for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+realcharlen];
      utf8buflen = diff;
      charlen = getutf8charlen ();
    }
  }

  start = (char *)here; // save current position

  if (utf8buflen == 0) status = finished;
  else status = stopped;
}


// returns the length that the current contents of the
// utf8buf should be
size_t utf8inconvertclass::getutf8charlen () {
  if (utf8buflen == 0) return 0;

  // one byte character
  if (utf8buf[0] < 0x80) return 1;

  // error, is not the start of a utf-8 character
  if (utf8buf[0] < 0xc0) return 1;

  // two byte character
  if (utf8buf[0] < 0xe0) return 2;

  // three byte character
  if (utf8buf[0] < 0xf0) return 3;

  // error, character too long for unicode
  return 1;
}


// forgets the current input, the output stream and any buffered bytes
void utf8outconvertclass::reset () {
  input = NULL;
  outs = NULL;
  utf8buflen = 0;
  utf8bufhere = 0;
}

// writes up to maxlen utf-8 bytes for the characters from texthere
// onwards into output and reports the number written in len. Bytes of
// a character that did not fit are kept in utf8buf for the next call.
// status is finished once all of the input and the buffer have been
// written, unfinished otherwise.
// note that convert does not null-terminate the
// output array of characters
void utf8outconvertclass::convert (char *output, size_t maxlen,
				   size_t &len, status_t &status) {
  if (input == NULL || output == NULL) {
    if (utf8buflen == 0) status = finished;
    else status = unfinished;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  len = 0;
  while (len < maxlen) {
    // empty the contents of the internal buffer
    if (utf8buflen > 0) {
      while (len < maxlen && utf8bufhere < utf8buflen) {
	*uoutput = utf8buf[utf8bufhere];
	uoutput++;
	len++;
	utf8bufhere++;
      }

      if (utf8bufhere == utf8buflen) {
	utf8bufhere = 0;
	utf8buflen = 0;
      }
    }

    // fill up the buffer with the next character
    if (utf8buflen == 0) {
      if (texthere == textend) break; // finished!

      // when rzws is set, U+200B characters are skipped (nothing is
      // buffered for them)
      if (!rzws || (*texthere != 0x200b))
	utf8buflen = output_utf8_char (*texthere, utf8buf,
				       &utf8buf[MAXUTF8CHARLEN-1]);
      texthere++;
      utf8bufhere = 0;
    }
  }

  if (texthere == textend && utf8buflen == 0) status = finished;
  else status = unfinished;
}


// starts with no blocks allocated and nothing loaded
mapdata_t::mapdata_t () {
  int i;

  // reset all the map ptrs to be NULL
  for (i=0; i<256; i++) {
    ptrs[i] = (unsigned short *)NULL;
  }

  // say nothing has been loaded
  loaded = false;
}


mapconvert::mapconvert () {
  absentc = 0;
}

// setmapfile will cause loadmapfile to be called when conversion is
// needed
bool mapconvert::setmapfile (const text_t &themapfile,
			     unsigned short theabsentc) {
  // check to see if the mapfile has been already loaded
  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;

  unloadmapfile ();
  mapfile = themapfile;
  absentc = theabsentc;

  return true;
}


// loadmapfile should be called before any conversion is done
//
// The file is read as a sequence of blocks: one byte selecting the
// block (the high byte of the character being mapped) followed by up to
// 256 two-byte, high-byte-first entries. Returns false if the file name
// cannot be obtained or the file cannot be opened, true otherwise.
bool mapconvert::loadmapfile (const text_t &themapfile,
			      unsigned short theabsentc) {
  FILE *mapfilein = (FILE *)NULL;

  // check to see if the mapfile has been already loaded
  if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;

  unloadmapfile ();
  mapfile = themapfile;
  absentc = theabsentc;

  // open the map file
  char *cfilename = mapfile.getcstr();
  if (cfilename == (char *)NULL) return false;
  mapfilein = fopen(cfilename, "rb");
  // the C string is a character array, so it needs the array form of delete
  delete [] cfilename;

  if (mapfilein == (FILE *)NULL) return false;

  unsigned char c, n1, n2;
  unsigned short *arrptr;
  int i;
  c = fgetc (mapfilein);
  while (!feof (mapfilein)) {
    if (mapdata.ptrs[c] == (unsigned short *)NULL) {
      // allocate a new array
      arrptr = new unsigned short[256];
      mapdata.ptrs[c] = arrptr;
    } else arrptr = mapdata.ptrs[c];

    // clear the array
    for (i=0; i<256; i++) arrptr[i] = 0;

    // read in this block
    n1 = fgetc (mapfilein);
    n2 = fgetc (mapfilein);
    i=0;
    while (!feof (mapfilein)) {
      arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;

      i++;
      if (i >= 256) break;
      n1 = fgetc (mapfilein);
      n2 = fgetc (mapfilein);
    }

    c = fgetc (mapfilein);
  }

  // everything has been read; release the file handle
  fclose (mapfilein);

  mapdata.loaded = true;

  return true;
}

// frees every allocated block and marks the map as not loaded
void mapconvert::unloadmapfile () {
  if (!mapdata.loaded) return;

  int i;
  for (i=0; i<256; i++) {
    if (mapdata.ptrs[i] != (unsigned short *)NULL) {
      delete [] mapdata.ptrs[i];
      mapdata.ptrs[i] = (unsigned short *)NULL;
    }
  }

  mapdata.loaded = false;
}


// maps a single character, loading the map file first if that has not
// been done yet. absentc is returned when the map cannot be loaded or
// holds no entry for c.
unsigned short mapconvert::convert (unsigned short c) {
  if (!mapdata.loaded) {
    if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
      // do nothing, successfully loaded database
    } else return absentc;
  }

  if (c == 0) return 0; // 0 always maps to 0...

  unsigned short n1 = c >> 8;
  unsigned short n2 = c & 0xff;

  unsigned short *arrptr = mapdata.ptrs[n1];
  if (arrptr == (unsigned short *)NULL) return absentc;

  if (arrptr[n2] == 0) return absentc;
  return arrptr[n2];
}

// maps every character of instr through the single-character convert
//
// NOTE(review): when no map is loaded this returns a text_t built from
// absentc instead of attempting the load that the single-character
// overload performs -- confirm this is intended.
text_t mapconvert::convert (const text_t &instr) {
  if (!mapdata.loaded) return absentc;

  text_t outstr;
  text_t::const_iterator here = instr.begin();
  text_t::const_iterator end = instr.end();

  while (here != end) {
    outstr.push_back(this->convert(*here));
    here++;
  }

  return outstr;
}


mapinconvertclass::mapinconvertclass () {
  multibyte = 0;
  mapbuflen = 0;
}

// forgets the current input and any partially read character
void mapinconvertclass::reset () {
  start = NULL;
  len = 0;
  mapbuflen=0;
}

// converts the bytes at start (len of them) into characters appended to
// a cleared output. One-byte values below 0x80 are copied unchanged;
// everything else goes through converter. Bytes of an incomplete
// character are kept in mapbuf between calls. status is finished when
// no partial character is pending and stopped otherwise.
void mapinconvertclass::convert (text_t &output, status_t &status) {
  output.clear();

  if (start == NULL || len == 0) {
    if (mapbuflen == 0) status = finished;
    else status = stopped;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *here = (unsigned char *)start;

  size_t charlen = getmapcharlen ();
  while (len > 0) {
    if (charlen == 0) {
      // start parsing a new character
      mapbuflen = 0;
      mapbuf[mapbuflen++] = *here;
      ++here;
      --len;
      charlen = getmapcharlen ();

    } else if (mapbuflen < charlen) {
      // assumes charlen is always less than MAXMAPCHARLEN
      mapbuf[mapbuflen++] = *here;
      ++here;
      --len;
    }

    if (mapbuflen == charlen) {
      // got a complete character
      if (charlen == 1) {
	if (mapbuf[0] < 0x80) {
	  // ascii character
	  output.push_back (mapbuf[0]);
	} else {
	  output.push_back (converter.convert((unsigned short)mapbuf[0]));
	}

      } else {
	// two byte character
	output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
					    (unsigned short)mapbuf[1]));
      }

      mapbuflen = 0;
      charlen = 0;
    }
  }

  start = (char *)here; // save current position

  if (mapbuflen == 0) status = finished;
  else status = stopped;
}


mapoutconvertclass::mapoutconvertclass () {
  multibyte = 0;
  mapbuflen=0;
  mapbufhere=0;
}

// forgets the current input, the output stream and any buffered bytes
void mapoutconvertclass::reset () {
  input = NULL;
  outs = NULL;
  mapbuflen = 0;
  mapbufhere = 0;
}

// writes up to maxlen mapped bytes for the characters from texthere
// onwards into output and reports the number written in len.
// Characters below 0x80 are written unchanged; others go through
// converter and are written as two bytes (high byte first) when
// multibyte is set, one byte otherwise. status is finished once all of
// the input and the buffer have been written, unfinished otherwise.
// note that convert does not null-terminate the
// output array of characters
void mapoutconvertclass::convert (char *output, size_t maxlen,
				  size_t &len, status_t &status) {
  unsigned short outc;

  if (input == NULL || output == NULL) {
    if (mapbuflen == 0) status = finished;
    else status = unfinished;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  len = 0;
  while (len < maxlen) {
    // empty the contents of the internal buffer
    if (mapbuflen > 0) {
      while (len < maxlen && mapbufhere < mapbuflen) {
	*uoutput = mapbuf[mapbufhere];
	uoutput++;
	len++;
	mapbufhere++;
      }

      if (mapbufhere == mapbuflen) {
	mapbufhere = 0;
	mapbuflen = 0;
      }
    }

    // fill up the buffer with the next character
    if (mapbuflen == 0) {
      if (texthere == textend) break; // finished!

      // when rzws is set, U+200B characters are skipped (nothing is
      // buffered for them)
      if (!rzws || (*texthere != 0x200b)) {
	if (*texthere < 0x80) {
	  mapbuf[0] = (unsigned char)*texthere;
	  mapbuflen = 1;
	} else {
	  outc = converter.convert (*texthere);
	  if (multibyte) {
	    mapbuf[0] = (unsigned char)(outc >> 8);
	    mapbuf[1] = (unsigned char)(outc & 0xff);
	    mapbuflen = 2;
	  } else {
	    mapbuf[0] = outc;
	    mapbuflen = 1;
	  }
	}
      }

      texthere++;
      mapbufhere = 0;
    }
  }

  if (texthere == textend && mapbuflen == 0) status = finished;
  else status = unfinished;
}


// reads the text map file named by mapfile into mapping. Each usable
// line holds two tab separated values that start with "0x"; anything
// after a '#' is ignored. When in is true the first value maps to the
// second, otherwise the second maps to the first. Returns false if no
// map file has been set or it cannot be opened.
bool simplemapconvert::loadmapfile (bool in) {
  if (loaded) return true;
  if (mapfile.empty()) return false;

  char *cfilename = mapfile.getcstr();
#ifdef GSDL_USE_IOS_H
  ifstream mapfilein (cfilename, ios::in | ios::nocreate);
#else
  ifstream mapfilein (cfilename, ios::in);
#endif
  // the C string is a character array, so it needs the array form of delete
  delete [] cfilename;
  if (!mapfilein) return false;

  char cline[2048];
  text_t line;

  // test the result of getline itself: looping on !eof() never ends
  // once the stream enters a failed state without reaching end-of-file
  // (for example after a line that does not fit in cline)
  while (mapfilein.getline (cline, 2048)) {
    line.clear();
    line.appendcstr (cline);
    if (line.empty()) continue;

    // remove comments
    text_t::iterator end = line.end();
    text_t::iterator here = findchar (line.begin(), end, '#');
    if (here != end) {
      line.erase (here, end);
      if (line.empty()) continue;
    }

    text_tarray parts;
    splitchar (line.begin(), line.end(), '\t', parts);

    // do some simple sanity checks
    if (parts.size() < 2) continue;
    text_t::iterator begin1 = parts[0].begin();
    text_t::iterator end1 = parts[0].end();
    text_t::iterator begin2 = parts[1].begin();
    text_t::iterator end2 = parts[1].end();
    // both fields need at least two characters before the "0x" prefix
    // can be inspected safely
    if (begin1 == end1 || (begin1+1) == end1) continue;
    if (begin2 == end2 || (begin2+1) == end2) continue;
    if (*begin1 != '0' || *(begin1+1) != 'x') continue;
    if (*begin2 != '0' || *(begin2+1) != 'x') continue;

    char *from = parts[0].getcstr();
    char *to = parts[1].getcstr();
    unsigned int f = 0, t = 0;
    sscanf (from, "%i", &f);
    sscanf (to, "%i", &t);
    delete [] from;
    delete [] to;

    if (in) mapping[(unsigned short)f] = (unsigned short)t;
    else mapping[(unsigned short)t] = (unsigned short)f;
  }

  loaded = true;
  return true;
}

// looks c up in mapping, loading the map file in the direction given
// by in first if necessary; absentc is returned when the load fails
unsigned short simplemapconvert::convert (unsigned short c, bool in) {
  if (!loaded)
    if (!loadmapfile(in)) return absentc;

  return mapping[c];
}


// converts the bytes at start (len of them) into characters appended to
// a cleared output: bytes below 0x80 are copied, the rest are mapped by
// converter. status is always set to finished.
void simplemapinconvertclass::convert (text_t &output, status_t &status) {
  output.clear();

  if (start == NULL || len == 0) {
    status = finished;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *here = (unsigned char *)start;
  while (len > 0) {

    if (*here < 0x80)
      output.push_back (*here); // append this character
    else
      output.push_back (converter.convert(*here, true));

    ++here;
    --len;
  }

  start = (char *)here; // save current position
  status = finished;
}


// writes one byte per character from texthere onwards into output, up
// to maxlen bytes, and reports the number written in len: characters
// below 0x80 are copied, the rest are mapped by converter. status is
// finished once the end of the input is reached, unfinished otherwise.
void simplemapoutconvertclass::convert (char *output, size_t maxlen,
					size_t &len, status_t &status) {
  if (input == NULL || output == NULL) {
    status = finished;
    return;
  }

  // don't want any funny sign conversions happening
  unsigned char *uoutput = (unsigned char *)output;
  text_t::iterator textend = input->end();
  len = 0;
  while ((len < maxlen) && (texthere != textend)) {

    if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
    else *uoutput = converter.convert (*texthere, false);

    ++uoutput;
    ++len;
    ++texthere;
  }

  if (texthere == textend) status = finished;
  else status = unfinished;
}