/********************************************************************** * * gsdlunicode.cpp -- * Copyright (C) 1999 The New Zealand Digital Library Project * * PUT COPYRIGHT NOTICE HERE * * $Id: gsdlunicode.cpp 100 1999-01-08 02:33:16Z rjmcnab $ * *********************************************************************/ /* $Log$ Revision 1.4 1999/01/08 02:33:15 rjmcnab Added standard header to source files. */ static char *RCSID = "$Id: gsdlunicode.cpp 100 1999-01-08 02:33:16Z rjmcnab $"; #include "gsdlunicode.h" // unitool is currently in mg, if mg is not being used it should // be moved into GSDLHOME/lib #include "unitool.h" #include "fileutil.h" #include // converts a unicode encode text_t string to a utf-8 // encoded text_t string text_t to_utf8 (const text_t &in) { text_t::const_iterator here = in.begin(); text_t::const_iterator end = in.end(); text_t out; unsigned char thischar[MAXUTF8CHARLEN]; int i, charlen; while (here != end) { charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]); for (i=0; i 0) { out.push_back(unichar); here += charlen; } delete in_cstr; return out; } utf8inconvertclass::utf8inconvertclass () { utf8buflen = 0; } void utf8inconvertclass::reset () { start = NULL; len = 0; utf8buflen=0; } void utf8inconvertclass::convert (text_t &output, status_t &status) { output.clear(); if (start == NULL || len == 0) { if (utf8buflen == 0) status = finished; else status = stopped; return; } // don't want any funny sign conversions happening unsigned char *here = (unsigned char *)start; size_t charlen = getutf8charlen (); unsigned short c; size_t realcharlen; while (len > 0) { if (charlen == 0) { // start parsing a new character utf8buflen = 0; utf8buf[utf8buflen++] = *here; ++here; --len; charlen = getutf8charlen (); } else if (utf8buflen < charlen) { // assumes charlen is always less than MAXUTF8CHARLEN utf8buf[utf8buflen++] = *here; ++here; --len; } if (utf8buflen == charlen) { // got a complete character realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c); output.push_back (c); // move any unparsed characters. If an error occurred some of // the characters might be unused. int i; int diff = utf8buflen - realcharlen; for (i=0; i < diff; i++) utf8buf[i] = utf8buf[i+diff]; utf8buflen = diff; charlen = getutf8charlen (); } } start = (char *)here; // save current position if (utf8buflen == 0) status = finished; else status = stopped; } // returns the length that the current contents of the // utf8buf should be size_t utf8inconvertclass::getutf8charlen () { if (utf8buflen == 0) return 0; // one byte character if (utf8buf[0] < 0x80) return 1; // error, is not the start of a utf-8 character if (utf8buf[0] < 0xc0) return 1; // two bute character if (utf8buf[0] < 0xe0) return 2; // three byte character if (utf8buf[0] < 0xf0) return 3; // error, character too long for unicode return 1; } void utf8outconvertclass::reset () { input = NULL; outs = NULL; utf8buflen = 0; utf8bufhere = 0; } // note that convert does not null-terminate the // output array of characters void utf8outconvertclass::convert (char *output, size_t maxlen, size_t &len, status_t &status) { if (input == NULL || output == NULL) { if (utf8buflen == 0) status = finished; else status = unfinished; return; } // don't want any funny sign conversions happening unsigned char *uoutput = (unsigned char *)output; text_t::iterator textend = input->end(); len = 0; while (len < maxlen) { // empty the contents of the internal buffer if (utf8buflen > 0) { while (len < maxlen && utf8bufhere < utf8buflen) { *uoutput = utf8buf[utf8bufhere]; uoutput++; len++; utf8bufhere++; } if (utf8bufhere == utf8buflen) { utf8bufhere = 0; utf8buflen = 0; } } // fill up the buffer with the next character if (utf8buflen == 0) { if (texthere == textend) break; // finished! if (!rzws || (*texthere != 0x200b)) utf8buflen = output_utf8_char (*texthere, utf8buf, &utf8buf[MAXUTF8CHARLEN-1]); texthere++; utf8bufhere = 0; } } if (texthere == textend && utf8buflen == 0) status = finished; else status = unfinished; } mapdata_t::mapdata_t () { int i; // reset all the map ptrs to be NULL for (i=0; i<256; i++) { ptrs[i] = (unsigned short *)NULL; } // say nothing has been loaded loaded = false; } mapconvert::mapconvert () { absentc = 0; } // loadmapfile should be called before any conversion is done bool mapconvert::loadmapfile (const text_t &thegsdlhome, const text_t &theencoding, unsigned short theabsentc) { FILE *mapfilein = (FILE *)NULL; // check to see if the mapfile has been already loaded if (mapdata.loaded && gsdlhome == thegsdlhome && encoding == theencoding && absentc == theabsentc) return true; unloadmapfile (); gsdlhome = thegsdlhome; encoding = theencoding; absentc = theabsentc; // open the map file text_t filename = filename_cat (gsdlhome, "unicode"); filename = filename_cat (filename, encoding); filename += ".ump"; char *cfilename = filename.getcstr(); if (cfilename == (char *)NULL) return false; mapfilein = fopen(cfilename, "rb"); delete cfilename; if (mapfilein == (FILE *)NULL) return false; unsigned char c, n1, n2; unsigned short *arrptr; int i; c = fgetc (mapfilein); while (!feof (mapfilein)) { if (mapdata.ptrs[c] == (unsigned short *)NULL) { // allocate a new array arrptr = new unsigned short[256]; mapdata.ptrs[c] = arrptr; } else arrptr = mapdata.ptrs[c]; // clear the array for (i=0; i<256; i++) arrptr[i] = 0; // read in this block n1 = fgetc (mapfilein); n2 = fgetc (mapfilein); i=0; while (!feof (mapfilein)) { arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2; i++; if (i >= 256) break; n1 = fgetc (mapfilein); n2 = fgetc (mapfilein); } c = fgetc (mapfilein); } mapdata.loaded = true; return true; } void mapconvert::unloadmapfile () { if (!mapdata.loaded) return; int i; for (i=0; i<256; i++) { if (mapdata.ptrs[i] != (unsigned short *)NULL) { delete [] mapdata.ptrs[i]; mapdata.ptrs[i] = (unsigned short *)NULL; } } mapdata.loaded = false; } unsigned short mapconvert::convert (unsigned short c) { if (!mapdata.loaded) return absentc; if (c == 0) return 0; // 0 always maps to 0... unsigned short n1 = c >> 8; unsigned short n2 = c & 0xff; unsigned short *arrptr = mapdata.ptrs[n1]; if (arrptr == (unsigned short *)NULL) return absentc; if (arrptr[n2] == 0) return absentc; return arrptr[n2]; } text_t mapconvert::convert (const text_t &instr) { if (!mapdata.loaded) return absentc; text_t outstr; text_t::const_iterator here = instr.begin(); text_t::const_iterator end = instr.end(); while (here != end) { outstr.push_back(this->convert(*here)); here++; } return outstr; } mapinconvertclass::mapinconvertclass () { mapbuflen = 0; } void mapinconvertclass::reset () { start = NULL; len = 0; mapbuflen=0; } void mapinconvertclass::convert (text_t &output, status_t &status) { output.clear(); if (start == NULL || len == 0) { if (mapbuflen == 0) status = finished; else status = stopped; return; } // don't want any funny sign conversions happening unsigned char *here = (unsigned char *)start; size_t charlen = getmapcharlen (); unsigned short c; size_t realcharlen; while (len > 0) { if (charlen == 0) { // start parsing a new character mapbuflen = 0; mapbuf[mapbuflen++] = *here; ++here; --len; charlen = getmapcharlen (); } else if (mapbuflen < charlen) { // assumes charlen is always less than MAXMAPCHARLEN mapbuf[mapbuflen++] = *here; ++here; --len; } if (mapbuflen == charlen) { // got a complete character if (charlen == 1) { // ascii character output.push_back (mapbuf[0]); } else { // two byte character output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) | (unsigned short)mapbuf[1])); } mapbuflen = 0; charlen = 0; } } start = (char *)here; // save current position if (mapbuflen == 0) status = finished; else status = stopped; } mapoutconvertclass::mapoutconvertclass () { mapbuflen=0; mapbufhere=0; } void mapoutconvertclass::reset () { input = NULL; outs = NULL; mapbuflen = 0; mapbufhere = 0; } // note that convert does not null-terminate the // output array of characters void mapoutconvertclass::convert (char *output, size_t maxlen, size_t &len, status_t &status) { unsigned short outc; if (input == NULL || output == NULL) { if (mapbuflen == 0) status = finished; else status = unfinished; return; } // don't want any funny sign conversions happening unsigned char *uoutput = (unsigned char *)output; text_t::iterator textend = input->end(); len = 0; while (len < maxlen) { // empty the contents of the internal buffer if (mapbuflen > 0) { while (len < maxlen && mapbufhere < mapbuflen) { *uoutput = mapbuf[mapbufhere]; uoutput++; len++; mapbufhere++; } if (mapbufhere == mapbuflen) { mapbufhere = 0; mapbuflen = 0; } } // fill up the buffer with the next character if (mapbuflen == 0) { if (texthere == textend) break; // finished! if (!rzws || (*texthere != 0x200b)) { if (*texthere < 0x80) { mapbuf[0] = (unsigned char)*texthere; mapbuflen = 1; } else { outc = converter.convert (*texthere); mapbuf[0] = (unsigned char)(outc >> 8); mapbuf[1] = (unsigned char)(outc & 0xff); mapbuflen = 2; } } texthere++; mapbufhere = 0; } } if (texthere == textend && mapbuflen == 0) status = finished; else status = unfinished; }