/**************************************************************************
 *
 * UCArray.cpp -- vector based string class
 * Copyright (C) 1999 Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "UCArray.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */
#include "unitool.h"


// Replaces the contents of text with the bytes of the NUL-terminated
// string cStr (the terminator itself is not stored).
// A NULL cStr simply leaves text empty.
void SetCStr (UCArray &text, const char *cStr) {
  text.erase(text.begin(), text.end());
  if (cStr == NULL) return;

  for (; *cStr != '\0'; ++cStr) {
    text.push_back (*cStr);
  }
}

// Same as SetCStr(text, cStr), but reserves room for nSizeHint + 1
// elements up front so the copy does not reallocate repeatedly.
// nSizeHint is only a hint: the copy always stops at the NUL terminator.
// A NULL cStr simply leaves text empty.
void SetCStr (UCArray &text, const char *cStr, size_t nSizeHint) {
  text.erase(text.begin(), text.end());

  // reserve the needed space in advance
  if (text.capacity() < nSizeHint + 1) {
    text.reserve(nSizeHint + 1);
  }
  if (cStr == NULL) return;

  for (; *cStr != '\0'; ++cStr) {
    text.push_back (*cStr);
  }
}

// Returns a newly allocated, NUL-terminated copy of text.
// The buffer is created with new[]; the caller must release it
// with delete[].
char * GetCStr(const UCArray& text) {
  char *cstr = new char[text.size()+1];

  size_t i = 0;
  for (UCArray::const_iterator here = text.begin(), end = text.end();
       here != end; ++here, ++i) {
    cstr[i] = *here;
  }
  cstr[i] = '\0';

  return cstr;
}

// Returns a newly allocated length-prefixed copy of text:
// byte 0 holds the length, bytes 1..size() hold the data.
// No terminating NUL is written (the buffer is exactly size()+1 bytes).
// The length byte is text.size() truncated to 8 bits, so for arrays
// longer than 255 elements it does not describe the full data.
// The buffer is created with new[]; the caller must release it
// with delete[].
unsigned char * MyGetCStr(const UCArray& text) {
  unsigned char *cstr = new unsigned char[text.size()+1];
  cstr[0] = static_cast<unsigned char>(text.size());

  size_t i = 1;
  for (UCArray::const_iterator here = text.begin(), end = text.end();
       here != end; ++here, ++i) {
    cstr[i] = *here;
  }
  // (deliberately no cstr[i]='\0' here: there is no room for one)

  return cstr;
}

// Returns true when text holds exactly the bytes of the NUL-terminated
// string cStr. A NULL cStr is treated like the empty string.
bool UCArrayCStrEquals(const UCArray &text, const unsigned char *cStr) {
  // a NULL string can only match an empty array; never dereference it
  if (cStr == NULL) return text.empty();

  UCArray::const_iterator thisUC = text.begin();
  UCArray::const_iterator endUC = text.end();
  while (thisUC != endUC && *cStr != '\0') {
    if (*thisUC != *cStr) return false;
    ++cStr;
    ++thisUC;
  }

  // equal only if both sequences ended at the same time
  return (thisUC == endUC && *cStr == '\0');
}

// Streams every element of a to s, one after another, and returns s.
ostream &operator<<(ostream &s, const UCArray &a) {
  for (UCArray::const_iterator here = a.begin(), end = a.end();
       here != end; ++here) {
    s << *here;
  }
  return s;
}


// Reads a variable-length unsigned integer: 7 data bits per byte, least
// significant group first, bit 0x80 set on every byte except the last.
// At most five bytes are consumed (reading stops once 32 or more bit
// positions have been covered).
// Returns false, leaving n untouched, on end of file or read error.
bool ReadVarLenUL (FILE *f, unsigned long &n) {
  unsigned long temp = 0;
  unsigned int bitPos = 0;
  int c = 0;

  do {
    // keep the int result so EOF (end of file or error) stays
    // distinguishable from a genuine 0xff data byte
    c = fgetc (f);
    if (c == EOF) return false;
    // widen before shifting: shifting an int by up to 28 bits overflows
    temp |= static_cast<unsigned long>(c & 0x7f) << bitPos;
    bitPos += 7;
  } while (c >= 0x80 && bitPos < 32);

  n = temp;
  return true;
}

// Writes n in the variable-length format read by ReadVarLenUL: 7 bits per
// byte, least significant group first, 0x80 marking "more bytes follow".
// Returns false as soon as the stream reports an error.
bool WriteVarLenUL (FILE *f, unsigned long n) {
  unsigned long temp = n;

  do {
    unsigned char b = static_cast<unsigned char>(temp & 0x7f);
    if (temp >= 0x80) b |= 0x80;
    fputc (b, f);
    if (ferror (f) != 0) return false;
  } while ((temp = temp >> 7) > 0);

  return true;
}


// Reads sizeof(unsigned long) raw bytes into n and passes the result
// through NTOHUL (netorder.h; presumably network-to-host byte order).
// Returns false if the value could not be read completely.
// NOTE(review): the on-disk width follows sizeof(unsigned long) of the
// build, so files may not be portable between 32- and 64-bit builds.
bool ReadUL (FILE *f, unsigned long &n) {
  if (fread (&n, sizeof (unsigned long), 1, f) != 1) return false;
  NTOHUL (n);
  return true;
}

// Passes n through HTONUL (netorder.h; presumably host-to-network byte
// order) and writes its sizeof(unsigned long) raw bytes.
// Returns true when the value was written.
bool WriteUL (FILE *f, unsigned long n) {
  HTONUL (n);
  return (fwrite (&n, sizeof (unsigned long), 1, f) == 1);
}

// Reads a raw float into n and passes it through NTOHF (netorder.h).
// Returns false if the value could not be read completely.
bool ReadF (FILE *f, float &n) {
  if (fread (&n, sizeof (float), 1, f) != 1) return false;
  NTOHF(n);
  return true;
}

// Passes n through HTONF (netorder.h) and writes its raw bytes.
// Returns true when the value was written.
bool WriteF (FILE *f, float n) {
  HTONF(n);
  return (fwrite (&n, sizeof (float), 1, f) == 1);
}

// Reads a raw double into n and passes it through NTOHD (netorder.h).
// Returns false if the value could not be read completely.
bool ReadD (FILE *f, double &n) {
  if (fread (&n, sizeof (double), 1, f) != 1) return false;
  NTOHD(n);
  return true;
}

// Passes n through HTOND (netorder.h) and writes its raw bytes.
// Returns true when the value was written.
bool WriteD (FILE *f, double n) {
  HTOND(n);
  return (fwrite (&n, sizeof (double), 1, f) == 1);
}


// Reads an array written by WriteUCArray: a variable-length element count
// followed by that many bytes. a is cleared first.
// Returns false on end of file or read error; a then holds whatever was
// read before the failure.
bool ReadUCArray (FILE *f, UCArray &a) {
  // clear the array in preparation
  a.erase (a.begin(), a.end());

  // read in the array size
  unsigned long arraySize = 0;
  if (!ReadVarLenUL (f, arraySize)) return false;

  // reserve the needed space in advance
  if (a.capacity() < arraySize + 1) {
    a.reserve(arraySize + 1);
  }

  // read in the array
  for (; arraySize > 0; --arraySize) {
    // test the int result so a read error is not mistaken for data
    const int c = fgetc (f);
    if (c == EOF) return false;
    a.push_back (static_cast<unsigned char>(c));
  }

  return true;
}

// Writes a as a variable-length element count followed by the elements,
// one byte each. Returns false as soon as the stream reports an error.
bool WriteUCArray (FILE *f, const UCArray &a) {
  // write out the array size
  if (!WriteVarLenUL (f, a.size())) return false;

  for (UCArray::const_iterator here = a.begin(), end = a.end();
       here != end; ++here) {
    fputc (*here, f);
    if (ferror (f) != 0) return false;
  }

  return true;
}


/*
 * This array is designed for mapping upper and lower case letters
 * together for a case independent comparison. The mappings are
 * based upon ascii character sequences.
 */
// static unsigned char casecharmap[] = {
//   '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
//   '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
//   '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
//   '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
//   '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
//   '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
//   '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
//   '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
//   '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
//   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
//   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
//   '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
//   '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
//   '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
//   '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
//   '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
//   '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
//   '\210', '\211', '\212',
'\213', '\214', '\215', '\216', '\217', // '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', // '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', // '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', // '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', // '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', // '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', // '\300', '\341', '\342', '\343', '\344', '\345', '\346', '\347', // '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', // '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', // '\370', '\371', '\372', '\333', '\334', '\335', '\336', '\337', // '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', // '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', // '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', // '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', // }; int DictCompare (const UCArray &a1, const UCArray &a2) { unsigned short a1_out[256]; /* temp space */ unsigned short a2_out[256]; /* temp space */ unsigned char * a1_str = (unsigned char *)MyGetCStr(a1); unsigned char * a2_str = (unsigned char *)MyGetCStr(a2); /* decode the words to unicode */ utf8_word_to_unicode (a1_str, a1_out, 255); utf8_word_to_unicode (a2_str, a2_out, 255); int l1 = a1_out[0]; int l2 = a2_out[0]; int len = (l1 < l2) ? l1 : l2; int pos = 0; int diff = 0; for (int i=1; i<=len; ++i) { if ((diff = unicode_tosimplified(unicode_tolower(a1_out[i])) - unicode_tosimplified(unicode_tolower(a2_out[i]))) != 0) { return diff; } if ((pos == 0) && (diff = a1_out[i] - a2_out[i]) != 0) { pos = diff; } } return ((l1 - l2) ? (l1 - l2) : (pos)); } // int DictCompare (const UCArray &a1, const UCArray &a2) { // unsigned int l1 = a1.size(); // unsigned int l2 = a2.size(); // unsigned int l = (l1 < l2) ? 
l1 : l2; // int pos = 0; // register int diff = 0; // UCArray::const_iterator a1Here = a1.begin(); // UCArray::const_iterator a2Here = a2.begin(); // while (l--) { // if ((diff = casecharmap[*a1Here] - casecharmap[*a2Here]) != 0) // return diff; // if (pos == 0 && (diff = *a1Here - *a2Here) != 0) // pos = diff; // ++a1Here; // ++a2Here; // } // return ((l1 - l2) ? (l1 - l2) : (pos)); // } // does the first string start with the second? bool StartsWith (const UCArray &a1, const UCArray &a2) { unsigned int l1 = a1.size(); unsigned int l2 = a2.size(); if (l2 > l1) { // if the prefix is longer than the string, it can't start with it return false; } unsigned int l =l2; UCArray::const_iterator a1Here = a1.begin(); UCArray::const_iterator a2Here = a2.begin(); while (l--) { if ((*a1Here != *a2Here)) return false; ++a1Here; ++a2Here; } return true; // we have successfully matched the whole way } // does the first string start with the second, ignoring case? bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) { unsigned int l1 = a1.size(); unsigned int l2 = a2.size(); if (l2 > l1) { // if the prefix is longer than the string, it can't start with it return false; } unsigned short a1_out[256]; /* temp space */ unsigned short a2_out[256]; /* temp space */ unsigned char * a1_str = (unsigned char *)MyGetCStr(a1); unsigned char * a2_str = (unsigned char *)MyGetCStr(a2); /* decode the words to unicode */ utf8_word_to_unicode (a1_str, a1_out, 255); utf8_word_to_unicode (a2_str, a2_out, 255); unsigned int len = a2_out[0];; for (int i=1; i<=len; ++i) { if (unicode_tosimplified(unicode_tolower(a1_out[i])) != unicode_tosimplified(unicode_tolower(a2_out[i])) ) return false; } return true; // we have successfully matched the whole way } // does the first string start with the second, ignoring case? 
// bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) { // unsigned int l1 = a1.size(); // unsigned int l2 = a2.size(); // if (l2 > l1) { // // if the prefix is longer than the string, it can't start with it // return false; // } // unsigned int l =l2; // UCArray::const_iterator a1Here = a1.begin(); // UCArray::const_iterator a2Here = a2.begin(); // while (l--) { // if (casecharmap[*a1Here] != casecharmap[*a2Here]) // return false; // ++a1Here; // ++a2Here; // } // return true; // we have successfully matched the whole way // } unsigned long PrefixLen (const UCArray &a1, const UCArray &a2) { unsigned long l = (a1.size() < a2.size()) ? a1.size() : a2.size(); unsigned long i = 0; UCArray::const_iterator a1Here = a1.begin(); UCArray::const_iterator a2Here = a2.begin(); while (i < l && *a1Here == *a2Here) { ++i; ++a1Here; ++a2Here; } return i; } bool WritePreSufStr (FILE *f, const UCArray *prev, const UCArray &a) { unsigned char preLen; unsigned char sufLen; if (prev != NULL) preLen = PrefixLen (*prev, a); else preLen = 0; sufLen = a.size() - preLen; // output the prefix length, suffix length, and the suffix fputc (preLen, f); if (ferror(f) != 0) return false; fputc (sufLen, f); if (ferror(f) != 0) return false; char* tmp=GetCStr(a); int ret=(fwrite (tmp+preLen, sizeof (char), sufLen, f) == sufLen); delete []tmp; return (ret != 0); } // a also used for prev bool ReadPreSufStr (FILE *f, UCArray &a) { unsigned char preLen = 0; unsigned char sufLen = 0; preLen = fgetc(f); sufLen = fgetc(f); if (a.size() > preLen) a.erase (a.begin()+preLen, a.end()); // reserve the needed space in advance if (a.capacity() < a.size() + sufLen + 1) { a.reserve(a.size() + sufLen + 1); } while (sufLen > 0) { unsigned char c = fgetc (f); a.push_back (c); --sufLen; } return true; }