Changeset 94 for trunk/gsdl


Ignore:
Timestamp:
1999-01-04T16:32:21+13:00 (25 years ago)
Author:
rjmcnab
Message:

Wrote general map-file-based in and out converters. Fixed bugs related
to Chinese character searching. text_t now has an encoding attribute. Added
an encoding option to the preferences.

Location:
trunk/gsdl
Files:
11 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/lib/Makefile

    r93 r94  
    2626CC = gcc
    2727CCFLAGS =
    28 DEFS = -O2 -g
     28DEFS = -g
    2929RANLIB = ranlib
    3030INCLUDES = -I../packages/mg-1.3d/lib
     
    4949OBJECTS = text_t.o display.o cfgread.o fileutil.o gsdlunicode.o
    5050 
    51 all : gsdllib.a
     51all: gsdllib.a
    5252
    53 gsdllib.a : $(OBJECTS)
     53gsdllib.a: $(OBJECTS)
    5454    rm -f gsdllib.a
    5555    $(AR) cru gsdllib.a $(OBJECTS)
     
    5959    rm -f $(OBJECTS) gsdllib.a
    6060
     61install:
     62
    6163depend:
    6264    makedepend -Y -- $(DEFS) $(INCLUDES) $(CCFLAGS) -- $(SOURCES)
     
    6567
    6668text_t.o: text_t.h
    67 display.o: display.h text_t.h
     69display.o: display.h text_t.h gsdlunicode.h
    6870cfgread.o: cfgread.h text_t.h
    6971fileutil.o: fileutil.h text_t.h
     72gsdlunicode.o: gsdlunicode.h text_t.h ../packages/mg-1.3d/lib/unitool.h
     73gsdlunicode.o: fileutil.h
  • trunk/gsdl/lib/gsdlunicode.cpp

    r93 r94  
    99#include <stdio.h>
    1010
    11 
    12 mapdata_t::mapdata_t () {
    13   int i;
    14 
    15   // reset all the map ptrs to be NULL
    16   for (i=0; i<256; i++) {
    17     ptrs[i] = (unsigned short *)NULL;
    18   }
    19 
    20   // say nothing has been loaded
    21   loaded = false;
    22 }
    23 
    24 
    25 mapconvert::mapconvert (const text_t &thegsdlhome, const text_t &theencoding,
    26             unsigned short theabsentc) {
    27   gsdlhome = thegsdlhome;
    28   encoding = theencoding;
    29   absentc = theabsentc;
    30 }
    31 
    32 unsigned short mapconvert::convert (unsigned short c) {
    33   if (!loadmapfile()) return absentc;
    34 
    35   if (c == 0) return 0; // 0 always maps to 0...
    36 
    37   unsigned short n1 = c >> 8;
    38   unsigned short n2 = c & 0xff;
    39 
    40   unsigned short *arrptr = mapdata.ptrs[n1];
    41   if (arrptr == (unsigned short *)NULL) return absentc;
    42 
    43   if (arrptr[n2] == 0) return absentc;
    44   return arrptr[n2];
    45 }
    46 
    47 text_t mapconvert::convert (const text_t &instr) {
    48   if (!loadmapfile()) return absentc;
    49 
    50   text_t outstr;
    51   text_t::const_iterator here = instr.begin();
    52   text_t::const_iterator end = instr.end();
    53 
    54   while (here != end) {
    55     outstr.push_back(this->convert(*here));
    56     here++;
    57   }
    58  
    59   return outstr;
    60 }
    61 
    62 bool mapconvert::loadmapfile () {
    63   FILE *mapfilein = (FILE *)NULL;
    64 
    65   // check to see if the mapfile has been already loaded
    66   if (mapdata.loaded) return true;
    67 
    68   // open the map file
    69   text_t filename = filename_cat (gsdlhome, "unicode");
    70   filename = filename_cat (filename, encoding);
    71   filename += ".ump";
    72   char *cfilename = filename.getcstr();
    73   if (cfilename == (char *)NULL) return false;
    74   mapfilein = fopen(cfilename, "rb");
    75   delete cfilename;
    76 
    77   if (mapfilein == (FILE *)NULL) return false;
    78 
    79   unsigned char c, n1, n2;
    80   unsigned short *arrptr;
    81   int i;
    82   c = fgetc (mapfilein);
    83   while (!feof (mapfilein)) {
    84     if (mapdata.ptrs[c] == (unsigned short *)NULL) {
    85       // allocate a new array
    86       arrptr = new unsigned short[256];
    87       mapdata.ptrs[c] = arrptr;
    88     } else arrptr = mapdata.ptrs[c];
    89 
    90     // clear the array
    91     for (i=0; i<256; i++) arrptr[i] = 0;
    92    
    93     // read in this block
    94     n1 = fgetc (mapfilein);
    95     n2 = fgetc (mapfilein);
    96     i=0;
    97     while (!feof (mapfilein)) {
    98       arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
    99 
    100       i++;
    101       if (i >= 256) break;
    102       n1 = fgetc (mapfilein);
    103       n2 = fgetc (mapfilein);
    104     }
    105 
    106     c = fgetc (mapfilein);
    107   }
    108 
    109   mapdata.loaded = true;
    110 
    111   return true;
    112 }
    113 
    114 void mapconvert::unloadmapfile () {
    115   if (!mapdata.loaded) return;
    116 
    117   int i;
    118   for (i=0; i<256; i++) {
    119     if (mapdata.ptrs[i] != (unsigned short *)NULL) {
    120       delete [] mapdata.ptrs[i];
    121       mapdata.ptrs[i] = (unsigned short *)NULL;
    122     }
    123   }
    124 
    125   mapdata.loaded = false;
    126 }
    12711
    12812
     
    18973
    19074void utf8inconvertclass::convert (text_t &output, status_t &status) {
     75  output.clear();
     76
    19177  if (start == NULL || len == 0) {
    19278    if (utf8buflen == 0) status = finished;
     
    261147
    262148
    263 
    264 gbinconvertclass::gbinconvertclass (const text_t &gsdlhome) {
    265   gbbuflen = 0;
    266   gb2unimap = new mapconvert (gsdlhome, "gbku", 0x25a1);
    267 }
    268 
    269 void gbinconvertclass::reset () {
    270   start = NULL;
    271   len = 0;
    272   gbbuflen=0;
    273 }
    274 
    275 void gbinconvertclass::convert (text_t &output, status_t &status) {
    276   if (start == NULL || len == 0 || gb2unimap == NULL) {
    277     if (gbbuflen == 0) status = finished;
    278     else status = stopped;
    279     return;
    280   }
    281 
    282   // don't want any funny sign conversions happening
    283   unsigned char *here = (unsigned char *)start;
    284 
    285   size_t charlen = getgbcharlen ();
    286   unsigned short c;
    287   size_t realcharlen;
    288   while (len > 0) {
    289     if (charlen == 0) {
    290       // start parsing a new character
    291       gbbuflen = 0;
    292       gbbuf[gbbuflen++] = *here;
    293       ++here;
    294       --len;
    295       charlen = getgbcharlen ();
    296 
    297     } else if (gbbuflen < charlen) {
    298       // assumes charlen is always less than MAXGBCHARLEN
    299       gbbuf[gbbuflen++] = *here;
    300       ++here;
    301       --len;
    302     }
    303 
    304     if (gbbuflen == charlen) {
    305       // got a complete character
    306       if (charlen == 1) {
    307     // ascii character
    308     output.push_back (gbbuf[0]);
    309 
    310       } else {
    311     // two byte character
    312     output.push_back (gb2unimap->convert(((unsigned short)gbbuf[0] << 8) |
    313                          (unsigned short)gbbuf[1]));
    314       }
    315 
    316       gbbuflen = 0;
    317       charlen = 0;
    318     }
    319   }
    320 
    321   start = (char *)here; // save current position
    322 
    323   if (gbbuflen == 0) status = finished;
    324   else status = stopped;
    325 }
    326 
    327 // returns the length that the current contents of the
    328 // gbbuf should be
    329 size_t gbinconvertclass::getgbcharlen () {
    330   if (gbbuflen == 0) return 0;
    331 
    332   // one byte character
    333   if (gbbuf[0] < 0x80) return 1;
    334 
    335   // other characters are two byte
    336   return 2;
    337 }
    338 
    339 
    340 
    341 
    342 
    343 
    344149void utf8outconvertclass::reset () {
    345150  input = NULL;
     
    397202
    398203
    399 gboutconvertclass::gboutconvertclass (const text_t &gsdlhome){
    400   gbbuflen=0;
    401   gbbufhere=0;
    402   uni2gbmap = new mapconvert (gsdlhome, "ugbk", 0xa1f5);
    403 }
    404 
    405 void gboutconvertclass::reset () {
     204
     205
     206mapdata_t::mapdata_t () {
     207  int i;
     208
     209  // reset all the map ptrs to be NULL
     210  for (i=0; i<256; i++) {
     211    ptrs[i] = (unsigned short *)NULL;
     212  }
     213
     214  // say nothing has been loaded
     215  loaded = false;
     216}
     217
     218
     219mapconvert::mapconvert () {
     220  absentc = 0;
     221}
     222
     223
     224// loadmapfile should be called before any conversion is done
     225bool mapconvert::loadmapfile (const text_t &thegsdlhome,
     226                  const text_t &theencoding,
     227                  unsigned short theabsentc) {
     228  FILE *mapfilein = (FILE *)NULL;
     229
     230  // check to see if the mapfile has been already loaded
     231  if (mapdata.loaded && gsdlhome == thegsdlhome &&
     232      encoding == theencoding && absentc == theabsentc)
     233    return true;
     234
     235  unloadmapfile ();
     236  gsdlhome = thegsdlhome;
     237  encoding = theencoding;
     238  absentc = theabsentc;
     239
     240  // open the map file
     241  text_t filename = filename_cat (gsdlhome, "unicode");
     242  filename = filename_cat (filename, encoding);
     243  filename += ".ump";
     244  char *cfilename = filename.getcstr();
     245  if (cfilename == (char *)NULL) return false;
     246  mapfilein = fopen(cfilename, "rb");
     247  delete cfilename;
     248
     249  if (mapfilein == (FILE *)NULL) return false;
     250
     251  unsigned char c, n1, n2;
     252  unsigned short *arrptr;
     253  int i;
     254  c = fgetc (mapfilein);
     255  while (!feof (mapfilein)) {
     256    if (mapdata.ptrs[c] == (unsigned short *)NULL) {
     257      // allocate a new array
     258      arrptr = new unsigned short[256];
     259      mapdata.ptrs[c] = arrptr;
     260    } else arrptr = mapdata.ptrs[c];
     261
     262    // clear the array
     263    for (i=0; i<256; i++) arrptr[i] = 0;
     264   
     265    // read in this block
     266    n1 = fgetc (mapfilein);
     267    n2 = fgetc (mapfilein);
     268    i=0;
     269    while (!feof (mapfilein)) {
     270      arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
     271
     272      i++;
     273      if (i >= 256) break;
     274      n1 = fgetc (mapfilein);
     275      n2 = fgetc (mapfilein);
     276    }
     277
     278    c = fgetc (mapfilein);
     279  }
     280
     281  mapdata.loaded = true;
     282
     283  return true;
     284}
     285
     286void mapconvert::unloadmapfile () {
     287  if (!mapdata.loaded) return;
     288
     289  int i;
     290  for (i=0; i<256; i++) {
     291    if (mapdata.ptrs[i] != (unsigned short *)NULL) {
     292      delete [] mapdata.ptrs[i];
     293      mapdata.ptrs[i] = (unsigned short *)NULL;
     294    }
     295  }
     296
     297  mapdata.loaded = false;
     298}
     299
     300
     301unsigned short mapconvert::convert (unsigned short c) {
     302  if (!mapdata.loaded) return absentc;
     303
     304  if (c == 0) return 0; // 0 always maps to 0...
     305
     306  unsigned short n1 = c >> 8;
     307  unsigned short n2 = c & 0xff;
     308
     309  unsigned short *arrptr = mapdata.ptrs[n1];
     310  if (arrptr == (unsigned short *)NULL) return absentc;
     311
     312  if (arrptr[n2] == 0) return absentc;
     313  return arrptr[n2];
     314}
     315
     316text_t mapconvert::convert (const text_t &instr) {
     317  if (!mapdata.loaded) return absentc;
     318
     319  text_t outstr;
     320  text_t::const_iterator here = instr.begin();
     321  text_t::const_iterator end = instr.end();
     322
     323  while (here != end) {
     324    outstr.push_back(this->convert(*here));
     325    here++;
     326  }
     327 
     328  return outstr;
     329}
     330
     331
     332
     333
     334mapinconvertclass::mapinconvertclass () {
     335  mapbuflen = 0;
     336}
     337
     338void mapinconvertclass::reset () {
     339  start = NULL;
     340  len = 0;
     341  mapbuflen=0;
     342}
     343
     344void mapinconvertclass::convert (text_t &output, status_t &status) {
     345  output.clear();
     346
     347  if (start == NULL || len == 0) {
     348    if (mapbuflen == 0) status = finished;
     349    else status = stopped;
     350    return;
     351  }
     352
     353  // don't want any funny sign conversions happening
     354  unsigned char *here = (unsigned char *)start;
     355
     356  size_t charlen = getmapcharlen ();
     357  unsigned short c;
     358  size_t realcharlen;
     359  while (len > 0) {
     360    if (charlen == 0) {
     361      // start parsing a new character
     362      mapbuflen = 0;
     363      mapbuf[mapbuflen++] = *here;
     364      ++here;
     365      --len;
     366      charlen = getmapcharlen ();
     367
     368    } else if (mapbuflen < charlen) {
     369      // assumes charlen is never greater than MAXMAPCHARLEN
     370      mapbuf[mapbuflen++] = *here;
     371      ++here;
     372      --len;
     373    }
     374
     375    if (mapbuflen == charlen) {
     376      // got a complete character
     377      if (charlen == 1) {
     378    // ascii character
     379    output.push_back (mapbuf[0]);
     380
     381      } else {
     382    // two byte character
     383    output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
     384                        (unsigned short)mapbuf[1]));
     385      }
     386
     387      mapbuflen = 0;
     388      charlen = 0;
     389    }
     390  }
     391
     392  start = (char *)here; // save current position
     393
     394  if (mapbuflen == 0) status = finished;
     395  else status = stopped;
     396}
     397
     398
     399
     400mapoutconvertclass::mapoutconvertclass () {
     401  mapbuflen=0;
     402  mapbufhere=0;
     403}
     404
     405void mapoutconvertclass::reset () {
    406406  input = NULL;
    407407  outs = NULL;
    408   gbbuflen = 0;
    409   gbbufhere = 0;
     408  mapbuflen = 0;
     409  mapbufhere = 0;
    410410}
    411411
    412412// note that convert does not null-terminate the
    413413// output array of characters
    414 void gboutconvertclass::convert (char *output, size_t maxlen,
     414void mapoutconvertclass::convert (char *output, size_t maxlen,
    415415                 size_t &len, status_t &status) {
    416416  unsigned short outc;
    417417
    418   if (input == NULL || output == NULL || uni2gbmap == NULL) {
    419     if (gbbuflen == 0) status = finished;
     418  if (input == NULL || output == NULL) {
     419    if (mapbuflen == 0) status = finished;
    420420    else status = unfinished;
    421421    return;
     
    428428  while (len < maxlen) {
    429429    // empty the contents of the internal buffer
    430     if (gbbuflen > 0) {
    431       while (len < maxlen && gbbufhere < gbbuflen) {
    432     *uoutput = gbbuf[gbbufhere];
     430    if (mapbuflen > 0) {
     431      while (len < maxlen && mapbufhere < mapbuflen) {
     432    *uoutput = mapbuf[mapbufhere];
    433433    uoutput++;
    434434    len++;
    435     gbbufhere++;
    436       }
    437 
    438       if (gbbufhere == gbbuflen) {
    439     gbbufhere = 0;
    440     gbbuflen = 0;
     435    mapbufhere++;
     436      }
     437
     438      if (mapbufhere == mapbuflen) {
     439    mapbufhere = 0;
     440    mapbuflen = 0;
    441441      }
    442442    }
    443443
    444444    // fill up the buffer with the next character
    445     if (gbbuflen == 0) {
     445    if (mapbuflen == 0) {
    446446      if (texthere == textend) break; // finished!
    447447      if (!rzws || (*texthere != 0x200b)) {
    448448    if (*texthere < 0x80) {
    449       gbbuf[0] = (unsigned char)*texthere;
    450       gbbuflen = 1;
     449      mapbuf[0] = (unsigned char)*texthere;
     450      mapbuflen = 1;
    451451    } else {
    452       outc = uni2gbmap->convert (*texthere);
    453       gbbuf[0] = (unsigned char)(outc >> 8);
    454       gbbuf[1] = (unsigned char)(outc & 0xff);
    455       gbbuflen = 2;
     452      outc = converter.convert (*texthere);
     453      mapbuf[0] = (unsigned char)(outc >> 8);
     454      mapbuf[1] = (unsigned char)(outc & 0xff);
     455      mapbuflen = 2;
    456456    }
    457457      }
    458458
    459459      texthere++;
    460       gbbufhere = 0;
     460      mapbufhere = 0;
    461461    }
    462462  }
    463463 
    464   if (texthere == textend && gbbuflen == 0) status = finished;
     464  if (texthere == textend && mapbuflen == 0) status = finished;
    465465  else status = unfinished;
    466466}
  • trunk/gsdl/lib/gsdlunicode.h

    r93 r94  
    33
    44#include "text_t.h"
    5 
    6 // mapdata_t is used by mapconvert to hold the map file data
    7 class mapdata_t {
    8 public:
    9   mapdata_t();
    10   bool loaded;
    11   unsigned short *ptrs[256];
    12 };
    13 
    14 
    15 // mapconvert is used in situations where conversion is best
    16 // done using a map file. The mapfile should reside in
    17 // gsdlhome/unicode.
    18 class mapconvert {
    19 public:
    20   mapconvert (const text_t &thegsdlhome, const text_t &theencoding,
    21           unsigned short theabsentc);
    22   ~mapconvert () {unloadmapfile();};
    23 
    24   unsigned short convert (unsigned short c);
    25   text_t convert (const text_t &instr);
    26 
    27 protected:
    28   text_t gsdlhome;
    29   text_t encoding;
    30   unsigned short absentc;
    31   mapdata_t mapdata;
    32 
    33   bool loadmapfile ();
    34   void unloadmapfile ();
    35 };
    36 
    37 
    385
    396
     
    6532  // utf8buf should be
    6633  size_t getutf8charlen ();
    67 };
    68 
    69 
    70 #define MAXGBCHARLEN 2
    71 
    72 // convert from a gb char stream to the unicode text_t class
    73 class gbinconvertclass : public inconvertclass {
    74 public:
    75   gbinconvertclass(const text_t &gsdlhome);
    76   ~gbinconvertclass () {delete gb2unimap;};
    77   void reset ();
    78   void convert (text_t &output, status_t &status);
    79 
    80 protected:
    81   // buffer to hold unconverted characters in a stream
    82   unsigned char gbbuf[MAXGBCHARLEN];
    83   size_t gbbuflen;
    84 
    85   // note: multiple instances of gbinconvert class are expensive
    86   // as each will have its own copy of the map file data. This
    87   // could be reduced by making gb2unimap static, but then it
    88   // wouldn't be thread safe.
    89   mapconvert *gb2unimap;
    90 
    91   // returns the length that the current contents of the
    92   // gbbuf should be
    93   size_t getgbcharlen ();
    9434};
    9535
     
    12969
    13070
     71// mapdata_t is used by mapconvert to hold the map file data
     72class mapdata_t {
     73public:
     74  mapdata_t();
     75  bool loaded;
     76  unsigned short *ptrs[256];
     77};
     78
     79// mapconvert is used in situations where conversion is best
     80// done using a map file. The mapfile should reside in
     81// gsdlhome/unicode.
     82class mapconvert {
     83public:
     84  mapconvert ();
     85  ~mapconvert () {unloadmapfile();};
     86
     87  // loadmapfile should be called before any conversion is done
     88  bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
     89            unsigned short theabsentc);
     90  void unloadmapfile ();
     91
     92  unsigned short convert (unsigned short c);
     93
     94  // note that this version of convert has different semantics to
     95  // the convertclass version.
     96  text_t convert (const text_t &instr);
     97
     98protected:
     99  text_t gsdlhome;
     100  text_t encoding;
     101  unsigned short absentc;
     102  mapdata_t mapdata;
     103};
    131104
    132105
    133 // Convert from a text_t class to a gb char stream
    134 class gboutconvertclass : public rzwsoutconvertclass {
     106
     107#define MAXMAPCHARLEN 2
     108
     109// convert from a map char stream to the unicode text_t class
     110class mapinconvertclass : public inconvertclass {
    135111public:
    136   gboutconvertclass (const text_t &gsdhome);
    137   ~gboutconvertclass () {delete uni2gbmap;};
     112  mapinconvertclass();
     113
     114  // loadmapfile should be called before any conversion takes
     115  // place
     116  bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
     117            unsigned short theabsentc) {
     118    return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
     119  };
     120
    138121  void reset ();
    139   // note that convert does not null-terminate the
    140   // output array of characters
     122  void convert (text_t &output, status_t &status);
     123
     124protected:
     125  // buffer to hold unconverted characters in a stream
     126  unsigned char mapbuf[MAXMAPCHARLEN];
     127  size_t mapbuflen;
     128
     129  // note: multiple instances of mapinconvertclass are expensive
     130  // as each will have its own copy of the map file data. This
     131  // could be reduced by making converter static, but then it
     132  // wouldn't be thread safe.
     133  mapconvert converter;
     134
     135  // returns the length that the current contents of the
     136  // mapbuf should be
     137  inline size_t getmapcharlen () {
     138    if (mapbuflen == 0) return 0;
     139    if (mapbuf[0] < 0x80) return 1;
     140    return 2;
     141  }
     142};
     143
     144
     145// Convert from a text_t class to a map char stream
     146class mapoutconvertclass : public rzwsoutconvertclass {
     147public:
     148  mapoutconvertclass ();
     149
     150  // loadmapfile should be called before any conversion takes
     151  // place
     152  bool loadmapfile (const text_t &thegsdlhome, const text_t &theencoding,
     153            unsigned short theabsentc) {
     154    return converter.loadmapfile (thegsdlhome, theencoding, theabsentc);
     155  };
     156
     157  void reset ();
    141158  void convert (char *output, size_t maxlen,
    142159        size_t &len, status_t &status);
    143160
    144161protected:
    145   unsigned char gbbuf[MAXGBCHARLEN];
    146   size_t gbbuflen;
    147   size_t gbbufhere;
     162  unsigned char mapbuf[MAXMAPCHARLEN];
     163  size_t mapbuflen;
     164  size_t mapbufhere;
    148165
    149   mapconvert *uni2gbmap;
     166  mapconvert converter;
    150167};
    151168
  • trunk/gsdl/lib/text_t.cpp

    r12 r94  
    2424/*
    2525   $Log$
     26   Revision 1.2  1999/01/04 03:32:17  rjmcnab
     27
     28   Wrote general map-file-based in and out converters. Fixed bugs related
     29   to Chinese character searching. text_t now has an encoding attribute. Added
     30   an encoding option to the preferences.
     31
    2632   Revision 1.1  1998/11/17 09:11:29  rjmcnab
    2733
     
    7076
    7177#include "text_t.h"
     78#include "unitool.h"
    7279
    7380////////////////////////////////////
     
    7784text_t::text_t ()
    7885{
     86  setencoding(0);
    7987  clear ();
    8088}
     
    8290text_t::text_t (int i)
    8391{
     92  setencoding(0);
    8493  clear ();
    8594  appendint (i);
     
    8796
    8897text_t::text_t (char *s)
    89 {
     98{
     99  setencoding(0);
    90100  clear ();
    91101  appendcstr (s);
     
    223233  while (ithere != itend)
    224234    {
    225       if (*ithere >= 256) cstr[len] = ' ';
    226       else cstr[len] = (*ithere);
     235      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
     236      else {
     237    // put a space or a question mark depending on what
     238    // the character is. Question marks tell the user that
     239    // they are missing some information.
     240    if (is_unicode_space (*ithere)) cstr[len] = ' ';
     241    else cstr[len] = '?';
     242      }
    227243      len++;
    228244      ithere++;
     
    241257  while (ithere != itend)
    242258    {
    243       if (*ithere >= 256) cstr[len] = ' ';
    244       else cstr[len] = (*ithere);
     259      if (*ithere < 256) cstr[len] = (unsigned char)(*ithere);
     260      else {
     261    // put a space or a question mark depending on what
     262    // the character is. Question marks tell the user that
     263    // they are missing some information.
     264    if (is_unicode_space (*ithere)) cstr[len] = ' ';
     265    else cstr[len] = '?';
     266      }
    245267      len++;
    246268      ithere++;
     
    425447void inconvertclass::convert (text_t &output, status_t &status)
    426448{
     449  output.clear();
     450
    427451  if (start == NULL || len == 0)
    428452    {
     
    442466  start = (char *)here; // save current position
    443467  status = finished;
     468}
     469
     470// will treat the text_t as an 8-bit string and convert
     471// it to a 16-bit string using the above convert method.
     472text_t inconvertclass::convert (const text_t &t) {
     473  text_t out;
     474  text_t tmpout;
     475  status_t status;
     476  text_t::const_iterator here = t.begin();
     477  text_t::const_iterator end = t.end();
     478  unsigned char cbuf[256];
     479  size_t cbuflen = 0;
     480 
     481  while (here != end) {
     482    while (here != end && cbuflen < 256) {
     483      cbuf[cbuflen++] = (unsigned char)(*here & 0xff);
     484      here++;
     485    }
     486
     487    if (cbuflen > 0) {
     488      setinput ((char *)cbuf, cbuflen);
     489      status = unfinished;
     490      while (status == unfinished) {
     491    convert (tmpout, status);
     492    out += tmpout;
     493      }
     494      cbuflen = 0;
     495    }
     496  }
     497
     498  out.setencoding (0); // unicode
     499
     500  return out;
    444501}
    445502
     
    498555    {
    499556      if (*texthere < 256) *uoutput = (unsigned char)(*texthere);
    500       else *uoutput = 32; // put a space where a char >= 256 exists
     557      else {
     558    // put a space or a question mark depending on what
     559    // the character is. Question marks tell the user that
     560    // they are missing some information.
     561    if (is_unicode_space (*texthere)) *uoutput = ' ';
     562    else *uoutput = '?';
     563      }
    501564      ++uoutput;
    502565      ++len;
     
    508571}
    509572
     573// will convert the 16-bit string to an 8-bit stream
     574// and place the result in a text_t. This method uses
     575// the above convert function.
     576text_t outconvertclass::convert (const text_t &t) {
     577  text_t out;
     578  unsigned char cbuf[256];
     579  size_t cbuflen = 0;
     580  status_t status = unfinished;
     581
     582  setinput ((text_t *)&t); // discard constant
     583  while (status == unfinished) {
     584    convert ((char *)cbuf, 256, cbuflen, status);
     585    out.appendcarr ((char *)cbuf, cbuflen);
     586  }
     587
     588  out.setencoding (1); // other encoding
     589 
     590  return out;
     591}
     592
     593
    510594void outconvertclass::setostream (ostream *theouts)
    511595{
  • trunk/gsdl/lib/text_t.h

    r12 r94  
    7575protected:
    7676  usvector text;
     77  unsigned short encoding; // 0 = unicode, 1 = other
    7778 
    7879public:
     
    8283  text_t (char *s); // assumed to be a normal c string
    8384
     85  void setencoding (unsigned short theencoding) {encoding=theencoding;};
     86  unsigned short getencoding () {return encoding;};
     87
    8488  // basic container support
    8589  iterator begin () {return text.begin();}
     
    9296  void push_back(unsigned short c) {text.push_back(c);}
    9397  void pop_back() {text.pop_back();}
    94   text_t &operator=(const text_t &x) {text=x.text;return *this;}
     98  text_t &operator=(const text_t &x) {text=x.text; encoding=x.encoding; return *this;}
    9599  reference operator[](size_type n) {return text[n];};
    96100  const_reference operator[](size_type n) const {return text[n];};
     
    255259  void reset ();
    256260  void setinput (char *thestart, size_t thelen);
     261
     262  // output will be cleared before the conversion
    257263  virtual void convert (text_t &output, status_t &status);
     264
     265  // will treat the text_t as an 8-bit string and convert
     266  // it to a 16-bit string using the above convert method.
     267  text_t convert (const text_t &t);
    258268
    259269protected:
     
    290300            size_t &len, status_t &status);
    291301
     302  // will convert the 16-bit string to an 8-bit stream
     303  // and place the result in a text_t. This method uses
     304  // the above convert function.
     305  text_t convert (const text_t &t);
     306
    292307  void setostream (ostream *theouts);
    293308  ostream *getostream ();
  • trunk/gsdl/src/library/Makefile

    r91 r94  
    2525AR = ar
    2626CC = gcc
    27 CCFLAGS = -O2 -g
     27CCFLAGS = -g
    2828DEFS = -DNZDL -DQUIET -DSHORT_SUFFIX -DPARADOCNUM -DUSE_FASTCGI
    2929RANLIB = ranlib
     
    7373    rm -f $(OBJECTS)
    7474
     75install:
     76
    7577depend:
    7678    makedepend -Y -- $(DEFS) $(INCLUDES) $(CCFLAGS) -- $(SOURCES)
     
    8082# DO NOT DELETE
    8183
    82 browse.o: browse.h ../../lib/text_t.h gdbmclass.h locateinfo.h cgiargs.h
    83 gdbmclass.o: ../../lib/text_t.h gdbmclass.h locateinfo.h
     84browse.o: browse.h ../../lib/text_t.h gdbmclass.h cgiargs.h
     85gdbmclass.o: ../../lib/text_t.h gdbmclass.h
     86gdbmclass.o: ../../packages/mg-1.3d/lib/unitool.h ../../lib/gsdlunicode.h
     87gdbmclass.o: ../../lib/fileutil.h
    8488cgiargs.o: cgiargs.h ../../lib/text_t.h
    8589querycache.o: querycache.h ../../lib/text_t.h queryinfo.h
     
    106110queryinfo.o: queryinfo.h ../../lib/text_t.h
    107111libinterface.o: libinterface.h browse.h ../../lib/text_t.h gdbmclass.h
    108 libinterface.o: locateinfo.h cgiargs.h queryinfo.h mgsearch.h querycache.h
    109 libinterface.o: ../../lib/display.h
     112libinterface.o: cgiargs.h queryinfo.h mgsearch.h querycache.h
     113libinterface.o: ../../lib/display.h ../../lib/gsdlunicode.h
     114libinterface.o: ../../lib/fileutil.h ../../lib/cfgread.h
    110115mgsearch.o: mgq.h mgsearch.h ../../lib/text_t.h querycache.h queryinfo.h
    111 mgsearch.o: locateinfo.h
    112 locateinfo.o: locateinfo.h ../../lib/text_t.h
    113 cgiwrap.o: libinterface.h browse.h ../../lib/text_t.h gdbmclass.h
    114 cgiwrap.o: locateinfo.h cgiargs.h queryinfo.h mgsearch.h querycache.h
    115 cgiwrap.o: ../../lib/display.h
     116mgsearch.o: locateinfo.h ../../lib/gsdlunicode.h
     117mgsearch.o: ../../packages/mg-1.3d/lib/unitool.h
     118locateinfo.o: locateinfo.h ../../lib/text_t.h ../../lib/fileutil.h
     119cgiwrap.o: libinterface.h browse.h ../../lib/text_t.h gdbmclass.h cgiargs.h
     120cgiwrap.o: queryinfo.h mgsearch.h querycache.h ../../lib/display.h
     121cgiwrap.o: ../../lib/gsdlunicode.h
  • trunk/gsdl/src/library/cgiargs.cpp

    r4 r94  
    11#include "cgiargs.h"
    2 
     2#include "gsdlunicode.h"
    33
    44 
     
    6464ostream &operator<<(ostream &outs, const cgiargsclass &args)
    6565{
    66   outconvertclass text_t2ascii;
     66  utf8outconvertclass text_t2utf8;
    6767  cgiargsclass::const_iterator here = args.begin ();
    6868  cgiargsclass::const_iterator end = args.end ();
     
    7272  while (here != end)
    7373    {
    74       outs << text_t2ascii << " \"" << (*here).first << "\"=\"" <<
     74      outs << text_t2utf8 << " \"" << (*here).first << "\"=\"" <<
    7575    (*here).second << "\"\n";
    7676      here++;
  • trunk/gsdl/src/library/libinterface.cpp

    r93 r94  
    99#include "cfgread.h"
    1010#include "gsdlunicode.h"
     11#include "unitool.h"
    1112
    1213#include <assert.h>
     
    107108      // convert %xx and + to their appropriate equivalents
    108109      decode (value);
     110      value.setencoding(1); // other encoding
    109111      // store this key=value pair
    110112      if (!key.empty()) args.setarg (key, value);
     
    112114}
    113115
    114 text_t cgisafe (text_t &intext)
     116text_t cgisafe (const text_t &intext)
    115117{
    116118  text_t outtext;
    117119
    118   text_t::iterator here = intext.begin ();
    119   text_t::iterator end = intext.end ();
     120  text_t::const_iterator here = intext.begin ();
     121  text_t::const_iterator end = intext.end ();
    120122  unsigned short c;
    121123  text_t ttmp;
     
    150152libinterface::libinterface() {
    151153  browse = NULL;
    152   gbinconvert = NULL;
    153   gboutconvert = NULL;
    154154}
    155155
     
    277277  srand(time(NULL));
    278278
    279   gbinconvert = new gbinconvertclass (gsdlhome);
    280 
    281279  utf8outconvert.set_rzws(1);
    282   gboutconvert = new gboutconvertclass (gsdlhome);
    283   if (gboutconvert != NULL) gboutconvert->set_rzws(1);
     280  gboutconvert.set_rzws(1);
    284281
    285282  return collection_init(collection);
     
    305302  check_args (args);
    306303
     304  // get the input encoding
     305  text_t &arg_w = args["w"];
     306  inconvertclass *inconvert = NULL;
     307  if (arg_w == "8") {
     308    inconvert = &utf8inconvert;
     309  } else if (arg_w == "g") {
     310    // The map files will only be loaded the first time they are
     311    // needed. The loading is done here to reduce the memory load
     312    // for collections which don't need to convert to GB.
     313    gbinconvert.loadmapfile (gsdlhome, "gbku", 0x25a1);
     314    inconvert = &gbinconvert;
     315  } else {
     316    inconvert = &asciiinconvert; // default
     317  }
     318
     319  // see if the next page will have a different encoding
     320  if (args.getarg("nw") != NULL) args["w"] = args["nw"];
     321
     322  // convert arguments which aren't in unicode to unicode
     323  args_tounicode (args, *inconvert);
     324
     325  // remember the state of the compressed arguments
    307326  lastcomparg = get_compressed_args (args);
    308 
    309327  logout << args;
    310328
    311329  // get the output encoding
    312   text_t &arg_n = args["n"];
     330  text_t &arg_nw = args["w"];
    313331  outconvertclass *outconvert = NULL;
    314   if (arg_n == "8") {
     332  if (arg_nw == "8") {
    315333    outconvert = &utf8outconvert;
    316   } else if (arg_n == "g" && gboutconvert != NULL) {
    317     outconvert = gboutconvert;
    318   }
    319   else outconvert = &asciioutconvert; // default
    320 
    321   if (outconvert == NULL) return err;
     334  } else if (arg_nw == "g") {
     335    gboutconvert.loadmapfile (gsdlhome, "ugbk", 0xa1f5);
     336    outconvert = &gboutconvert;
     337  } else {
     338    outconvert = &asciioutconvert; // default
     339  }
     340
    322341
    323342  // dispatch the request
     
    346365  argconfigstr =
    347366    "+a[p]"    // action: q=query, b=browse, t=targetdoc, p=page, a1=auxiliary
    348     "n[]"     // encoding: w=western, 8=utf8, 7=utf7, g=GB2312, k=GBK
     367    "w[]"      // encoding: w=western, 8=utf8, 7=utf7, g=GB2312, k=GBK
    349368    "t[1]"     // query type: 0=boolean, 1=ranked
    350369    "i[c]"     // index: c=chapter, p=paragraph, t=title, b=book
     
    479498{
    480499  args.setarg("c", get_collection_name());
    481   if (args["n"].empty()) args.setarg("n", cfg_info.defaultencoding);
    482 }
    483 
    484 
    485 void libinterface::define_general_macros (cgiargsclass &args, ostream &logout)
    486 {
     500  if (args["w"].empty()) args.setarg("w", cfg_info.defaultencoding);
     501}
     502
     503void libinterface::args_tounicode (cgiargsclass &args,
     504                   inconvertclass &inconvert) {
     505  utf8outconvertclass text_t2utf8;
     506  cgiargsclass::iterator here = args.begin();
     507  cgiargsclass::iterator end = args.end();
     508
     509  while (here != end) {
     510    if (here->second.getencoding() > 0) {
     511      here->second = inconvert.convert(here->second);
     512    }
     513   
     514    here++;
     515  }
     516}
     517
     518
     519void libinterface::define_general_macros (cgiargsclass &args, outconvertclass &outconvert,
     520                      ostream &logout) {
    487521  disp.setmacro("httpprefix", "Global", httpprefix);
    488522  disp.setmacro("gwcgi", "Global", gwcgi);
    489523
    490   disp.setmacro("collection", "Global", cgisafe(args["c"]));
     524  disp.setmacro("numdocs", "Global", (int)cfg_info.numdocs);
     525
     526  disp.setmacro("collection", "Global", cgisafe(outconvert.convert(args["c"])));
    491527  disp.setmacro("compressedoptions", "Global", get_compressed_args(args));
    492   disp.setmacro("urlsafequerystring", "Global", cgisafe(args["q"]));
     528  disp.setmacro("urlsafequerystring", "Global", cgisafe(outconvert.convert(args["q"])));
    493529
    494530  // need to escape any quotes in querystring to prevent them upsetting the html
     
    498534  while (here != end) {
    499535    if (*here == '"') querystring += "&quot;";
     536    else if (*here == '&') querystring += "&amp;";
     537    else if (*here == '<') querystring += "&lt;";
     538    else if (*here == '>') querystring += "&gt;";
    500539    else querystring.push_back(*here);
    501540    here ++;
     
    503542  disp.setmacro("querystring", "Global", querystring);
    504543
    505 
    506544  if (args.getintarg("x") == 0) disp.setmacro("notdetached", "Global", "1");
    507545  if (args["d"][0] == 'T') disp.setmacro("istitle", "Global", "1");
     
    517555// prepare_page prepares to write out a page using the current
    518556// page parameters and defines any general macros
    519 void libinterface::prepare_page (cgiargsclass &args, ostream &logout)
    520 {
     557void libinterface::prepare_page (cgiargsclass &args, outconvertclass &outconvert,
     558                 ostream &logout) {
    521559  // get page parameters
    522560  text_t pageparams = text_t("collection=") + args["c"];
     
    527565
    528566  disp.openpage(pageparams, MACROPRECEDENCE);
    529   define_general_macros(args, logout);
     567  define_general_macros(args, outconvert, logout);
    530568  define_collection_macros(args, logout);
    531569}
     
    717755
    718756  // prepare to print out the page
    719   prepare_page(args, logout);
     757  prepare_page(args, outconvert, logout);
    720758  define_query_macros(args, queryparams, queryresults, logout);
    721759
     
    764802  gdbm_info info;
    765803 
    766   prepare_page(args, logout);
     804  prepare_page(args, outconvert, logout);
    767805 
    768806  // get browse bar unless page has been detached
     
    806844  do_query(args, queryparams, queryresults, logout);
    807845
    808   prepare_page(args, logout);
     846  prepare_page(args, outconvert, logout);
    809847 
    810848  if (args["g"][1] == '0') {
     
    9761014  text_t &arg_p = args["p"];
    9771015
    978   prepare_page(args, logout);
     1016  prepare_page(args, outconvert, logout);
    9791017
    9801018  if (arg_p == "preferences")
     
    9991037  text_t word, buffer;
    10001038  while (here != end) {
    1001     if (((*here >= 65) && (*here <= 90)) ||
    1002     ((*here >= 97) && (*here <= 122)) ||
    1003     ((*here >= '0') && (*here <= '9')) ||
    1004     ((*here >= 192) && (*here <= 214)) ||
    1005     ((*here >= 216) && (*here <= 246)) ||
    1006     ((*here >= 248) && (*here <= 255))) {
     1039    if (is_unicode_letdig(*here)) {
    10071040      // not word boundary
    10081041      word.push_back(*here);
     
    10691102  disp.setmacro ("stemoption", "preferences", stemoption);
    10701103
     1104
     1105  // the encodingoption
     1106  text_t encodingoption;
     1107  const text_t &arg_w = args["w"];
     1108
     1109  encodingoption += "\n<select name=\"nw\">\n";
     1110  encodingoption += "  <option value=\"w\"";
     1111  if (arg_w == "w") encodingoption += " selected";
     1112  encodingoption += ">Western (ISO-8859-1)\n";
     1113  encodingoption += "  <option value=\"g\"";
     1114  if (arg_w == "g") encodingoption += " selected";
     1115  encodingoption += ">Simplified Chinese (GB2312)\n";
     1116  encodingoption += "  <option value=\"8\"";
     1117  if (arg_w == "8") encodingoption += " selected";
     1118  encodingoption += ">Unicode (UTF-8)\n";
     1119  encodingoption += "</select>\n";
     1120
     1121  disp.setmacro ("encodingoption", "preferences", encodingoption);
    10711122
    10721123  // the maxdocoption
  • trunk/gsdl/src/library/libinterface.h

    r93 r94  
    110110  inconvertclass asciiinconvert;
    111111  utf8inconvertclass utf8inconvert;
    112   gbinconvertclass *gbinconvert;
     112  mapinconvertclass gbinconvert;
    113113  outconvertclass asciioutconvert;
    114114  utf8outconvertclass utf8outconvert;
    115   gboutconvertclass *gboutconvert;
     115  mapoutconvertclass gboutconvert;
    116116
    117117
     
    130130  virtual void add_default_args (cgiargsclass &args);
    131131  virtual void check_args (cgiargsclass &args);
     132  virtual void args_tounicode (cgiargsclass &args, inconvertclass &inconvert);
    132133
    133   virtual void define_general_macros (cgiargsclass &args, ostream &logout);
    134   virtual void prepare_page (cgiargsclass &args, ostream &logout);
     134  virtual void define_general_macros (cgiargsclass &args, outconvertclass &outconvert,
     135                      ostream &logout);
     136  virtual void prepare_page (cgiargsclass &args, outconvertclass &outconvert,
     137                 ostream &logout);
    135138
    136139
  • trunk/gsdl/src/library/mgq.c

    r4 r94  
    151151              int (*sender)(char *,int,int,float,void *), void *ptr)
    152152{
    153   int i;  char *word;
     153  int i;
    154154  for (i = 0; i < qtl->num; i++)
    155155    if (sender != NULL) {
    156       word = word2str(qtl->QTE[i].Term);
    157       (* sender)(word, strlen(word), qtl->QTE[i].Count, (float)0.0, ptr);
     156      /*      word = word2str(qtl->QTE[i].Term);
     157          (* sender)(word, strlen(word), qtl->QTE[i].Count, (float)0.0, ptr); */
     158      (* sender)(qtl->QTE[i].Term+1, qtl->QTE[i].Term[0],
     159         qtl->QTE[i].Count, (float)0.0, ptr);
    158160    }
    159161}
     
    164166{
    165167  int i = 0;
    166   char *word;
    167 
    168168  if (sender == NULL) return;
    169169  for (i = 0; i < qtl->num; i++)
    170170    {
    171       word = word2str(qtl->TE[i].Word);
    172       (* sender)(word, strlen(word), qtl->TE[i].Count, (float)0.0, ptr);
     171      /*      word = word2str(qtl->TE[i].Word);
     172          (* sender)(word, strlen(word), qtl->TE[i].Count, (float)0.0, ptr);*/
     173      (* sender)(qtl->TE[i].Word+1, qtl->TE[i].Word[0],
     174         qtl->TE[i].Count, (float)0.0, ptr);
    173175    }
    174176}
  • trunk/gsdl/src/library/mgsearch.cpp

    r91 r94  
    7373  queryresultsclass *queryresults = (queryresultsclass *)info;
    7474
     75  text_t term;
     76  term.setcarr(Word, ULen);
    7577  termfreqclass termfreq;
    76   termfreq.termstr.setcarr(Word, ULen);
     78  termfreq.termstr = to_uni(term);
    7779  termfreq.termfreq = Freq;
    7880  queryresults->terms.push_back(termfreq);
     
    8587          float Weight, void *info) {
    8688
    87   // convert term from utf8 to unicode
    8889  text_t term;
    89   utf8inconvertclass inconvert;
    90   convertclass::status_t status;
    91   inconvert.reset ();
    92   inconvert.setinput (Word, ULen);
    93   inconvert.convert (term, status);
    94 
     90  term.setcarr(Word, ULen);
    9591  queryresultsclass *queryresults = (queryresultsclass *)info;
    96   queryresults->termvariants.push_back(term);
     92  queryresults->termvariants.push_back(to_uni(term));
    9793
    9894  return 0;
     
    255251
    256252  // quotedquery will be deleted on the next call to this function
    257   quotedquery = ttquotedquery.getcstr ();
    258   char *querystring = ttquerystring.getcstr();
     253  quotedquery = to_utf8(ttquotedquery).getcstr ();
     254  char *querystring = to_utf8(ttquerystring).getcstr();
    259255 
    260256  // submit the query
     
    318314
    319315
    320 void mgsearchclass::filterquery (text_t &ttquerystring)
    321 {
    322 
     316void mgsearchclass::filterquery (text_t &ttquerystring) {
    323317  text_t::iterator ithere = ttquerystring.begin ();
    324318  text_t::iterator itend = ttquerystring.end ();
    325   unsigned short c;
    326 
    327   // remove all non alphanumeric characters below 127
    328   while (ithere != itend)
    329     {
    330       c = *ithere;
    331 
    332       //      if ((c <= 127) && !((c >= '0' && c <= '9') ||
    333       //              (c >= 'A' && c <= 'Z') ||
    334       //              (c >= 'a' && c <= 'z')))
    335       if (!(((c >= 65) && (c <= 90)) ||
    336         ((c >= 97) && (c <= 122)) ||
    337         ((c >= 192) && (c <= 214)) ||
    338         ((c >= 216) && (c <= 246)) ||
    339         ((c >= 248) && (c <= 255)) ||
    340         ((c >= '0') && (c <= '9')) ||
    341         (c == 176)))
    342     (*ithere) = ' ';
    343      
    344       ithere++;
    345     }
     319 
     320  // remove all non alphanumeric characters
     321  while (ithere != itend) {
     322    if (!is_unicode_letdig(*ithere)) (*ithere) = ' ';
     323    ithere++;
     324  }
    346325}
    347326
Note: See TracChangeset for help on using the changeset viewer.