source: main/trunk/greenstone2/common-src/src/lib/gsdlunicode.cpp@ 21325

Last change on this file since 21325 was 21325, checked in by ak19, 14 years ago

Changes to makefiles, configure files, and source code to work with the new configure flags that allow indexers to be individually compiled up by setting each indexer to be enabled or disabled (enable-mg, enable-mgpp, enable-lucene).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.7 KB
Line 
1/**********************************************************************
2 *
3 * gsdlunicode.cpp --
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#include "gsdlunicode.h"
27
28
// unitool is currently in mg; if mg is not being used, it should
// be moved into GSDLHOME/lib.
// A copy of mgpp's unitool has now been moved into common-src/src/lib/
32#include "unitool.h"
33
34#include "fileutil.h"
35
36#include <stdio.h>
37
38#if defined(GSDL_USE_OBJECTSPACE)
39# include <ospace\std\iostream>
40# include <ospace\std\fstream>
41#elif defined(GSDL_USE_IOS_H)
42# include <iostream.h>
43# include <fstream.h>
44#else
45# include <iostream>
46# include <fstream>
47#endif
48
49
50// converts a unicode encode text_t string to a utf-8
51// encoded text_t string
52text_t to_utf8 (text_t::const_iterator here, text_t::const_iterator end) {
53 text_t out;
54
55 unsigned char thischar[MAXUTF8CHARLEN];
56 int i, charlen;
57
58 while (here != end) {
59 charlen = output_utf8_char (*here, thischar, &thischar[MAXUTF8CHARLEN-1]);
60 for (i=0; i<charlen; ++i) out.push_back(thischar[i]);
61 ++here;
62 }
63
64 return out;
65}
66
67// converts a utf-8 encoded text_t string to a unicode
68// encoded text_t string
69text_t to_uni (const text_t &in) {
70 text_t out;
71 unsigned char *in_cstr = (unsigned char *)in.getcstr();
72 unsigned char *here = in_cstr;
73 unsigned char *end = in_cstr;
74
75 unsigned short unichar;
76 int charlen = 0;
77
78 // get the last valid character in the string
79 while (*end != '\0') ++end;
80 --end;
81
82 while ((charlen = parse_utf8_char (here, end, &unichar)) > 0) {
83 out.push_back(unichar);
84 here += charlen;
85 }
86
87 delete []in_cstr;
88
89 return out;
90}
91
92
93// this works for all unicode values < 65536...
94void utf16outconvertclass::convert (char *out, size_t maxlen, size_t &len, status_t &status) {
95 // we should already have text_t* input set...
96 if (input == NULL || out == NULL)
97 {
98 status = finished;
99 return;
100 }
101 unsigned char *output = (unsigned char *)out;
102 text_t::iterator textend = input->end();
103 len = 0;
104 if (maxlen % 2) --maxlen; // we need an even number of output bytes...
105 while ((len < maxlen) && (texthere != textend)) {
106 unsigned short int uni_char=(unsigned short int) *texthere;
107 // big endian utf-16...
108 if (uni_char < 256) {
109 out[len]=0;
110 out[len+1]=uni_char;
111 } else {
112 out[len]=uni_char >> 8;
113 out[len+1]=uni_char & 255;
114 }
115 len+=2;
116 ++texthere;
117 }
118 if (texthere==textend)
119 status=finished;
120 else
121 status=unfinished;
122}
123
124
125utf8inconvertclass::utf8inconvertclass () {
126 utf8buflen = 0;
127}
128
129void utf8inconvertclass::reset () {
130 start = NULL;
131 len = 0;
132 utf8buflen=0;
133}
134
135void utf8inconvertclass::convert (text_t &output, status_t &status) {
136 output.clear();
137 output.reserve (len/3);
138
139 if (start == NULL || len == 0) {
140 if (utf8buflen == 0) status = finished;
141 else status = stopped;
142 return;
143 }
144
145 // don't want any funny sign conversions happening
146 unsigned char *here = (unsigned char *)start;
147 unsigned char *end = here+len-1;
148 unsigned short c;
149 size_t realcharlen;
150
151 size_t charlen = getutf8charlen ();
152 while (len > 0) {
153 if (charlen == 0) {
154 // start parsing a new character
155 utf8buflen = 0;
156
157 // fast common case
158 while (len > 3) {
159 realcharlen = parse_utf8_char (here, end, &c);
160 output.push_back (c);
161 here += realcharlen;
162 len -= realcharlen;
163 }
164
165 utf8buf[utf8buflen++] = *here;
166 ++here;
167 --len;
168 charlen = getutf8charlen ();
169
170 } else if (utf8buflen < charlen) {
171 // assumes charlen is always less than MAXUTF8CHARLEN
172 utf8buf[utf8buflen++] = *here;
173 ++here;
174 --len;
175 }
176
177 if (utf8buflen == charlen) {
178 // got a complete character
179 realcharlen = parse_utf8_char (utf8buf, &utf8buf[utf8buflen-1], &c);
180 output.push_back (c);
181
182 // move any unparsed characters. If an error occurred some of
183 // the characters might be unused.
184 int i;
185 int diff = utf8buflen - realcharlen;
186 for (i=0; i < diff; ++i) utf8buf[i] = utf8buf[i+diff];
187 utf8buflen = diff;
188 charlen = getutf8charlen ();
189 }
190 }
191
192 start = (char *)here; // save current position
193
194 if (utf8buflen == 0) status = finished;
195 else status = stopped;
196}
197
198
199// returns the length that the current contents of the
200// utf8buf should be
201size_t utf8inconvertclass::getutf8charlen () {
202 if (utf8buflen == 0) return 0;
203
204 // one byte character
205 if (utf8buf[0] < 0x80) return 1;
206
207 // error, is not the start of a utf-8 character
208 if (utf8buf[0] < 0xc0) return 1;
209
210 // two bute character
211 if (utf8buf[0] < 0xe0) return 2;
212
213 // three byte character
214 if (utf8buf[0] < 0xf0) return 3;
215
216 // error, character too long for unicode
217 return 1;
218}
219
220
221void utf8outconvertclass::reset () {
222 input = NULL;
223 outs = NULL;
224 utf8buflen = 0;
225 utf8bufhere = 0;
226}
227
228// note that convert does not null-terminate the
229// output array of characters
230void utf8outconvertclass::convert (char *output, size_t maxlen,
231 size_t &len, status_t &status) {
232 if (input == NULL || output == NULL) {
233 if (utf8buflen == 0) status = finished;
234 else status = unfinished;
235 return;
236 }
237
238 // don't want any funny sign conversions happening
239 unsigned char *uoutput = (unsigned char *)output;
240 text_t::iterator textend = input->end();
241 len = 0;
242 while (len < maxlen) {
243 // empty the contents of the internal buffer
244 if (utf8buflen > 0) {
245 while (len < maxlen && utf8bufhere < utf8buflen) {
246 *uoutput = utf8buf[utf8bufhere];
247 ++uoutput;
248 ++len;
249 ++utf8bufhere;
250 }
251
252 if (utf8bufhere == utf8buflen) {
253 utf8bufhere = 0;
254 utf8buflen = 0;
255 }
256 }
257
258 // fill up the buffer with the next character
259 if (utf8buflen == 0) {
260 if (texthere == textend) break; // finished!
261 if (!rzws || (*texthere != 0x200b))
262 utf8buflen = output_utf8_char (*texthere, utf8buf,
263 &utf8buf[MAXUTF8CHARLEN-1]);
264 ++texthere;
265 utf8bufhere = 0;
266 }
267 }
268
269 if (texthere == textend && utf8buflen == 0) status = finished;
270 else status = unfinished;
271}
272
273
274
275
276
277
278mapdata_t::mapdata_t () {
279
280 // reset all the map ptrs to be NULL
281 for (int i=0; i<256; ++i) {
282 ptrs[i] = (unsigned short *)NULL;
283 }
284
285 // say nothing has been loaded
286 loaded = false;
287}
288
289
290mapconvert::mapconvert () {
291 absentc = 0;
292}
293
294// setmapfile will cause loadmapfile to be called when conversion is
295// needed
296bool mapconvert::setmapfile (const text_t &themapfile, unsigned short theabsentc) {
297 // check to see if the mapfile has been already loaded
298 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
299
300 unloadmapfile ();
301 mapfile = themapfile;
302 absentc = theabsentc;
303
304 return true;
305}
306
307
308
309// loadmapfile should be called before any conversion is done
310bool mapconvert::loadmapfile (const text_t &themapfile,
311 unsigned short theabsentc) {
312 FILE *mapfilein = (FILE *)NULL;
313
314 // check to see if the mapfile has been already loaded
315 if (mapdata.loaded && mapfile == themapfile && absentc == theabsentc) return true;
316
317 unloadmapfile ();
318 mapfile = themapfile;
319 absentc = theabsentc;
320
321 // open the map file
322 char *cfilename = mapfile.getcstr();
323 if (cfilename == (char *)NULL) return false;
324 mapfilein = fopen(cfilename, "rb");
325 delete []cfilename; cfilename = NULL;
326
327 if (mapfilein == (FILE *)NULL) return false;
328
329 unsigned char c, n1, n2;
330 unsigned short *arrptr;
331 int i;
332 c = fgetc (mapfilein);
333 while (!feof (mapfilein)) {
334 if (mapdata.ptrs[c] == (unsigned short *)NULL) {
335 // allocate a new array
336 arrptr = new unsigned short[256];
337 mapdata.ptrs[c] = arrptr;
338 } else arrptr = mapdata.ptrs[c];
339
340 // clear the array
341 for (i=0; i<256; ++i) arrptr[i] = 0;
342
343 // read in this block
344 n1 = fgetc (mapfilein);
345 n2 = fgetc (mapfilein);
346 i=0;
347 while (!feof (mapfilein)) {
348 arrptr[i] = ((unsigned short)n1 << 8) | (unsigned short)n2;
349
350 ++i;
351 if (i >= 256) break;
352 n1 = fgetc (mapfilein);
353 n2 = fgetc (mapfilein);
354 }
355
356 c = fgetc (mapfilein);
357 }
358
359 mapdata.loaded = true;
360
361 return true;
362}
363
364void mapconvert::unloadmapfile () {
365 if (!mapdata.loaded) return;
366
367 for (int i=0; i<256; ++i) {
368 if (mapdata.ptrs[i] != (unsigned short *)NULL) {
369 delete [] mapdata.ptrs[i];
370 mapdata.ptrs[i] = (unsigned short *)NULL;
371 }
372 }
373
374 mapdata.loaded = false;
375}
376
377
378unsigned short mapconvert::convert (unsigned short c) {
379 if (!mapdata.loaded) {
380 if (!mapfile.empty() && loadmapfile (mapfile, absentc)) {
381 // do nothing, successfully loaded database
382 } else return absentc;
383 }
384
385 if (c == 0) return 0; // 0 always maps to 0...
386
387 unsigned short n1 = c >> 8;
388 unsigned short n2 = c & 0xff;
389
390 unsigned short *arrptr = mapdata.ptrs[n1];
391 if (arrptr == (unsigned short *)NULL) return absentc;
392
393 if (arrptr[n2] == 0) return absentc;
394 return arrptr[n2];
395}
396
397text_t mapconvert::convert (const text_t &instr) {
398 if (!mapdata.loaded) return absentc;
399
400 text_t outstr;
401 text_t::const_iterator here = instr.begin();
402 text_t::const_iterator end = instr.end();
403
404 while (here != end) {
405 outstr.push_back(this->convert(*here));
406 ++here;
407 }
408
409 return outstr;
410}
411
412
413
414
415mapinconvertclass::mapinconvertclass () {
416 m_multibyte = 0;
417 mapbuflen = 0;
418}
419
420void mapinconvertclass::reset () {
421 start = NULL;
422 len = 0;
423 mapbuflen=0;
424}
425
426void mapinconvertclass::convert (text_t &output, status_t &status) {
427 output.clear();
428
429 if (start == NULL || len == 0) {
430 if (mapbuflen == 0) status = finished;
431 else status = stopped;
432 return;
433 }
434
435 // don't want any funny sign conversions happening
436 unsigned char *here = (unsigned char *)start;
437
438 size_t charlen = getmapcharlen ();
439 while (len > 0) {
440 if (charlen == 0) {
441 // start parsing a new character
442 mapbuflen = 0;
443 mapbuf[mapbuflen++] = *here;
444 ++here;
445 --len;
446 charlen = getmapcharlen ();
447
448 } else if (mapbuflen < charlen) {
449 // assumes charlen is always less than MAXMAPCHARLEN
450 mapbuf[mapbuflen++] = *here;
451 ++here;
452 --len;
453 }
454
455 if (mapbuflen == charlen) {
456 // got a complete character
457 if (charlen == 1) {
458 if (mapbuf[0] < 0x80) {
459 // ascii character
460 output.push_back (mapbuf[0]);
461 } else {
462 output.push_back (converter.convert((unsigned short)mapbuf[0]));
463 }
464
465 } else {
466 // two byte character
467 output.push_back (converter.convert(((unsigned short)mapbuf[0] << 8) |
468 (unsigned short)mapbuf[1]));
469 }
470
471 mapbuflen = 0;
472 charlen = 0;
473 }
474 }
475
476 start = (char *)here; // save current position
477
478 if (mapbuflen == 0) status = finished;
479 else status = stopped;
480}
481
482
483
484mapoutconvertclass::mapoutconvertclass () {
485 m_multibyte = 0;
486 mapbuflen=0;
487 mapbufhere=0;
488}
489
490void mapoutconvertclass::reset () {
491 input = NULL;
492 outs = NULL;
493 mapbuflen = 0;
494 mapbufhere = 0;
495}
496
497// note that convert does not null-terminate the
498// output array of characters
499void mapoutconvertclass::convert (char *output, size_t maxlen,
500 size_t &len, status_t &status) {
501 unsigned short outc;
502
503 if (input == NULL || output == NULL) {
504 if (mapbuflen == 0) status = finished;
505 else status = unfinished;
506 return;
507 }
508
509 // don't want any funny sign conversions happening
510 unsigned char *uoutput = (unsigned char *)output;
511 text_t::iterator textend = input->end();
512 len = 0;
513 while (len < maxlen) {
514 // empty the contents of the internal buffer
515 if (mapbuflen > 0) {
516 while (len < maxlen && mapbufhere < mapbuflen) {
517 *uoutput = mapbuf[mapbufhere];
518 ++uoutput;
519 ++len;
520 ++mapbufhere;
521 }
522
523 if (mapbufhere == mapbuflen) {
524 mapbufhere = 0;
525 mapbuflen = 0;
526 }
527 }
528
529 // fill up the buffer with the next character
530 if (mapbuflen == 0) {
531 if (texthere == textend) break; // finished!
532 if (!rzws || (*texthere != 0x200b)) {
533 if (*texthere < 0x80) {
534 mapbuf[0] = (unsigned char)*texthere;
535 mapbuflen = 1;
536 } else {
537 outc = converter.convert (*texthere);
538 if (m_multibyte) {
539 mapbuf[0] = (unsigned char)(outc >> 8);
540 mapbuf[1] = (unsigned char)(outc & 0xff);
541 mapbuflen = 2;
542 } else {
543 mapbuf[0] = outc;
544 mapbuflen = 1;
545 }
546 }
547 }
548
549 ++texthere;
550 mapbufhere = 0;
551 }
552 }
553
554 if (texthere == textend && mapbuflen == 0) status = finished;
555 else status = unfinished;
556}
557
558
559bool simplemapconvert::loadmapfile (bool in) {
560 if (loaded) return true;
561 if (mapfile.empty()) return false;
562
563 char *cfilename = mapfile.getcstr();
564#ifdef GSDL_USE_IOS_H
565 ifstream mapfilein (cfilename, ios::in | ios::nocreate);
566#else
567 ifstream mapfilein (cfilename, ios::in);
568#endif
569 delete []cfilename;
570 if (!mapfilein) return false;
571
572 char cline[2048];
573 text_t line;
574
575 while (!mapfilein.eof()) {
576 mapfilein.getline (cline, 2048);
577 line.clear();
578 line.appendcstr (cline);
579 if (line.empty()) continue;
580 // remove comments
581 text_t::iterator end = line.end();
582 text_t::iterator here = findchar (line.begin(), end, '#');
583 if (here != end) {
584 line.erase (here, end);
585 if (line.empty()) continue;
586 }
587
588 text_tarray parts;
589 splitchar (line.begin(), line.end(), '\t', parts);
590
591 // do some simple sanity checks
592 if (parts.size() < 2) continue;
593 text_t::iterator begin1 = parts[0].begin();
594 text_t::iterator begin2 = parts[1].begin();
595 if (*begin1 != '0' || *(begin1+1) != 'x') continue;
596 if (*begin2 != '0' || *(begin2+1) != 'x') continue;
597 char *from = parts[0].getcstr();
598 char *to = parts[1].getcstr();
599 unsigned int f = 0, t = 0;
600 sscanf (from, "%i", &f);
601 sscanf (to, "%i", &t);
602 delete []from;
603 delete []to;
604
605 if (in) mapping[(unsigned short)f] = (unsigned short)t;
606 else mapping[(unsigned short)t] = (unsigned short)f;
607 }
608
609 loaded = true;
610 return true;
611}
612
613unsigned short simplemapconvert::convert (unsigned short c, bool in) {
614
615 if (!loaded)
616 if (!loadmapfile(in)) return absentc;
617
618 return mapping[c];
619}
620
621
622void simplemapinconvertclass::convert (text_t &output, status_t &status) {
623 output.clear();
624
625 if (start == NULL || len == 0) {
626 status = finished;
627 return;
628 }
629
630 // don't want any funny sign conversions happening
631 unsigned char *here = (unsigned char *)start;
632 while (len > 0) {
633
634 if (*here < 0x80)
635 output.push_back (*here); // append this character
636 else
637 output.push_back (converter.convert(*here, true));
638
639 ++here;
640 --len;
641 }
642
643 start = (char *)here; // save current position
644 status = finished;
645}
646
647
648void simplemapoutconvertclass::convert (char *output, size_t maxlen,
649 size_t &len, status_t &status) {
650
651 if (input == NULL || output == NULL) {
652 status = finished;
653 return;
654 }
655
656 // don't want any funny sign conversions happening
657 unsigned char *uoutput = (unsigned char *)output;
658 text_t::iterator textend = input->end();
659 len = 0;
660 while ((len < maxlen) && (texthere != textend)) {
661
662 if (*texthere < 0x80) *uoutput = (unsigned char)(*texthere);
663 else *uoutput = converter.convert (*texthere, false);
664
665 ++uoutput;
666 ++len;
667 ++texthere;
668 }
669
670 if (texthere == textend) status = finished;
671 else status = unfinished;
672}
Note: See TracBrowser for help on using the repository browser.