/**************************************************************************
 *
 * TextGet.cpp -- Decompressing the text
 * Copyright (C) 1999 Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

// is important to be first, so we escape the truncation warning on VC++
#include "TextGet.h"

// need this to avoid bizarre compiler problems under VC++ 6.0
// NOTE(review): the include directive below has no header name -- it looks
// like an angle-bracketed name (e.g. <iostream>) was stripped by a text
// conversion tool.  Restore it from the original source before building
// with __WIN32__ defined.
#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
# include
#endif

#include "mg_files.h"
#include "netorder.h"
#include "mg_errors.h"
#include "locallib.h"
#include "words.h"
#include "local_strings.h"
#include "bitio_m_stdio.h"

// Selects which Huffman model Load_Comp_HuffData fills in:
// "lengths" = the novel-word length model, "chars" = the novel-word
// character model.
typedef enum huff_type {lengths, chars};


/* Load the auxiliary dictionary (the fragments used for "novel" words)
   from text_aux_dict.  The loop runs once for each of the two MG
   lexicons (i = 0, 1).  Fragments are stored back to back as
   length-prefixed byte strings; words[i][j] is built as an index into
   that pool.  Returns a newly allocated auxiliary_dict, or NULL with
   mg_errno = MG_NOMEM when allocation fails.

   NOTE(review): on the error paths only `ad` itself is deleted, so any
   arrays already attached to it leak; fread return values are not
   checked, so a truncated file yields an uninitialised dictionary. */
static auxiliary_dict *
LoadAuxDict (compression_dict &cd, FILE *text_aux_dict)
{
  auxiliary_dict *ad;
  int i;

  if (!(ad = new auxiliary_dict))
    {
      mg_errno = MG_NOMEM;
      return (NULL);
    }
  memset (ad, '\0', sizeof (*ad));

  for (i = 0; i <= 1; ++i)
    {
      int j;
      u_char *pos;

      // fixed-size header: fragment count and pool size for lexicon i
      fread (&ad->afh[i], sizeof (aux_frags_header), 1, text_aux_dict);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(ad->afh[i].num_frags);
      NTOHUL(ad->afh[i].mem_for_frags);

      if (!(ad->word_data[i] = new u_char[ad->afh[i].mem_for_frags]))
	{
	  mg_errno = MG_NOMEM;
	  delete ad;
	  return (NULL);
	}
      if (!(ad->words[i] = new u_char* [ad->afh[i].num_frags]))
	{
	  mg_errno = MG_NOMEM;
	  delete ad;
	  return (NULL);
	}

      // read the whole fragment pool, then index it: each fragment is a
      // one-byte length followed by that many bytes (hence *pos + 1)
      fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char),
	     text_aux_dict);
      pos = ad->word_data[i];
      for (j = 0; j < (int)ad->afh[i].num_frags; ++j)
	{
	  ad->words[i][j] = pos;
	  pos += *pos + 1;
	}

      if (cd.cdh.novel_method == MG_NOVEL_HYBRID)
	{
	  // build the 33-entry block table used by hybrid novel-word
	  // coding: block 0 covers the stored words, and each later
	  // block spans twice the range of the previous one
	  int num;
	  num = 1;
	  ad->blk_start[i][0] = 0;
	  ad->blk_end[i][0] = cd.cdh.num_words[i] - 1;
	  while (num < 33)
	    {
	      ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
	      ad->blk_end[i][num] = ad->blk_start[i][num] +
		(ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
	      ++num;
	    }
	}
    }
  return (ad);
}


/* Read the front-coded word entries belonging to one comp_frags_header
   from dict and build the two-level lookup table used while decoding:
   values[codelen][n >> lookback] points at the base entry of the n'th
   group of words with that code length.  Only every 2^lookback'th word
   is stored in full (length byte + bytes); the words in between are
   stored as a packed nibble pair -- high nibble = bytes shared with the
   previous word, low nibble = suffix length -- followed by the suffix,
   so the decoder steps forward from the group base.

   On disk each entry is itself front-coded: a nibble pair (prefix-copy
   length, suffix length) followed by suffix bytes, with the prefix taken
   from the previously read word (the reused `word` buffer).

   If escape is non-NULL it receives a pointer to the entry for the last
   code (the escape symbol of partial/seed dictionaries).  Frees
   cfh->hd.clens when done.  Returns the table, or NULL on allocation
   failure.

   NOTE(review): on the NULL returns, allocations made earlier in this
   function leak; getc/fread results are unchecked. */
static u_char ***
ReadInWords (FILE *dict, compression_dict &cd,
	     comp_frags_header *cfh, u_char **escape)
{
  int i, lookback;
  int ptrs_reqd = 0;
  int mem_reqd = 0;
  int num_set[MAX_HUFFCODE_LEN + 1];
  u_char *next_word[MAX_HUFFCODE_LEN + 1];
  u_char **vals;
  u_char ***values;
  u_char word[MAXWORDLEN + 1];
  u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];

  lookback = cd.cdh.lookback;

  // work out how many group pointers and how many bytes of word storage
  // are needed across all code lengths
  for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; ++i)
    {
      ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
      mem_reqd += cfh->huff_words_size[i];
    }

  if (!(vals = new u_char* [ptrs_reqd]))
    return (NULL);
  if (!(values = new u_char** [MAX_HUFFCODE_LEN + 1]))
    return (NULL);
  if (!(next_word[0] = new u_char[mem_reqd]))
    return (NULL);

  cd.MemForCompDict += ptrs_reqd * sizeof (*vals) +
    (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
    mem_reqd;

  // carve the flat `vals` array and the word pool into per-code-length
  // slices
  values[0] = vals;
  values[0][0] = next_word[0];
  for (i = 1; i <= cfh->hd.maxcodelen; ++i)
    {
      int next_start = (values[i - 1] - vals) +
	((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
      values[i] = &vals[next_start];
      next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
      values[i][0] = next_word[i];
    }

  memset (num_set, '\0', sizeof (num_set));

  for (i = 0; i < cfh->hd.num_codes; ++i)
    {
      register int val, copy;
      register int len = cfh->hd.clens[i];

      // decode the on-disk entry into `word` (full form: length + bytes)
      val = getc (dict);
      copy = (val >> 4) & 0xf;
      val &= 0xf;
      fread (word + copy + 1, sizeof (u_char), val, dict);
      *word = val + copy;

      if ((num_set[len] & ((1 << lookback) - 1)) == 0)
	{
	  // group base: store the word in full and record its pointer
	  values[len][num_set[len] >> lookback] = next_word[len];
	  memcpy (next_word[len], word, *word + 1);
	  if (escape && i == cfh->hd.num_codes - 1)
	    *escape = next_word[len];
	  next_word[len] += *word + 1;
	}
      else
	{
	  // within a group: re-front-code against the previous word of
	  // the same code length
	  copy = prefixlen (last_word[len], word);
	  memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
	  *next_word[len] = (copy << 4) + (*word - copy);
	  if (escape && i == cfh->hd.num_codes - 1)
	    *escape = next_word[len];
	  next_word[len] += (*word - copy) + 1;
	}
      memcpy (last_word[len], word, *word + 1);
      ++num_set[len];
    }

  // the code lengths are only needed while reading; release them
  if (cfh->hd.clens)
    delete []cfh->hd.clens;
  cfh->hd.clens = NULL;

  return values;
}


/* Read one Huffman model from dict, generate its decode value arrays,
   and attach both to cd -- in the chars_* slots or the lens_* slots of
   lexicon `which` depending on `type`.  Returns 0 on success, or 1..3
   identifying the step that failed (allocation, read, value
   generation).

   NOTE(review): `hd` leaks on the return-2 and return-3 paths. */
static int
Load_Comp_HuffData (compression_dict &cd, int which, FILE *dict,
		    huff_type type)
{
  huff_data * hd;
  u_long ** vals;

  if (!(hd = new huff_data))
    return 1;
  cd.MemForCompDict += sizeof (huff_data);
  if (Read_Huffman_Data (dict, hd, &cd.MemForCompDict, NULL) == -1)
    return 2;
  if (!(vals = Generate_Huffman_Vals (hd, &cd.MemForCompDict)))
    return 3;

  // code lengths are no longer needed once the value table exists
  if (hd->clens)
    delete []hd->clens;
  hd->clens = NULL;

  if (type == chars)
    {
      cd.chars_huff[which] = hd;
      cd.chars_vals[which] = vals;
    }
  else
    {
      cd.lens_huff[which] = hd;
      cd.lens_vals[which] = vals;
    }
  return 0;
}


/* Read the comp_frags_header for lexicon `which` and its word entries
   (via ReadInWords).  When getEscape is non-zero the escape-word pointer
   is captured into cd.escape[which].  Returns 0 on success, or 1..3
   identifying the failing step.

   NOTE(review): cd.cfh[which] leaks on the return-2/3 paths. */
static int
Load_Comp_FragsHeader (compression_dict &cd, int which, int getEscape,
		       FILE *dict)
{
  if (!(cd.cfh[which] = new comp_frags_header))
    return 1;
  cd.MemForCompDict += sizeof (*cd.cfh[which]);
  if (Read_cfh (dict, cd.cfh[which], &cd.MemForCompDict, NULL) == -1)
    return 2;

  if (!(cd.values[which] = ReadInWords (dict, cd, cd.cfh[which],
					getEscape == 0 ? NULL :
					&cd.escape[which])))
    return 3;

  return 0;
}


/* Load a compression dictionary the "slow" way, from the standard
   dictionary file `dict` (plus `aux_dict` when the novel-word method
   needs one).  `cd` is zeroed first and then filled in according to the
   dictionary type in its header; each per-type loader runs once for each
   of the two lexicons (which = 0, 1).  Returns false on any read or
   allocation failure.  On success cd.fast_loaded is cleared to mark that
   this was not a fast-image load. */
static bool
LoadSlowCompDict (FILE *dict, FILE *aux_dict, compression_dict &cd)
{
  if (dict == NULL)
    return false;

  int which;

  memset (&cd, '\0', sizeof (compression_dict));
  cd.MemForCompDict = sizeof (compression_dict);

  if (Read_cdh (dict, &cd.cdh, &cd.MemForCompDict, NULL) == -1)
    return false;

  for (which = 0; which < 2; ++which)
    switch (cd.cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
	{
	  // every word is in the dictionary; no escape code is needed
	  if (Load_Comp_FragsHeader (cd, which, 0, dict) != 0)
	    return false;
	  cd.escape[which] = NULL;
	}
	break;
      case MG_PARTIAL_DICTIONARY:
	{
	  // stored words (with escape) plus char/length models for the
	  // words that are not stored
	  if (cd.cdh.num_words[which])
	    {
	      if (Load_Comp_FragsHeader (cd, which, 1, dict) != 0)
		return false;
	    }
	  if (Load_Comp_HuffData (cd, which, dict, chars) != 0)
	    return false;
	  if (Load_Comp_HuffData (cd, which, dict, lengths) != 0)
	    return false;
	}
	break;
      case MG_SEED_DICTIONARY:
	{
	  if (cd.cdh.num_words[which])
	    {
	      if (Load_Comp_FragsHeader (cd, which, 1, dict) != 0)
		return false;
	    }
	  // delta/hybrid novel methods keep their data in the auxiliary
	  // dictionary instead of extra Huffman models
	  switch (cd.cdh.novel_method)
	    {
	    case MG_NOVEL_HUFFMAN_CHARS:
	      if (Load_Comp_HuffData (cd, which, dict, chars) != 0)
		return false;
	      if (Load_Comp_HuffData (cd, which, dict, lengths) != 0)
		return false;
	      break;
	    case MG_NOVEL_DELTA:
	      break;
	    case MG_NOVEL_HYBRID:
	      break;
	    }
	  break;
	}
      }

  if (cd.cdh.novel_method == MG_NOVEL_DELTA ||
      cd.cdh.novel_method == MG_NOVEL_HYBRID)
    {
      if (!aux_dict)
	{
	  mg_errno = MG_NOFILE;
	  cd.Clear();
	  return false;
	}
      if (!(cd.ad = LoadAuxDict (cd, aux_dict)))
	{
	  cd.Clear();
	  return false;
	}
    }

  mg_errno = MG_NOERROR;
  cd.fast_loaded = 0;
  return true;
}


/* Index of the pointer-sized slot that `p` occupies, counting from
   `base`. */
#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))

/* True when the fixup bitmap flags slot `p` (relative to `cd`) as a
   stored offset that LoadFastCompDict must rebase into a pointer.
   Relies on locals `fixup` and `cd` being in scope at the use site. */
#define IS_FIXUP(p) ((fixup[WORDNO(p,cd)/8] & (1<<(WORDNO(p,cd) & 7))) != 0)

// fast loading really needs to be totally re-writen. "Unloading" the
// text data will currently cause a crash because memory is being
// deleted multiple times (and probably a zillion other reasons).
/* Load a pre-built ("fast") compression dictionary image.  The file
   layout is: total image size, fixup bitmap size, the raw image, then
   the bitmap.  Every pointer-sized slot flagged in the bitmap holds a
   file offset, which is byte-swapped and rebased onto the malloc'd
   image; the remaining header and table fields are then byte-swapped in
   place.  Finally the image's top-level struct is shallow-copied into
   _cd, so _cd's interior pointers point into the malloc'd image, which
   is never freed (see the comments above and below -- this path is a
   known hack).

   NOTE(review): fread return values are unchecked, and the malloc'd
   image leaks if the fixup allocation fails. */
static bool
LoadFastCompDict (FILE *text_fast_comp_dict, compression_dict &_cd)
{
  if (text_fast_comp_dict == NULL)
    return false;

  u_long *p, *end;
  u_char *fixup;
  u_long mem;
  u_long fixup_mem;
  int i;

  /* [RPAP - Jan 97: Endian Ordering] */
  fread (&mem, sizeof (mem), 1, text_fast_comp_dict);
  NTOHUL(mem);
  fread (&fixup_mem, sizeof (fixup_mem), 1, text_fast_comp_dict);
  NTOHUL(fixup_mem);

  compression_dict *cd;
  if (!(cd = (compression_dict *)malloc (mem)))
    {
      mg_errno = MG_NOMEM;
      return false;
    }
  end = (u_long *) (((u_char *) cd) + mem);

  fread (cd, sizeof (u_char), mem, text_fast_comp_dict);

  if (!(fixup = new u_char[fixup_mem]))
    {
      mg_errno = MG_NOMEM;
      return false;
    }
  fread (fixup, fixup_mem, sizeof (u_char), text_fast_comp_dict);

  // walk the image one pointer-sized slot at a time, turning flagged
  // offsets back into real pointers
  for (p = (u_long *) cd; (u_long) p < (u_long) end; ++p)
    if (IS_FIXUP (p))
      {
	NTOHUL(*p);	/* [RPAP - Jan 97: Endian Ordering] */
	*p = *p + (u_long) cd;
      }

  /* [RPAP - Jan 97: Endian Ordering] */
  /* cdh */
  NTOHUL(cd->cdh.dict_type);
  NTOHUL(cd->cdh.novel_method);
  for (i = 0; i < TEXT_PARAMS; ++i)
    NTOHUL(cd->cdh.params[i]);
  NTOHUL(cd->cdh.num_words[0]);
  NTOHUL(cd->cdh.num_words[1]);
  NTOHUL(cd->cdh.num_word_chars[0]);
  NTOHUL(cd->cdh.num_word_chars[1]);
  NTOHUL(cd->cdh.lookback);
  /* cfh */
  for (i = 0; i <= 1; ++i)
    {
      int j;
      NTOHSI(cd->cfh[i]->hd.num_codes);
      NTOHSI(cd->cfh[i]->hd.mincodelen);
      NTOHSI(cd->cfh[i]->hd.maxcodelen);
      for (j = 0; j < MAX_HUFFCODE_LEN + 1; ++j)
	{
	  NTOHSI(cd->cfh[i]->hd.lencount[j]);
	  NTOHUL(cd->cfh[i]->hd.min_code[j]);
	}
      NTOHUL(cd->cfh[i]->uncompressed_size);
      for (j = 0; j < MAX_HUFFCODE_LEN + 1; ++j)
	NTOHUL(cd->cfh[i]->huff_words_size[j]);
    }
  NTOHUL(cd->MemForCompDict);
  /* ad -- only present for the delta/hybrid novel methods */
  if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
      cd->cdh.novel_method == MG_NOVEL_HYBRID)
    for (i = 0; i <= 1; ++i)
      {
	int j;
	NTOHUL(cd->ad->afh[i].num_frags);
	NTOHUL(cd->ad->afh[i].mem_for_frags);
	for (j = 0; j < 33; ++j)
	  {
	    NTOHSI(cd->ad->blk_start[i][j]);
	    NTOHSI(cd->ad->blk_end[i][j]);
	  }
      }
  NTOHSI(cd->fast_loaded);

  delete []fixup;

  // the whole fast comp dict is a bit of a hack so I don't
  // feel too bad about the next line :-) -- Rodger.
  _cd = *cd;

  return true;
}


/* Load the compression dictionary, preferring the fast pre-built image
   when that file is available; otherwise fall back to the slow
   (standard + auxiliary) loader. */
static bool
LoadCompDict (FILE *compDictFile, FILE *auxDictFile,
	      FILE *fastCompDictFile, compression_dict &cd)
{
  // see if we have a fast loading compression dictionary
  if (fastCompDictFile != NULL)
    return LoadFastCompDict (fastCompDictFile, cd);

  // slow compression dictionary
  return LoadSlowCompDict (compDictFile, auxDictFile, cd);
}


// try to open the dictionary files and load the dictionary.
// Every file opened here is closed again before returning.
static bool
OpenLoadCompDict (char *textname, compression_dict &cd)
{
  FILE *compDictFile = NULL;
  FILE *auxDictFile = NULL;
  FILE *fastCompDictFile = NULL;

  // the fast image supersedes the standard + auxiliary pair
  fastCompDictFile = open_file (textname, TEXT_DICT_FAST_SUFFIX,
				"rb", MAGIC_FAST_DICT, MG_CONTINUE);
  if (fastCompDictFile == NULL)
    {
      compDictFile = open_file (textname, TEXT_DICT_SUFFIX,
				"rb", MAGIC_DICT, MG_MESSAGE);
      auxDictFile = open_file (textname, TEXT_DICT_AUX_SUFFIX,
			       "rb", MAGIC_AUX_DICT, MG_CONTINUE);
    }

  bool res = LoadCompDict (compDictFile, auxDictFile, fastCompDictFile, cd);

  if (compDictFile != NULL)
    fclose (compDictFile);
  if (auxDictFile != NULL)
    fclose (auxDictFile);
  if (fastCompDictFile != NULL)
    fclose (fastCompDictFile);

  return res;
}


/* Read the document-level information for `textname`: open the text
   level file, skip the magic number (one u_long), and let `levels` read
   itself.  Returns false if the file can't be opened or the read
   fails. */
static bool
LoadLevels (char *textname, FTextLevel &levels)
{
  FILE *levelFile = NULL;

  // open the text level file
  levelFile = open_file (textname, TEXT_LEVEL_SUFFIX,
			 "rb", MAGIC_TEXT_LEVELS, MG_CONTINUE);
  if (levelFile == NULL)
    return false;

  // seek to the appropriate place and read the level information
  bool res = ((fseek (levelFile, sizeof (u_long), SEEK_SET) == 0) &&
	      levels.Read (levelFile));

  // close the file
  fclose (levelFile);

  return res;
}


// Constructor: put the file pointers in a known state before Clear()
// runs, so Clear() never touches garbage.
TextData::TextData ()
{
  // put file pointers in known state first
  textFile = NULL;
  textIdxFile = NULL;
  Clear ();
}


// Reset all members to their empty state.  Note this does NOT close any
// open files -- UnloadData() does that before calling Clear().
void TextData::Clear ()
{
  cd.Clear();
  textFile = NULL;
  textIdxFile = NULL;
  cth.Clear();
  levels.Clear();
}


/* Open and load everything needed to extract text for collection
   `textname` under `basepath`: the compression dictionary, the
   compressed text file, the text index file, the compressed text header
   (after the magic number), and the level information.  Returns false on
   the first failure.

   NOTE(review): on a mid-sequence failure, files opened earlier stay
   open until UnloadData() is called. */
bool TextData::LoadData (char *basepath, char *textname)
{
  if (textname[0] == '\0')
    return false;

  // set the basepath
  set_basepath(basepath);

  // load the compression dictionary
  if (!OpenLoadCompDict (textname, cd))
    return false;

  // open the compressed text and text index file
  textFile = open_file (textname, TEXT_SUFFIX,
			"rb", MAGIC_TEXT, MG_CONTINUE);
  if (textFile == NULL)
    return false;

  textIdxFile = open_file (textname, TEXT_IDX_SUFFIX,
			   "rb", MAGIC_TEXI, MG_CONTINUE);
  if (textIdxFile == NULL)
    return false;

  // read in the compressed text header
  if ((fseek (textFile, sizeof (u_long), SEEK_SET) != 0) ||
      !cth.Read (textFile))
    return false;

  // read in the level information
  if (!LoadLevels (textname, levels))
    return false;

  return true;
}


// Close the open files and reset all state.  Always returns true.
bool TextData::UnloadData ()
{
  // close any open files
  if (textFile != NULL)
    {
      fclose (textFile);
      textFile = NULL;
    }
  if (textIdxFile != NULL)
    {
      fclose (textIdxFile);
      textIdxFile = NULL;
    }

  // do general clear
  Clear ();

  return true;
}


/* Look up the text-index entry for document `docNum` at level
   `docLevel` and fill in `docIdx`.  Returns false when the index file
   isn't open or the index read fails. */
bool GetDocIdx (TextData &td, const UCArray &docLevel,
		unsigned long docNum, TextIdx &docIdx)
{
  // make sure the text index file was opened successfully
  if (td.textIdxFile == NULL)
    return false;

  // read in the index
  TextLevelInfo &levelInfo = td.levels.levelInfo[docLevel];
  if (!docIdx.Read (td.textIdxFile, levelInfo, docNum))
    return false;

  return true;
}


/* Canonical-Huffman decode one symbol from the bit stream: accumulate
   bits until the code drops below the next min_code entry, then report
   the code length in `len` and the symbol offset within that length in
   `code`.  Relies on a local `buffer` (stdio_bitio_buffer) being in
   scope at the point of use. */
#define MY_HUFF_DECODE(len, code, mcodes)				\
  do {									\
    register unsigned long *__min_code = (mcodes);			\
    register unsigned long *__mclen = __min_code;			\
    register unsigned long __code = 0;					\
    do									\
      {									\
	__code += __code + buffer.bit();				\
      }									\
    while (__code < *++__mclen);					\
    (len) = __mclen - __min_code;					\
    (code) = __code - *__mclen;						\
  } while(0);


/* Decompress document `docNum` at level `docLevel` into docText
   (replacing its contents).  Decoding alternates between the two
   lexicons (`which` flips each symbol) and keeps going until the number
   of bits between the document's start and end positions has been
   consumed.  For each symbol: decode a Huffman code into the front-coded
   word table; if it is the escape symbol, decode a novel word using
   whichever novel method the dictionary specifies (Huffman chars,
   delta, or hybrid via the auxiliary dictionary); otherwise reconstruct
   the word from its shared prefix and stored suffix.  Returns false only
   when the index lookup fails. */
bool GetDocText (TextData &td, const UCArray &docLevel,
		 unsigned long docNum, UCArray &docText)
{
  // erase the current text
  docText.erase (docText.begin(), docText.end());

  // look up the information about this document
  TextIdx docIdx;
  if (!GetDocIdx (td, docLevel, docNum, docIdx))
    return false;

  // do seek to appropriate position
  stdio_bitio_buffer buffer (td.textFile);
  buffer.seek (docIdx.start.byte, docIdx.start.bit);

  // decompress the document
  compression_dict &cd = td.cd;
  auxiliary_dict *ad = cd.ad;
  int which = docIdx.which;
  unsigned long num_bits = (docIdx.end.byte*8+(8-docIdx.end.bit)) -
    (docIdx.start.byte*8+(8-docIdx.start.bit));
  unsigned long bits = 0;

  // reserve upfront: num_bits is a cheap upper bound on the output size
  if (docText.capacity() < docText.size() + num_bits + 1)
    {
      docText.reserve(docText.size() + num_bits + 1);
    }

  // keep decoding bits until enough bits have been decoded
  while (bits < num_bits)
    {
      register unsigned code, len;
      register int r;
      register u_char *t, *b = NULL;
      u_char word[MAXWORDLEN + 1];

      if (cd.cfh[which])
	{
	  MY_HUFF_DECODE (len, code, cd.cfh[which]->hd.min_code);
	  bits += len;

	  // split the symbol into group base index and offset within
	  // the 2^lookback group
	  r = code & ((1 << cd.cdh.lookback) - 1);
	  t = cd.values[which][len][code >> cd.cdh.lookback];

	  /* step through from base pointer, undoing the front coding:
	     each in-group entry is (prefix len << 4 | suffix len)
	     followed by the suffix bytes */
	  b = word + 1;
	  while (r--)
	    {
	      register int copy = *t >> 4;
	      memcpy (word + copy + 1, t + 1, *t & 0xf);
	      word[0] = copy + (*t & 0xf);
	      t += ((*t) & 0xf) + 1;
	    }
	}
      else
	t = NULL;

      if (t == cd.escape[which])
	{
	  // escape symbol: the word is not in the dictionary
	  switch (cd.cdh.novel_method)
	    {
	    case MG_NOVEL_HUFFMAN_CHARS:
	      {
		// length model then one character model symbol per byte
		int len, i;
		int c;
		len = buffer.huff_decode(cd.lens_huff[which]->min_code,
					 cd.lens_vals[which], &bits);
		for (i = 0; i < len; ++i)
		  {
		    c = buffer.huff_decode(cd.chars_huff[which]->min_code,
					   cd.chars_vals[which], &bits);
		    docText.push_back (c);
		  }
	      }
	      break;
	    case MG_NOVEL_DELTA:
	    case MG_NOVEL_HYBRID:
	      {
		// both methods produce an index into the auxiliary
		// dictionary's fragment table
		int idx = 0, len;
		u_char *base;
		switch (cd.cdh.novel_method)
		  {
		  case MG_NOVEL_DELTA:
		    {
		      idx = buffer.delta_decode (&bits);
		      --idx;
		    }
		    break;
		  case MG_NOVEL_HYBRID:
		    {
		      // gamma-coded block number, then a binary code
		      // within that block's range
		      int k;
		      k = buffer.gamma_decode (&bits);
		      --k;
		      idx = buffer.binary_decode(ad->blk_end[which][k] -
						 ad->blk_start[which][k] + 1,
						 &bits);
		      idx += ad->blk_start[which][k] - 1;
		    }
		    break;
		  }
		// copy the length-prefixed fragment into the output
		base = ad->words[which][idx];
		len = *base++;
		for (; len; --len)
		  {
		    docText.push_back (*base++);
		  }
	      }
	      break;
	    }
	}
      else
	{
	  /* copy over the matching prefix */
	  r = (*t >> 4);
	  while (r--)
	    {
	      docText.push_back (*b++);
	    }

	  /* and the stored suffix */
	  r = ((*t) & 0xf);
	  while (r--)
	    {
	      docText.push_back (*++t);
	    }
	}
      // alternate between the word and non-word lexicons
      which = !which;
    }

  buffer.done();
  return true;
}