/**************************************************************************
 *
 * text.pass2.cpp -- Text compression (Pass 2)
 * Copyright (C) 1994  Neil Sharman, Gary Eddy and Alistair Moffat
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
#pragma warning(disable:4786)
#endif

// need this to avoid bizarre compiler problems under VC++ 6.0
#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
# include <iostream>
#endif

#if defined (__WIN32__)
# include <io.h>
# define unlink _unlink
#endif

#include "sysfuncs.h"
#include "memlib.h"
#include "messages.h"
#include "local_strings.h"
#include "bitio_m_stdio.h"
#include "huffman.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */

#include "mg.h"
#include "mg_files.h"
#include "build.h"
#include "words.h"
#include "text.h"
#include "hash.h"
#include "locallib.h"
#include "comp_dict.h"
#include "FText.h"


#define POOL_SIZE 1024*256

// a bump-allocated pool of length-prefixed strings; pools are
// chained together as they fill up
struct char_pool {
  struct char_pool *next;
  u_long left;
  u_char *ptr;
  u_char pool[POOL_SIZE];
};

struct novel_hash_rec {
  u_long ordinal_num;
  u_char *word;
};


#define INITIAL_HASH_SIZE 7927

struct novel_hash_table {
  novel_hash_rec *HashTable;
  u_long HashSize, HashUsed;
  char_pool *first_pool;
  char_pool *pool;
  u_long next_num, binary_start;
  novel_hash_rec **code_to_nhr;
};


static FILE *text;
static stdio_bitio_buffer textOutBuf;
static u_long headLen;

// hash tables for novel (out-of-dictionary) words
static novel_hash_table nht[2];

int blk_start[2][33], blk_end[2][33];  // 0 = non-word, 1 = word

// 0 = non-word, 1 = word
unsigned char whichWordType = 0;

CompressTextInfoMap textInfoMap;


static char_pool *new_pool (char_pool *pool) {
  char_pool *p = (char_pool *) Xmalloc (sizeof (char_pool));
  if (!p) FatalError (1, "Unable to allocate memory for pool");

  if (pool) pool->next = p;
  p->next = NULL;
  p->left = POOL_SIZE;
  p->ptr = p->pool;
  return p;
}
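/* A minimal sketch (not part of this file) of how the char_pool chain is
 * intended to be used: length-prefixed words are bump-allocated from the
 * current pool, and a fresh pool is chained on when the current one runs
 * out.  compress_text below inlines exactly this logic; the helper name
 * pool_store is hypothetical.
 */
#if 0
static u_char *pool_store (char_pool *&pool, const u_char *word) {
  u_long len = *word + 1;            // length byte plus the characters
  if (len > pool->left)              // current pool exhausted?
    pool = new_pool (pool);          // chain on a fresh one and switch to it
  u_char *dest = pool->ptr;
  memcpy (dest, word, len);          // copy the length-prefixed word
  pool->ptr += len;
  pool->left -= len;
  return dest;                       // stable pointer, safe to hash on
}
#endif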
int init_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
  char path[512];

  // load the compression dictionary
  if (LoadCompressionDictionary (make_name (fileName, TEXT_DICT_SUFFIX,
					    path)) == COMPERROR)
    return COMPERROR;

  // create the compressed text file and attach it to the text buffer
  if (!(text = create_file (fileName, TEXT_SUFFIX, "w+b",
			    MAGIC_TEXT, MG_MESSAGE)))
    return COMPERROR;
  textOutBuf.attachFile (text);
  textOutBuf.encodeStart ();

  // write out the compressed text header
  memset (&cth, '\0', sizeof (cth));
  if (fwrite (&cth, sizeof (cth), 1, text) != 1) return COMPERROR;

  // the length of the compressed header is the size of the
  // magic number plus the size of the compressed text header
  headLen = sizeof (u_long) + sizeof (cth);

  // start off with a non-word
  whichWordType = 0;

  // clear the text pointers
  textInfoMap.erase (textInfoMap.begin(), textInfoMap.end());

  // set up the hash tables of novel words
  int i;
  if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
    for (i = 0; i <= 1; ++i) {
      nht[i].HashSize = INITIAL_HASH_SIZE;
      nht[i].HashTable = (novel_hash_rec *)
	Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
      memset (nht[i].HashTable, '\0',
	      sizeof (novel_hash_rec) * nht[i].HashSize);
      nht[i].HashUsed = 0;
      nht[i].pool = nht[i].first_pool = new_pool (NULL);
      nht[i].next_num = 1;
      nht[i].binary_start = 1;
      nht[i].code_to_nhr = NULL;

      if (cdh.novel_method == MG_NOVEL_HYBRID) {
	// partition the ordinal numbers into 33 blocks: block 0 is as
	// big as the existing dictionary, and each later block is
	// roughly twice the size of the one before it
	int num = 1;
	blk_start[i][0] = 0;
	blk_end[i][0] = cdh.num_words[i] - 1;
	while (num < 33) {
	  blk_start[i][num] = blk_end[i][num - 1] + 1;
	  blk_end[i][num] = blk_start[i][num] +
	    (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
	  ++num;
	}
      }
    }

  return COMPALLOK;
}
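/* An illustrative restatement (not live code) of the MG_NOVEL_HYBRID
 * encoding that compress_text performs below, given the blocks set up in
 * init_text_2: a 0-based ordinal j falls into block k, and the encoder
 * emits k + 1 as a gamma code followed by the 1-based offset within the
 * block as a binary code bounded by the block size.
 */
#if 0
static void hybrid_encode_sketch (bitio_buffer &buffer, int which,
				  u_long ordinal_num) {
  int k = 0;
  int j = ordinal_num - 1;                 // 0-based ordinal
  while (j > blk_end[which][k]) ++k;       // find the enclosing block
  buffer.gamma_encode (k + 1, NULL);       // which block
  buffer.binary_encode (j - blk_start[which][k] + 1,   // offset in block
			blk_end[which][k] - blk_start[which][k] + 1,
			NULL);
}
#endif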
static int compress_text (bitio_buffer &buffer, const u_char *s_in,
			  int l_in, unsigned char &which,
			  unsigned long &numWords) {
  const u_char *end = s_in + l_in - 1;

  for (; s_in <= end; which = !which) {
    u_char Word[MAXWORDLEN + 1];
    int res;

    if (which) ++numWords;

    /* First parse a word or non-word out of the string */
    if (which)
      PARSE_WORD (Word, s_in, end);
    else
      PARSE_NON_WORD (Word, s_in, end);

    /* Search the hash table for Word */
    if (ht[which]) {
      register unsigned long hashval, step;
      register int tsize = ht[which]->size;
      register u_char **wptr;
      HASH (hashval, step, Word, tsize);

      for (;;) {
	register u_char *s1;
	register u_char *s2;
	register int len;
	wptr = ht[which]->table[hashval];
	if (wptr == NULL) {
	  res = COMPERROR;
	  break;
	}

	/* Compare the words */
	s1 = Word;
	s2 = *wptr;
	len = *s1 + 1;
	for (; len; --len)
	  if (*s1++ != *s2++) break;

	if (len) {
	  // collision: take another double-hashing probe step
	  hashval += step;
	  if (hashval >= (unsigned long) tsize) hashval -= tsize;
	} else {
	  res = ht[which]->table[hashval] - ht[which]->words;
	  break;
	}
      }
    } else
      res = COMPERROR;

    /* Check that the word was found in the dictionary */
    if (res == COMPERROR) {
      if (cdh.dict_type == MG_COMPLETE_DICTIONARY) {
	Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	return COMPERROR;
      }

      if (cdh.dict_type == MG_PARTIAL_DICTIONARY) {
	u_long i;
	if (ht[which]) {
	  // emit the escape code, then spell the word out
	  res = ht[which]->hd->num_codes - 1;
	  buffer.huff_encode (res, ht[which]->codes,
			      ht[which]->hd->clens, NULL);
	}
	buffer.huff_encode (Word[0], lens_codes[which],
			    lens_huff[which].clens, NULL);
	for (i = 0; i < Word[0]; ++i)
	  buffer.huff_encode (Word[i + 1], char_codes[which],
			      char_huff[which].clens, NULL);
      }

      if (cdh.dict_type == MG_SEED_DICTIONARY) {
	if (ht[which]) {
	  // emit the escape code before the novel word
	  res = ht[which]->hd->num_codes - 1;
	  buffer.huff_encode (res, ht[which]->codes,
			      ht[which]->hd->clens, NULL);
	}
	switch (cdh.novel_method) {
	case MG_NOVEL_HUFFMAN_CHARS:
	  {
	    u_long i;
	    buffer.huff_encode (Word[0], lens_codes[which],
				lens_huff[which].clens, NULL);
	    for (i = 0; i < Word[0]; ++i)
	      buffer.huff_encode (Word[i + 1], char_codes[which],
				  char_huff[which].clens, NULL);
	  }
	  break;
	case MG_NOVEL_DELTA:
	case MG_NOVEL_HYBRID:
	  {
	    register unsigned long hashval, step;
	    register novel_hash_table *h = &nht[which];
	    register int hsize = h->HashSize;
	    register novel_hash_rec *ent;
	    HASH (hashval, step, Word, hsize);

	    for (;;) {
	      register u_char *s1, *s2;
	      register int len;
	      ent = h->HashTable + hashval;
	      if (!ent->word) {
		// novel word seen for the first time: store it in the
		// string pool and give it the next ordinal number
		int wlen = *Word + 1;
		if ((unsigned) wlen > h->pool->left)
		  h->pool = new_pool (h->pool);
		ent->word = h->pool->ptr;
		ent->ordinal_num = h->next_num++;
		memcpy (h->pool->ptr, Word, wlen);
		h->pool->ptr += wlen;
		h->pool->left -= wlen;
		++h->HashUsed;
		break;
	      }

	      /* Compare the words */
	      s1 = Word;
	      s2 = ent->word;
	      len = *s1 + 1;
	      for (; len; --len)
		if (*s1++ != *s2++) break;
	      if (!len) break;

	      hashval = (hashval + step);
	      if (hashval >= (unsigned) hsize) hashval -= hsize;
	    }

	    switch (cdh.novel_method) {
	    case MG_NOVEL_DELTA:
	      buffer.delta_encode (ent->ordinal_num, NULL);
	      break;
	    case MG_NOVEL_HYBRID:
	      {
		int k = 0;
		int j = ent->ordinal_num - 1;
		while (j > blk_end[which][k]) ++k;
		assert (j - blk_start[which][k] + 1 >= 1 &&
			j - blk_start[which][k] + 1 <=
			blk_end[which][k] - blk_start[which][k] + 1);
		buffer.gamma_encode (k + 1, NULL);
		buffer.binary_encode (j - blk_start[which][k] + 1,
				      blk_end[which][k] -
				      blk_start[which][k] + 1, NULL);
	      }
	      break;
	    }

	    if (h->HashUsed >= h->HashSize >> 1) {
	      // the table is half full: rehash everything into a new
	      // table of roughly twice the size
	      novel_hash_rec *ht;
	      unsigned long size;
	      unsigned long i;
	      size = prime (h->HashSize * 2);
	      if (!(ht = (novel_hash_rec *)
		    Xmalloc (sizeof (novel_hash_rec) * size))) {
		Message ("Unable to allocate memory for table");
		return COMPERROR;
	      }
	      memset (ht, '\0', sizeof (novel_hash_rec) * size);

	      for (i = 0; i < h->HashSize; ++i)
		if (h->HashTable[i].word) {
		  register u_char *wptr;
		  register unsigned long hashval, step;

		  wptr = h->HashTable[i].word;
		  HASH (hashval, step, wptr, size);
		  wptr = (ht + hashval)->word;
		  while (wptr) {
		    hashval += step;
		    if (hashval >= size) hashval -= size;
		    wptr = (ht + hashval)->word;
		  }
		  ht[hashval] = h->HashTable[i];
		}
	      Xfree (h->HashTable);
	      h->HashTable = ht;
	      h->HashSize = size;
	    }
	  }
	  break;
	}
      }
    } else {
      // the word is in the dictionary: emit its Huffman code
      buffer.huff_encode (res, ht[which]->codes, ht[which]->hd->clens, NULL);
    }
  }

  return COMPALLOK;
}


int process_text_2 (const TagInfo &tagInfo, const TextElArray &doc) {
  // we need to remember where the last text element ends
  unsigned char endBit = textOutBuf.GetBtg ();  // 8 = high bit, 1 = low bit
  unsigned long endPos = textOutBuf.GetBytesWritten () + headLen;

  // process each text element
  TextElArray::const_iterator here = doc.begin();
  TextElArray::const_iterator end = doc.end();
  while (here != end) {
    // remember the current place in the compressed text
    unsigned char startWhich = whichWordType;
    unsigned char startBit = textOutBuf.GetBtg ();  // 8 = high bit, 1 = low bit
    unsigned long startPos = textOutBuf.GetBytesWritten () + headLen;

    // compress the text
    if (compress_text (textOutBuf, &(here->text[0]), (*here).text.size(),
		       whichWordType, cth.num_of_words) != COMPALLOK)
      return COMPERROR;

    endBit = textOutBuf.GetBtg ();  // 8 = high bit, 1 = low bit
    endPos = textOutBuf.GetBytesWritten () + headLen;

    // update the pointers if this is a level tag
    if (tagInfo.levelTags.find ((*here).tagName) != tagInfo.levelTags.end()) {
      if ((*here).elType == OpenTagE) {
	// set the start of the document
	textInfoMap[(*here).tagName].SetStart (startPos, startBit, startWhich);
      } else if ((*here).elType == CloseTagE) {
	// set the end of the document
	textInfoMap[(*here).tagName].SetEnd (endPos, endBit);
      } else {
	// TextE, nothing needs to be done
      }
    }

    cth.num_of_bytes += (*here).text.size();
    ++here;
  }

  // close off any unclosed tags
  CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
  CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
  while (tiHere != tiEnd) {
    if ((*tiHere).second.inDoc) (*tiHere).second.SetEnd (endPos, endBit);
    ++tiHere;
  }

  // we've processed one more document
  ++cth.num_of_docs;

  return COMPALLOK;
}
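/* A minimal sketch (assumed semantics of the HASH macro from hash.h) of the
 * open-addressing lookup used throughout this file: HASH yields an initial
 * slot and a step for double hashing, and probing wraps modulo the table
 * size until the matching word or an empty slot is found.  The helper name
 * nhr_lookup_sketch is hypothetical.
 */
#if 0
static novel_hash_rec *nhr_lookup_sketch (novel_hash_table *h, u_char *Word) {
  unsigned long hashval, step;
  HASH (hashval, step, Word, h->HashSize);
  for (;;) {
    novel_hash_rec *ent = h->HashTable + hashval;
    // words are length-prefixed, so compare *Word + 1 bytes in one go
    if (!ent->word || memcmp (ent->word, Word, *Word + 1) == 0)
      return ent;                          // empty slot or matching word
    hashval += step;                       // double-hash probe step
    if (hashval >= h->HashSize) hashval -= h->HashSize;
  }
}
#endif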
static int write_aux_dict (char *fileName) {
  int i;
  FILE *aux;
  if (!(aux = create_file (fileName, TEXT_DICT_AUX_SUFFIX, "wb",
			   MAGIC_AUX_DICT, MG_MESSAGE)))
    return COMPERROR;

  for (i = 0; i <= 1; ++i) {
    aux_frags_header afh;
    char_pool *cp;

    afh.num_frags = nht[i].HashUsed;
    afh.mem_for_frags = 0;
    for (cp = nht[i].first_pool; cp; cp = cp->next)
      afh.mem_for_frags += POOL_SIZE - cp->left;

    /* [RPAP - Jan 97: Endian Ordering] */
    HTONUL (afh.num_frags);
    HTONUL (afh.mem_for_frags);

    fwrite (&afh, sizeof (afh), 1, aux);

    for (cp = nht[i].first_pool; cp; cp = cp->next)
      fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
  }

  fclose (aux);
  return COMPALLOK;
}


static void estimate_compressed_aux_dict (void) {
  int i;
  u_long aux_compressed = 0, total_uncomp = 0;

  for (i = 0; i <= 1; ++i) {
    int j;
    long chars[256], fchars[256];
    long lens[16], flens[16];
    char_pool *cp;

    memset (chars, '\0', sizeof (chars));
    memset (lens, '\0', sizeof (lens));

    // gather character and length frequencies over the stored novel words
    for (cp = nht[i].first_pool; cp; cp = cp->next) {
      u_char *buf = cp->pool;
      while (buf != cp->ptr) {
	int len = *buf++;
	++lens[len];
	total_uncomp += len + 4;
	for (; len; --len)
	  ++chars[*buf++];
      }
    }

    // give unseen symbols a frequency of one so that every symbol
    // still gets a code when the Huffman sizes are calculated
    for (j = 0; j < 256; ++j)
      if (!chars[j] && PESINAWORD (j) == i)
	fchars[j] = 1;
      else
	fchars[j] = chars[j];
    for (j = 0; j < 16; ++j)
      if (!lens[j])
	flens[j] = 1;
      else
	flens[j] = lens[j];

    aux_compressed += (long unsigned int)
      ((Calculate_Huffman_Size (16, flens, lens) +
	Calculate_Huffman_Size (256, fchars, chars)) / 8);
  }
  // note: the computed estimates are not currently reported anywhere
}


static bool WriteTextIdx (char *fileName, FTextLevel &textLevels) {
  // create the text index
  FILE *textIdxFile;
  if (!(textIdxFile = create_file (fileName, TEXT_IDX_SUFFIX, "w+b",
				   MAGIC_TEXI, MG_MESSAGE)))
    return false;

  // output each level
  CompressTextInfoMap::iterator tiHere = textInfoMap.begin ();
  CompressTextInfoMap::iterator tiEnd = textInfoMap.end ();
  while (tiHere != tiEnd) {
    // remember the level information
    TextLevelInfo levelInfo;
    levelInfo.levelTag = (*tiHere).first;
    levelInfo.textIdxPtr = ftell (textIdxFile);
    levelInfo.numEntries = (*tiHere).second.docPtrs.size();
    textLevels.levelInfo[levelInfo.levelTag] = levelInfo;

    // write out the index array for this level
    if (!WriteTextIdxArray (textIdxFile, (*tiHere).second.docPtrs))
      return false;

    ++tiHere;
  }

  // close the file
  fclose (textIdxFile);

  return true;
}


bool WriteTextLevels (char *fileName, const FTextLevel &textLevels) {
  // cout << textLevels;

  // create the text level file
  FILE *textLevelFile;
  if (!(textLevelFile = create_file (fileName, TEXT_LEVEL_SUFFIX, "w+b",
				     MAGIC_TEXT_LEVELS, MG_MESSAGE)))
    return false;

  // output each level
  if (!textLevels.Write (textLevelFile)) return false;

  // close the file
  fclose (textLevelFile);

  return true;
}


int done_text_2 (const TagInfo &/*tagInfo*/, char *fileName) {
  // write out any remaining bits
  textOutBuf.encodeDone();

  // write out the compressed text header (with updated statistics)
  if (fseek (text, sizeof (u_long), SEEK_SET) == -1 || !cth.Write (text))
    return COMPERROR;
  fclose (text);

  // write out the text index file and get the level information
  FTextLevel textLevels;
  if (!WriteTextIdx (fileName, textLevels)) return COMPERROR;

  // write out the text level information
  if (!WriteTextLevels (fileName, textLevels)) return COMPERROR;

  // write out the auxiliary dictionary
  if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
      (cdh.novel_method == MG_NOVEL_DELTA ||
       cdh.novel_method == MG_NOVEL_HYBRID)) {
    if (write_aux_dict (fileName) == COMPERROR) return COMPERROR;
    estimate_compressed_aux_dict ();
  } else {
    unlink (make_name (fileName, TEXT_DICT_AUX_SUFFIX, NULL));
  }

  return COMPALLOK;
}
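/* For reference, a hedged sketch (hypothetical driver, not part of mg) of
 * the pass-2 calling sequence implemented in this file: initialise with the
 * dictionary built in pass 1, feed every document through the compressor,
 * then flush the header, text index, level information and auxiliary
 * dictionary.
 */
#if 0
static int run_pass2_sketch (const TagInfo &tagInfo, char *fileName,
			     const TextElArray *docs, int numDocs) {
  if (init_text_2 (tagInfo, fileName) != COMPALLOK) return COMPERROR;
  for (int d = 0; d < numDocs; ++d)
    if (process_text_2 (tagInfo, docs[d]) != COMPALLOK) return COMPERROR;
  return done_text_2 (tagInfo, fileName);
}
#endif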