/************************************************************************** * * text.h -- Header file for compression related stuff * Copyright (C) 1994 Neil Sharman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: text.h 856 2000-01-14 02:26:25Z sjboddie $ * **************************************************************************/ #ifndef H_TEXT #define H_TEXT #include "huffman.h" #include /***************************************************************************** * * There are several different methods of compressing the text of the * database. The following defines list the different methods of text * compression * */ /* The dictionary contains all the fragments that occur in the collection i.e. escapes are not possible */ #define MG_COMPLETE_DICTIONARY 0 /* Certain words have been deleted from the dictionary. The words deleted have been used to create the frequency huffman codes of the characters. This dictionary has an escape code and may be used to compress novel words. This dictionary may fail if there is a novel character. */ #define MG_PARTIAL_DICTIONARY 1 /* This dictionary has an escape so that novel words and non-words can be coded. The method for coding the novel words and non-words is determined by a dictionary parameter. */ #define MG_SEED_DICTIONARY 2 /***************************************************************************** * * With a seed dictionary there are several methods for coding the novel * words and non-words the following defined values specify the different * methods of coding. * */ /* Code novel words and non-words character by character using huffman codes. The huffman codes for the word and non-word lengths and characters are generated from the distribution of lengths and characters in the dictionary. */ #define MG_NOVEL_HUFFMAN_CHARS 0 /* This method codes novel words using delta codes. The novel words are stored in a auxillary dictionary which is built by pass two. */ #define MG_NOVEL_DELTA 2 /* This method codes novel words using hybrid version of delta. The novel words are stored in a auxillary dictionary which is built by pass two. */ #define MG_NOVEL_HYBRID 3 /* This specified an amount of extra space allocated in the compression_dict_header for adding new parameters. As new parameters are added this should be decreased. */ #define TEXT_PARAMS 15 struct compression_dict_header { u_long dict_type; u_long novel_method; u_long params[TEXT_PARAMS]; u_long num_words[2]; u_long num_word_chars[2]; u_long lookback; }; struct comp_frags_header { huff_data hd; u_long uncompressed_size; u_long huff_words_size[MAX_HUFFCODE_LEN + 1]; }; // BOGUSTEXTLEN is used to replace the ratio // in the compressed_text_header while preparing // for UCArray #define BOGUSTEXTLEN 1000000 struct compressed_text_header { u_long num_of_docs; u_long num_of_words; // number of words in collection double num_of_bytes; compressed_text_header (); void Clear (); // you must seek to the appropriate place before calling // Read or Write bool Read (FILE *f); bool Write (FILE *f) const; }; struct compression_stats_header { u_long num_docs; u_long dummy; // alignment double num_bytes; }; struct frags_stats_header { u_long num_frags; u_long mem_for_frags; }; struct aux_frags_header { u_long num_frags; u_long mem_for_frags; }; struct auxiliary_dict { aux_frags_header afh[2]; u_char *word_data[2]; u_char **words[2]; int blk_start[2][33], blk_end[2][33]; /* blk_start and blk_end are required for the hybrid methods */ auxiliary_dict (); ~auxiliary_dict (); void Clear (); }; struct compression_dict { compression_dict_header cdh; comp_frags_header *cfh[2]; unsigned long MemForCompDict; u_char ***values[2]; u_char *escape[2]; huff_data *chars_huff[2]; u_long **chars_vals[2]; huff_data *lens_huff[2]; u_long **lens_vals[2]; auxiliary_dict *ad; int fast_loaded; compression_dict (); ~compression_dict (); void Clear (); }; #endif