/**************************************************************************
 *
 * mg_compression_dict.c -- Routines for reading the compression dictionary
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_compression_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $
 *
 **************************************************************************/

#include "sysfuncs.h"

#include "memlib.h"
#include "messages.h"
#include "local_strings.h"
#include "bitio_gen.h"
#include "bitio_m.h"
#include "bitio_m_stdio.h"
#include "timing.h"
#include "mgheap.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */

#include "mg_files.h"
#include "locallib.h"
#include "invf.h"
#include "text.h"
#include "words.h"
#include "mg.h"

#define MAXBITS (sizeof(unsigned long) * 8)

/*
   $Log$
   Revision 1.1  2003/02/20 21:18:24  mdewsnip
   Addition of MG package for search and retrieval

   Revision 1.2  1999/12/10 15:04:25  cs025
   Incorrect use of NTOHUL on a double value (num_bytes).

   Revision 1.1  1999/08/10 21:18:04  sjboddie
   renamed mg-1.3d directory mg

   Revision 1.4  1999/01/15 03:05:51  rjmcnab

   Renamed lib/heap to be lib/mgheap (it was causing some problems with
   some versions of the STL libraries which contained a heap.h)

   Revision 1.3  1998/12/17 09:12:52  rjmcnab

   Altered mg to process utf-8 encoded Unicode. The main changes
   are in the parsing of the input, the casefolding, and the stemming.

   Revision 1.2  1998/11/25 07:55:44  rjmcnab

   Modified mg to that you can specify the stemmer you want
   to use via a command line option. You specify it to
   mg_passes during the build process. The number of the
   stemmer that you used is stored within the inverted
   dictionary header and the stemmed dictionary header so
   the correct stemmer is used in later stages of building
   and querying.

   Revision 1.1  1998/11/17 09:34:52  rjmcnab
   *** empty log message ***

   * Revision 1.3  1994/10/20  03:56:54  tes
   * I have rewritten the boolean query optimiser and abstracted out the
   * components of the boolean query.
   *
   * Revision 1.2  1994/09/20  04:41:45  tes
   * For version 1.1
   *
 */

static char *RCSID = "$Id: mg_compression_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $";

/* #define DEBUG1 */

/* #define DEBUG */

#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))

#define MAX_RECALCULATIONS 100

typedef struct WordData
  {
    u_long freq, occur_num;
    float saving;
    float char_bit_cost;
    u_long num_trans;
    u_char *word;
  }
WordData;

static WordData *Words[2];
static u_long Num[2];
static u_long chars[2];

#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)


typedef struct DictData
  {
    WordData **wd;
    u_long num_wds;
    u_long chars;
  }
DictData;

typedef DictData DictInfo[2];

static DictInfo keep, discard, all;
static compression_stats_header csh;

static char *file_name = "";
static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;


static void ReadInWords (char *);
static void Select_on (int k, heap_comp hc);
static void Method3 (int k);
static void Select_all (void);
static u_long WriteOutWords (char *, u_long, int);
static int DecFreqIncWL (void *a, void *b);
static int OccuranceOrder (void *a, void *b);


int main (int argc, char **argv)
{
  ProgTime StartTime;
  int ch;
  char type = 'C';
  char mode = '0';
  int lookback = 2;
  double k = 0;
  u_long mem_reqd;
  opterr = 0;
  msg_prefix = argv[0];
  while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HBDYM")) != -1)
    switch (ch)
      {
      case 'H':
	novel_method = MG_NOVEL_HUFFMAN_CHARS;
	break;
      case 'B':
	novel_method = MG_NOVEL_BINARY;
	break;
      case 'D':
	novel_method = MG_NOVEL_DELTA;
	break;
      case 'Y':
	novel_method = MG_NOVEL_HYBRID;
	break;
      case 'M':
	novel_method = MG_NOVEL_HYBRID_MTF;
	break;
      case 'f':		/* input file */
	file_name = optarg;
	break;
      case 'd':
	set_basepath (optarg);
	break;
      case 'C':
      case 'P':
      case 'S':
	type = ch;
	break;
      case '0':
      case '1':
      case '2':
      case '3':
	mode = ch;
	break;
      case 'l':
	lookback = atoi (optarg);
	if (!is_power_of_two (lookback))
	  FatalError (1, "The lookback value must be a power of 2");
	lookback = floorlog_2 (lookback);
	break;
      case 'k':
	k = atof (optarg) * 1024;
	break;
      case 'h':
      case '?':
	fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
	"[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
		 argv[0]);
	exit (1);
      }
  GetTime (&StartTime);

  ReadInWords (file_name);

  if (type == 'C')
    {
      Select_all ();
      mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
    }
  else
    {
      switch (mode)
	{
	case '0':
	  Select_all ();
	  break;
	case '1':
	  Message ("Dictionary limit of %.2f Kb", k / 1024);
	  Select_on ((int) k, OccuranceOrder);
	  break;
	case '2':
	  Message ("Dictionary limit of %.2f Kb", k / 1024);
	  Select_on ((int) k, DecFreqIncWL);
	  break;
	case '3':
	  Message ("Dictionary limit of %.2f Kb", k / 1024);
	  Method3 ((int) k);
	  break;
	}
      if (type == 'P')
	{
	  mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
	}
      else
	{
	  mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
	}
    }

  Message ("Num words           : %8u -> %8u\n", Num[1], keep[1].num_wds);
  Message ("Num non-words       : %8u -> %8u\n", Num[0], keep[0].num_wds);
  Message ("Chars of words      : %8u -> %8u\n", chars[1], keep[1].chars);
  Message ("Chars of non-words  : %8u -> %8u\n", chars[0], keep[0].chars);
  Message ("Mem usage           : %8u -> %8u\n",
	   (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
	     (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
	   keep[0].chars + keep[1].chars);
  Message ("Actual mem required : %8u\n", mem_reqd);

  Message ("%s", ElapsedTime (&StartTime, NULL));
  return 0;
}


static void 
ReadInWords (char *filename)
{
  FILE *f;
  unsigned long i;
  f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
		 MAGIC_STATS_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  fread (&csh, sizeof (csh), 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL(csh.num_docs);
  NTOHD(csh.num_bytes);

  for (i = 0; i < 2; i++)
    {
      frags_stats_header fsh;
      char *buf;
      WordData *wd;
      int j;
      chars[i] = 0;

      fread (&fsh, sizeof (fsh), 1, f);
      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(fsh.num_frags);
      NTOHUL(fsh.mem_for_frags);

      Num[i] = all[i].num_wds = fsh.num_frags;

      /* The +1 on the following line is to leave room for the esc code. */
      all[i].wd = Xmalloc (sizeof (WordData *) * Num[i] + 1);

      buf = Xmalloc (fsh.mem_for_frags);
      wd = Words[i] = Xmalloc (sizeof (WordData) * Num[i]);
      for (j = 0; j < Num[i]; j++, wd++)
	{
	  int len;
	  fread (&wd->freq, sizeof (wd->freq), 1, f);
	  NTOHUL(wd->freq);  /* [RPAP - Jan 97: Endian Ordering] */
	  fread (&wd->occur_num, sizeof (wd->occur_num), 1, f);
	  NTOHUL(wd->occur_num);  /* [RPAP - Jan 97: Endian Ordering] */
	  len = fgetc (f);
	  wd->word = (u_char *) buf;
	  *buf++ = len;
	  fread (buf, len, 1, f);
	  buf += len;
	  all[i].wd[j] = wd;
	}
      chars[i] = fsh.mem_for_frags - fsh.num_frags;
    }
  fclose (f);
}


static void 
Alloc_keep_discard (void)
{
  keep[0].num_wds = 0;
  keep[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
  keep[1].num_wds = 0;
  keep[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
  discard[0].num_wds = 0;
  discard[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
  discard[1].num_wds = 0;
  discard[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
}


static int 
sort_comp (const void *a, const void *b)
{
  WordData *aa = *(WordData **) a;
  WordData *bb = *(WordData **) b;
  return casecompare (aa->word, bb->word);  /* [RPAP - Jan 97: Stem Index Change] */
}


static void 
SortAndCount_DictData (DictData * dd)
{
  int i;
  WordData **wd;
  qsort (dd->wd, dd->num_wds, sizeof (WordData *), sort_comp);
  dd->chars = 0;
  wd = dd->wd;
  for (i = 0; i < dd->num_wds; i++, wd++)
    dd->chars += (*wd)->word[0];
}


static void 
Select_all (void)
{
  int i;
  Alloc_keep_discard ();
  keep[0].num_wds = Num[0];
  for (i = 0; i < Num[0]; i++)
    keep[0].wd[i] = Words[0] + i;
  keep[1].num_wds = Num[1];
  for (i = 0; i < Num[1]; i++)
    keep[1].wd[i] = Words[1] + i;
  SortAndCount_DictData (&keep[0]);
  SortAndCount_DictData (&keep[1]);
}


static int 
DecFreqIncWL (void *a, void *b)
{
  WordData *aa = *(WordData **) a;
  WordData *bb = *(WordData **) b;
  if (aa->freq > bb->freq)
    return -1;
  else if (aa->freq < bb->freq)
    return 1;
  else
    return bb->word[0] - aa->word[0];
}


static int 
OccuranceOrder (void *a, void *b)
{
  WordData *aa = *(WordData **) a;
  WordData *bb = *(WordData **) b;
  if (aa->occur_num > bb->occur_num)
    return 1;
  else if (aa->occur_num < bb->occur_num)
    return -1;
  else
    return 0;
}


static void 
Select_on (int k, heap_comp hc)
{
  int i, num, mem;
  WordData **wd;

  Alloc_keep_discard ();

  num = Num[0] + Num[1];
  wd = Xmalloc (num * sizeof (WordData *));
  for (i = 0; i < Num[0]; i++)
    wd[i] = Words[0] + i;
  for (i = 0; i < Num[1]; i++)
    wd[i + Num[0]] = Words[1] + i;

  heap_build (wd, sizeof (*wd), num, hc);

  mem = 0;
  while (k > mem && num)
    {
      int idx;
      WordData *word = wd[0];
#ifdef DEBUG
      fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
	       keep[0].num_wds, keep[1].num_wds,
	       mem, word->freq, word->occur_num, word2str (word->word));
#endif
      mem += sizeof (u_char *) + word->word[0];
      heap_deletehead (wd, sizeof (*wd), &num, hc);
      idx = KIND (word);
      keep[idx].wd[keep[idx].num_wds++] = word;
    }

  for (i = 0; i < num; i++)
    {
      WordData *word = wd[i];
      int idx = KIND (word);
      discard[idx].wd[discard[idx].num_wds++] = word;
    }
  SortAndCount_DictData (&keep[0]);
  SortAndCount_DictData (&keep[1]);
  SortAndCount_DictData (&discard[0]);
  SortAndCount_DictData (&discard[1]);

  assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
  assert (keep[1].num_wds + discard[1].num_wds == Num[1]);

}


static int 
BigSaving (void *a, void *b)
{
  WordData *aa = *(WordData **) a;
  WordData *bb = *(WordData **) b;
  return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
}

static int 
SmallSaving (void *a, void *b)
{
  WordData *aa = *(WordData **) a;
  WordData *bb = *(WordData **) b;
  return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
}


static void 
CalcCharCounts (WordData ** wd, int num,
		char *char_lens[2], char *len_lens[2],
		u_long escape[2])
{
  long char_freqs[2][256];
  long len_freqs[2][16];
  huff_data hd;
  int i;
  escape[0] = 0;
  escape[1] = 0;
  bzero ((char *) char_freqs, sizeof (char_freqs));
  bzero ((char *) len_freqs, sizeof (len_freqs));
  for (i = 0; i < num; i++, wd++)
    {
      u_long freq = (*wd)->freq;
      u_char *buf = (*wd)->word;
      int len = *buf++;
      int idx = KIND (*wd);
      len_freqs[idx][len] += freq;
      escape[idx] += freq;
      for (; len; len--, buf++)
	char_freqs[idx][(u_long) (*buf)] += freq;
    }
  Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
  char_lens[0] = hd.clens;
  Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
  char_lens[1] = hd.clens;

  Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
  len_lens[0] = hd.clens;
  Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
  len_lens[1] = hd.clens;
}


void 
CalcBitCost (WordData ** discard_word, int discard_num,
	     WordData ** keep_word, int keep_num, double freqs_trans[2],
	     u_long escape[2], int num_trans)
{
  char *char_lens[2];
  char *len_lens[2];
  double esc[2];
  int j;
  CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
  esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
  esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
  for (j = 0; j < discard_num; j++, discard_word++)
    {
      float cbc, wbc;
      u_char *buf = (*discard_word)->word;
      int len = *buf++;
      u_long freq = (*discard_word)->freq;
      int idx = KIND (*discard_word);

      cbc = len_lens[idx][len];
      for (; len; len--, buf++)
	cbc += char_lens[idx][(u_long) (*buf)];

      (*discard_word)->char_bit_cost = (cbc + esc[idx]) * freq;

      wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;

      (*discard_word)->saving = ((*discard_word)->char_bit_cost - wbc) /
	(sizeof (char *) + (*discard_word)->word[0]);

      (*discard_word)->num_trans = num_trans;
    }
  for (j = 0; j < keep_num; j++, keep_word++)
    {
      float cbc, wbc;
      u_char *buf = (*keep_word)->word;
      int len = *buf++;
      u_long freq = (*keep_word)->freq;
      int idx = KIND (*keep_word);

      cbc = len_lens[idx][len];
      for (; len; len--, buf++)
	cbc += char_lens[idx][(u_long) (*buf)];

      (*keep_word)->char_bit_cost = (cbc + esc[idx]) * freq;

      wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;

      (*keep_word)->saving = ((*keep_word)->char_bit_cost - wbc) /
	(sizeof (char *) + (*keep_word)->word[0]);

      (*keep_word)->num_trans = num_trans;
    }
  Xfree (char_lens[0]);
  Xfree (char_lens[1]);
  Xfree (len_lens[0]);
  Xfree (len_lens[1]);
}


static void 
Method3 (int k)
{
  int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
  int keep_to_discard = 0;
  int discard_to_keep = 0;
  int recalcs = 0;
  double freqs_trans[2], total;
  u_long escape[2];
  WordData **keep_heap, **discard_heap;

  freqs_trans[0] = freqs_trans[1] = 0;
  num_trans = 0;

  discard_num = Num[0] + Num[1];
  discard_heap = Xmalloc (discard_num * sizeof (WordData *));

  keep_num = 0;
  keep_heap = Xmalloc (discard_num * sizeof (WordData *));


  for (i = 0; i < Num[0]; i++)
    discard_heap[i] = Words[0] + i;
  for (i = 0; i < Num[1]; i++)
    discard_heap[i + Num[0]] = Words[1] + i;

  heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);

  mem = 0;
  while (k > mem && discard_num)
    {
      WordData *word = discard_heap[0];
      mem += sizeof (u_char *) + word->word[0];
      heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
      keep_heap[keep_num++] = word;
      freqs_trans[KIND (word)] += word->freq;
      num_trans++;
    }

  CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
	       freqs_trans, escape, num_trans);
  heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
  heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);

  total = 0;
  recalc_reqd = 0;
  while (keep_num && discard_num)
    {
      if ((keep_num && keep_heap[0]->num_trans < num_trans) ||
	  (discard_num && discard_heap[0]->num_trans < num_trans))
	{
	  if (keep_num && keep_heap[0]->num_trans < num_trans)
	    {
	      WordData *word = keep_heap[0];
	      int idx = KIND (word);
	      float wbc;
#ifdef DEBUG1
	      fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
		       word->saving);
#endif
	      wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
		word->freq;
	      word->saving = (word->char_bit_cost - wbc) /
		(sizeof (char *) + word->word[0]);
#ifdef DEBUG1
	      fprintf (stderr, " %.2f\n", word->saving);
#endif
	      word->num_trans = num_trans;
	      heap_changedhead (keep_heap, sizeof (word), keep_num, SmallSaving);
	    }

	  if (discard_num && discard_heap[0]->num_trans < num_trans)
	    {
	      WordData *word = discard_heap[0];
	      int idx = KIND (word);
	      float wbc;
#ifdef DEBUG1
	      fprintf (stderr, "DISCARD \"%s\" %.2f ->", word2str (word->word),
		       word->saving);
#endif
	      wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
		word->freq;
	      word->saving = (word->char_bit_cost - wbc) /
		(sizeof (char *) + word->word[0]);
#ifdef DEBUG1
	      fprintf (stderr, " %.2f\n", word->saving);
#endif
	      word->num_trans = num_trans;
	      heap_changedhead (discard_heap, sizeof (word),
				discard_num, BigSaving);
	    }
	}
      else if (keep_heap[0]->saving < discard_heap[0]->saving)
	{
	  assert (keep_num && discard_num);
	  if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > k)
	    {
	      /* Transfer the word at the top of the keep heap to the
	         discard heap. */
	      WordData *word = keep_heap[0];
	      int idx = KIND (word);
	      heap_deletehead (keep_heap, sizeof (word), &keep_num, SmallSaving);
	      discard_heap[discard_num] = word;
	      heap_additem (discard_heap, sizeof (word), &discard_num,
			    BigSaving);
	      freqs_trans[idx] -= word->freq;
	      mem -= sizeof (u_char *) + word->word[0];
	      num_trans++;
	      total += word->saving;
	      keep_to_discard++;
#ifdef DEBUG
	      fprintf (stderr,
		       "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
		       mem, word->freq, word->char_bit_cost,
		       word->saving, word2str (word->word));
#endif
	    }
	  else
	    {
	      /* Transfer the word at the top of the discard heap to the
	         keep heap. */
	      WordData *word = discard_heap[0];
	      int idx = KIND (word);
	      heap_deletehead (discard_heap, sizeof (word),
			       &discard_num, BigSaving);
	      keep_heap[keep_num] = word;
	      heap_additem (keep_heap, sizeof (word), &keep_num, SmallSaving);
	      freqs_trans[idx] += word->freq;
	      mem += sizeof (u_char *) + word->word[0];
	      num_trans++;
	      total += word->saving;
	      discard_to_keep++;
#ifdef DEBUG
	      fprintf (stderr,
		       "DISCARD -> KEEP %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
		       mem, word->freq, word->char_bit_cost,
		       word->saving, word2str (word->word));
#endif
	    }

	  recalc_reqd = 1;

	}
      else
	{
	  if (recalc_reqd == 0)
	    break;
#ifdef DEBUG1
	  fprintf (stderr, "--------------\n");
#endif
	  if (recalcs == MAX_RECALCULATIONS)
	    break;
	  CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
		       freqs_trans, escape, num_trans);
	  heap_build (discard_heap, sizeof (*discard_heap),
		      discard_num, BigSaving);
	  heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
	  recalc_reqd = 0;
	  recalcs++;
	}
    }

  Alloc_keep_discard ();

  for (i = 0; i < discard_num; i++)
    {
      WordData *word = discard_heap[i];
      int idx = KIND (word);
      assert (IsWord (word) || IsNonWord (word));
      discard[idx].wd[discard[idx].num_wds++] = word;
    }
  for (i = 0; i < keep_num; i++)
    {
      WordData *word = keep_heap[i];
      int idx = KIND (word);
      assert (IsWord (word) || IsNonWord (word));
      keep[idx].wd[keep[idx].num_wds++] = word;
    }
  SortAndCount_DictData (&keep[0]);
  SortAndCount_DictData (&keep[1]);
  SortAndCount_DictData (&discard[0]);
  SortAndCount_DictData (&discard[1]);

  assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
  assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
  Message ("Keep -> Discard        : %8d", keep_to_discard);
  Message ("Discard -> Keep        : %8d", discard_to_keep);
  Message ("Huffman Recalculations : %8d", recalcs);
  if (recalcs == MAX_RECALCULATIONS)
    Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
}


/****************************************************************************
 *****									*****
 *****			Dictionary saving code				*****
 *****									*****
 ****************************************************************************/


static void 
Write_cdh (FILE * f, compression_dict_header * cdh)
{
  /* [RPAP - Jan 97: Endian Ordering] */
  int i;
  compression_dict_header tmp = *cdh;
  HTONUL(tmp.dict_type);
  HTONUL(tmp.novel_method);
  for (i = 0; i < TEXT_PARAMS; i++)
    HTONUL(tmp.params[i]);
  HTONUL(tmp.num_words[0]);
  HTONUL(tmp.num_words[1]);
  HTONUL(tmp.num_word_chars[0]);
  HTONUL(tmp.num_word_chars[1]);
  HTONUL(tmp.lookback);

  fwrite (&tmp, sizeof (tmp), 1, f);
}


static void 
Write_words (FILE * f, DictData * dd)
{
  int i;
  u_char *curr, *prev = NULL;
  for (i = 0; i < dd->num_wds; i++)
    {
      int len;
      curr = dd->wd[i]->word;
      if (prev)
	/* look for prefix match with prev string */
	len = prefixlen (prev, curr);
      else
	len = 0;
      fputc ((len << 4) + (curr[0] - len), f);
      fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
      prev = curr;
    }

}


static int 
Uncompressed_size (DictData * dd)
{
  int i, us;
  for (us = i = 0; i < dd->num_wds; i++)
    us += dd->wd[i]->word[0];
  return us;
}


static u_long 
Write_data (FILE * f, DictData * dd, int lookback)
{
  u_long mem_reqd;
  huff_data *hd;
  int i; 
  u_long us = dd->num_wds;
  long *freqs;
  u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
  u_long lencounts[MAX_HUFFCODE_LEN + 1];
  u_char *lastword[MAX_HUFFCODE_LEN + 1];

  if (!(freqs = Xmalloc ((dd->num_wds) * sizeof (*freqs))))
    FatalError (1, "Unable to allocate memory for freqs");

  for (i = 0; i < dd->num_wds; i++)
    {
      freqs[i] = dd->wd[i]->freq;
      us += dd->wd[i]->word[0];
    }

  if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
    FatalError (1, "Unable to allocate memory for huffman data");

  Xfree (freqs), freqs = NULL;

  if (Write_Huffman_Data (f, hd) == -1)
    FatalError (1, "Unable to write huffman data");

  HTONUL(us);  /* [RPAP - Jan 97: Endian Ordering] */
  fwrite (&us, sizeof (us), 1, f);
  NTOHUL(us);  /* [RPAP - Jan 97: Endian Ordering] */


/* Calculate the amount of memory that will be required to store the text for
   each different huffman code len. Every 1<<lookback words for each different
   codelen length will not be prefixed by previous strings. */


  bzero ((char *) &huff_words_size, sizeof (huff_words_size));
  bzero ((char *) &lencounts, sizeof (lencounts));

  mem_reqd = 0;

  for (i = 0; i < dd->num_wds; i++)
    {
      int codelen = hd->clens[i];
      u_char *word = dd->wd[i]->word;

      if (!codelen)
	FatalError (1, "The length of a code for a word was zero");

      huff_words_size[codelen] += word[0] + 1;
      mem_reqd += word[0] + (lookback != 0);
#if 0
      if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
	lastword[codelen] = word;
      else
	huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
#else
      if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
	{
	  int save = prefixlen (lastword[codelen], word);
	  mem_reqd -= save;
	  huff_words_size[codelen] -= save;
	}
      else
	{
	  mem_reqd += sizeof (u_char *);
	}
      lastword[codelen] = word;
#endif
      lencounts[codelen]++;
    }

  /* [RPAP - Jan 97: Endian Ordering] */
  for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
    HTONUL(huff_words_size[i]);

  fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
	  hd->maxcodelen - hd->mincodelen + 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
    NTOHUL(huff_words_size[i]);

  Write_words (f, dd);

  Xfree (hd->clens);
  Xfree (hd);

  return mem_reqd;
}


static void 
Write_charfreqs (FILE * f, DictData * dd, int words,
		 int zero_freq_permitted)
{
  int j;
  long freqs[256];
  WordData **wd = dd->wd;
  huff_data *hd;

  bzero ((char *) freqs, sizeof (freqs));

  for (j = 0; j < dd->num_wds; j++, wd++)
    {
      u_char *buf = (*wd)->word;
      int len = *buf++;
      for (; len; len--, buf++)
	freqs[(u_long) (*buf)] += (*wd)->freq;
    }

  if (!zero_freq_permitted)
    for (j = 0; j < 256; j++)
      if (!freqs[j] && PESINAWORD (j) == words)
	freqs[j] = 1;

  if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
    FatalError (1, "Unable to allocate memory for huffman data");

  if (Write_Huffman_Data (f, hd) == -1)
    FatalError (1, "Unable to write huffman data");

  Xfree (hd->clens);
  Xfree (hd);
}


static void 
Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
{
  int j;
  long freqs[16];
  WordData **wd = dd->wd;
  huff_data *hd;

  bzero ((char *) freqs, sizeof (freqs));

  for (j = 0; j < dd->num_wds; j++, wd++)
    freqs[(*wd)->word[0]] += (*wd)->freq;

  if (!zero_freq_permitted)
    for (j = 0; j < 16; j++)
      if (!freqs[j])
	freqs[j] = 1;

  if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
    FatalError (1, "Unable to allocate memory for huffman data");

  if (Write_Huffman_Data (f, hd) == -1)
    FatalError (1, "Unable to write huffman data");


  Xfree (hd->clens);
  Xfree (hd);
}


static u_long 
WriteOutWords (char *file_name, u_long type, int lookback)
{
  FILE *f;
  int i;
  u_long mem_reqd = 0;

  compression_dict_header cdh;
  f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
		   MAGIC_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  bzero((char *) &cdh, sizeof(cdh));

  cdh.dict_type = type;
  cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
    MG_NOVEL_HUFFMAN_CHARS;

  cdh.num_words[1] = keep[1].num_wds;
  cdh.num_words[0] = keep[0].num_wds;
  cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
  cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
  cdh.lookback = lookback;

  Write_cdh (f, &cdh);

  for (i = 0; i < 2; i++)
    switch (type)
      {
      case MG_COMPLETE_DICTIONARY:
	{
	  mem_reqd += Write_data (f, &keep[i], lookback);
	}
	break;
      case MG_PARTIAL_DICTIONARY:
	{
	  if (keep[i].num_wds)
	    {
	      int j;
	      WordData esc;
	      esc.freq = 0;
	      esc.word = (u_char *) "";
	      keep[i].wd[keep[i].num_wds++] = &esc;
	      for (j = 0; j < discard[i].num_wds; j++)
		esc.freq += discard[i].wd[j]->freq;
	      if (!esc.freq)
		esc.freq++;
	      mem_reqd += Write_data (f, &keep[i], lookback);
	    }
	  Write_charfreqs (f, &discard[i], i, 1);
	  Write_wordlens (f, &discard[i], 1);
	}
	break;
      case MG_SEED_DICTIONARY:
	{
	  if (keep[i].num_wds)
	    {
	      int j;
	      WordData esc;
	      esc.freq = 0;
	      esc.word = (u_char *) "";
	      keep[i].wd[keep[i].num_wds++] = &esc;
	      for (j = 0; j < all[i].num_wds; j++)
		if (all[i].wd[j]->freq == 1)
		  esc.freq++;
	      if (!esc.freq)
		esc.freq++;
	      mem_reqd += Write_data (f, &keep[i], lookback);
	    }
	  switch (novel_method)
	    {
	    case MG_NOVEL_HUFFMAN_CHARS:
	      Write_charfreqs (f, &all[i], i, 0);
	      Write_wordlens (f, &all[i], 0);
	      break;
	    case MG_NOVEL_BINARY:
	      break;
	    case MG_NOVEL_DELTA:
	      break;
	    case MG_NOVEL_HYBRID:
	      break;
	    case MG_NOVEL_HYBRID_MTF:
	      break;
	    default:
	      FatalError (1, "Bad novel method");
	    }
	}
	break;
      }
  fclose (f);
  return mem_reqd;
}