/**************************************************************************
 *
 * mg_compression_dict.cpp -- Routines for creating compression dictionary
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_compression_dict.cpp 856 2000-01-14 02:26:25Z sjboddie $
 *
 **************************************************************************/

#include "sysfuncs.h"

#include "memlib.h"
#include "messages.h"
#include "local_strings.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"
#include "mgheap.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */

#include "mg_files.h"
#include "locallib.h"
#include "invf.h"
#include "text.h"
#include "words.h"
#include "mg.h"
#include "WordData.h"

#define MAXBITS (sizeof(unsigned long) * 8)

/*
   $Log$
   Revision 1.1  2000/01/14 02:26:11  sjboddie
   Rodgers new C++ mg

   Revision 1.3  1999/10/17 23:43:25  cs025
   Changes to eradicate Xmalloc

   Revision 1.2  1999/10/11 22:03:03  cs025
   Changed mistaken use of NTOHUL on a double to NTOHD as per report
   from Craig Neville-Manning of problems installing mg/gsdl.

   Revision 1.1  1999/10/11 02:57:46  cs025
   Base install of MG-PP

   Revision 1.1  1999/08/10 21:18:04  sjboddie
   renamed mg-1.3d directory mg

   Revision 1.4  1999/01/15 03:05:51  rjmcnab

   Renamed lib/heap to be lib/mgheap (it was causing some problems with
   some versions of the STL libraries which contained a heap.h)

   Revision 1.3  1998/12/17 09:12:52  rjmcnab

   Altered mg to process utf-8 encoded Unicode. The main changes
   are in the parsing of the input, the casefolding, and the stemming.

   Revision 1.2  1998/11/25 07:55:44  rjmcnab

   Modified mg to that you can specify the stemmer you want
   to use via a command line option. You specify it to
   mg_passes during the build process. The number of the
   stemmer that you used is stored within the inverted
   dictionary header and the stemmed dictionary header so
   the correct stemmer is used in later stages of building
   and querying.

   Revision 1.1  1998/11/17 09:34:52  rjmcnab
   *** empty log message ***

   * Revision 1.3  1994/10/20  03:56:54  tes
   * I have rewritten the boolean query optimiser and abstracted out the
   * components of the boolean query.
   *
   * Revision 1.2  1994/09/20  04:41:45  tes
   * For version 1.1
   *
 */

/* #define DEBUG1 */

/* #define DEBUG */

#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))

#define MAX_RECALCULATIONS 100

typedef struct DictWordData : public WordData
  {
    float saving;
    float char_bit_cost;
    u_long num_trans;
    u_char *word;
  }
DictWordData;

static DictWordData *Words[2];
static u_long Num[2];
static u_long chars[2];

#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)


typedef struct DictData
  {
    DictWordData **wd;
    u_long num_wds;
    u_long chars;
  }
DictData;

typedef DictData DictInfo[2];

static DictInfo keep, discard, all;
static compression_stats_header csh;

static char *file_name = "";
static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;


static void ReadInWords (char *);
static void Select_on (int k, heap_comp hc);
static void Method3 (int k);
static void Select_all (void);
static u_long WriteOutWords (char *, u_long, int);
static int DecFreqIncWL (void *a, void *b);
static int OccuranceOrder (void *a, void *b);


int main (int argc, char **argv)
{
  int ch;
  char type = 'C';
  char mode = '0';
  int lookback = 2;
  double k = 0;
  u_long mem_reqd;
  opterr = 0;
  msg_prefix = argv[0];
  while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HDY")) != -1)
    switch (ch)
      {
      case 'H':
	novel_method = MG_NOVEL_HUFFMAN_CHARS;
	break;
      case 'D':
	novel_method = MG_NOVEL_DELTA;
	break;
      case 'Y':
	novel_method = MG_NOVEL_HYBRID;
	break;
      case 'f':		/* input file */
	file_name = optarg;
	break;
      case 'd':
	set_basepath (optarg);
	break;
      case 'C':
      case 'P':
      case 'S':
	type = ch;
	break;
      case '0':
      case '1':
      case '2':
      case '3':
	mode = ch;
	break;
      case 'l':
	lookback = atoi (optarg);
	if (!is_power_of_two (lookback))
	  FatalError (1, "The lookback value must be a power of 2");
	lookback = floorlog_2 (lookback);
	break;
      case 'k':
	k = atof (optarg) * 1024;
	break;
      case 'h':
      case '?':
	fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
	"[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
		 argv[0]);
	exit (1);
      }

  ReadInWords (file_name);

  if (type == 'C')
    {
      Select_all ();
      mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
    }
  else
    {
      switch (mode)
	{
	case '0':
	  Select_all ();
	  break;
	case '1':
	  Message ("Dictionary limit of %.2f Kb", k / 1024);
	  Select_on ((int) k, OccuranceOrder);
	  break;
	case '2':
	  Message ("Dictionary limit of %.2f Kb", k / 1024);
	  Select_on ((int) k, DecFreqIncWL);
	  break;
	case '3':
	  Message ("Dictionary limit of %.2f Kb", k / 1024);
	  Method3 ((int) k);
	  break;
	}
      if (type == 'P')
	{
	  mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
	}
      else
	{
	  mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
	}
    }

  Message ("Num words           : %8u -> %8u\n", Num[1], keep[1].num_wds);
  Message ("Num non-words       : %8u -> %8u\n", Num[0], keep[0].num_wds);
  Message ("Chars of words      : %8u -> %8u\n", chars[1], keep[1].chars);
  Message ("Chars of non-words  : %8u -> %8u\n", chars[0], keep[0].chars);
  Message ("Mem usage           : %8u -> %8u\n",
	   (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
	     (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
	   keep[0].chars + keep[1].chars);
  Message ("Actual mem required : %8u\n", mem_reqd);

  exit (0);
}


static void 
ReadInWords (char *filename)
{
  FILE *f;
  unsigned long i;
  f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
		 MAGIC_STATS_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  fread (&csh, sizeof (csh), 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL(csh.num_docs);
  NTOHD(csh.num_bytes);

  for (i = 0; i < 2; i++)
    {
      frags_stats_header fsh;
      char *buf;
      DictWordData *wd;
      chars[i] = 0;

      fread (&fsh, sizeof (fsh), 1, f);
      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(fsh.num_frags);
      NTOHUL(fsh.mem_for_frags);

      Num[i] = all[i].num_wds = fsh.num_frags;

      /* The +1 on the following line is to leave room for the esc code. */
      all[i].wd = (DictWordData **) Xmalloc (sizeof (DictWordData *) * Num[i] + 1);

      buf = (char *) Xmalloc (fsh.mem_for_frags);
      wd = Words[i] = (DictWordData *) Xmalloc (sizeof (DictWordData) * Num[i]);
      unsigned int j;
      for (j = 0; j < Num[i]; j++, wd++)
	{
	  int len;

	  // read docCount and wordCount
	  wd->read(f);

	  // read an mgString
	  len = fgetc (f);
	  wd->word = (u_char *) buf;
	  *buf++ = len;
	  fread (buf, len, 1, f);
	  buf += len;
	  all[i].wd[j] = wd;
	}
      chars[i] = fsh.mem_for_frags - fsh.num_frags;
    }
  fclose (f);
}


static void 
Alloc_keep_discard (void)
{
  keep[0].num_wds = 0;
  keep[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
  keep[1].num_wds = 0;
  keep[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
  discard[0].num_wds = 0;
  discard[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
  discard[1].num_wds = 0;
  discard[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
}


static int 
sort_comp (const void *a, const void *b)
{
  DictWordData *aa = *(DictWordData **) a;
  DictWordData *bb = *(DictWordData **) b;
  return casecompare (aa->word, bb->word);  /* [RPAP - Jan 97: Stem Index Change] */
}


static void 
SortAndCount_DictData (DictData * dd)
{
  unsigned int i;
  DictWordData **wd;
  qsort (dd->wd, dd->num_wds, sizeof (DictWordData *), sort_comp);
  dd->chars = 0;
  wd = dd->wd;
  for (i = 0; i < dd->num_wds; i++, wd++)
    dd->chars += (*wd)->word[0];
}


static void 
Select_all (void)
{
  unsigned int i;
  Alloc_keep_discard ();
  keep[0].num_wds = Num[0];
  for (i = 0; i < Num[0]; i++)
    keep[0].wd[i] = Words[0] + i;
  keep[1].num_wds = Num[1];
  for (i = 0; i < Num[1]; i++)
    keep[1].wd[i] = Words[1] + i;
  SortAndCount_DictData (&keep[0]);
  SortAndCount_DictData (&keep[1]);
}


static int 
DecFreqIncWL (void *a, void *b)
{
  DictWordData *aa = *(DictWordData **) a;
  DictWordData *bb = *(DictWordData **) b;
  if (aa->docCount > bb->docCount)
    return -1;
  else if (aa->docCount < bb->docCount)
    return 1;
  else
    return bb->word[0] - aa->word[0];
}


static int 
OccuranceOrder (void *a, void *b)
{
  DictWordData *aa = *(DictWordData **) a;
  DictWordData *bb = *(DictWordData **) b;
  if (aa->words() > bb->words())
    return 1;
  else if (aa->words() < bb->words())
    return -1;
  else
    return 0;
}


static void 
Select_on (int k, heap_comp hc)
{
  int i, num, mem;
  DictWordData **wd;

  Alloc_keep_discard ();

  num = Num[0] + Num[1];
  wd = (DictWordData **) Xmalloc (num * sizeof (DictWordData *));
  for (i = 0; (unsigned int)i < Num[0]; i++)
    wd[i] = Words[0] + i;
  for (i = 0; (unsigned int)i < Num[1]; i++)
    wd[i + Num[0]] = Words[1] + i;

  heap_build (wd, sizeof (*wd), num, hc);

  mem = 0;
  while (k > mem && num)
    {
      int idx;
      DictWordData *word = wd[0];
#ifdef DEBUG
      fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
	       keep[0].num_wds, keep[1].num_wds,
	       mem, word->documents(), word->word, word2str (word->word));
#endif
      mem += sizeof (u_char *) + word->word[0];
      heap_deletehead (wd, sizeof (*wd), &num, hc);
      idx = KIND (word);
      keep[idx].wd[keep[idx].num_wds++] = word;
    }

  for (i = 0; i < num; i++)
    {
      DictWordData *word = wd[i];
      int idx = KIND (word);
      discard[idx].wd[discard[idx].num_wds++] = word;
    }
  SortAndCount_DictData (&keep[0]);
  SortAndCount_DictData (&keep[1]);
  SortAndCount_DictData (&discard[0]);
  SortAndCount_DictData (&discard[1]);

  assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
  assert (keep[1].num_wds + discard[1].num_wds == Num[1]);

}


static int 
BigSaving (void *a, void *b)
{
  DictWordData *aa = *(DictWordData **) a;
  DictWordData *bb = *(DictWordData **) b;
  return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
}

static int 
SmallSaving (void *a, void *b)
{
  DictWordData *aa = *(DictWordData **) a;
  DictWordData *bb = *(DictWordData **) b;
  return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
}


static void 
CalcCharCounts (DictWordData ** wd, int num,
		char *char_lens[2], char *len_lens[2],
		u_long escape[2])
{
  long char_freqs[2][256];
  long len_freqs[2][16];
  huff_data hd;
  int i;
  escape[0] = 0;
  escape[1] = 0;
  bzero ((char *) char_freqs, sizeof (char_freqs));
  bzero ((char *) len_freqs, sizeof (len_freqs));
  for (i = 0; i < num; i++, wd++)
    {
      u_long freq = (*wd)->documents();
      u_char *buf = (*wd)->word;
      int len = *buf++;
      int idx = KIND (*wd);
      len_freqs[idx][len] += freq;
      escape[idx] += freq;
      for (; len; len--, buf++)
	char_freqs[idx][(u_long) (*buf)] += freq;
    }
  Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
  char_lens[0] = hd.clens;
  Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
  char_lens[1] = hd.clens;

  Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
  len_lens[0] = hd.clens;
  Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
  len_lens[1] = hd.clens;
}


inline void CalcBitCostForWordData(DictWordData **word, int num,
				   double freqs_trans[2], u_long escape[2],
				   char * char_lens[2], char *len_lens[2], 
				   double esc[2], int num_trans)
{ 
  int j;

  for (j = 0; j < num; j++, word++)
    {
      float   cbc, wbc;
      u_char *buf = (*word)->word;
      int     len = *buf++;
      u_long  freq = (*word)->documents();
      int     idx = KIND (*word);

      cbc = len_lens[idx][len];
      for (; len; len--, buf++)
	cbc += char_lens[idx][(u_long) (*buf)];

      (*word)->char_bit_cost = (cbc + esc[idx]) * freq;

      wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;

      (*word)->saving = ((*word)->char_bit_cost - wbc) /
	(sizeof (char *) + (*word)->word[0]);

      (*word)->num_trans = num_trans;
    }
}


void 
CalcBitCost (DictWordData ** discard_word, int discard_num,
	     DictWordData ** keep_word, int keep_num, double freqs_trans[2],
	     u_long escape[2], int num_trans)
{
  char *char_lens[2];
  char *len_lens[2];
  double esc[2];
  CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
  esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
  esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
  CalcBitCostForWordData(discard_word, discard_num, freqs_trans, escape,
			 char_lens, len_lens, esc, num_trans);
  CalcBitCostForWordData(keep_word, keep_num, freqs_trans, escape,
			 char_lens, len_lens, esc, num_trans);
  Xfree (char_lens[0]);
  Xfree (char_lens[1]);
  Xfree (len_lens[0]);
  Xfree (len_lens[1]);
}

inline void m3_transferWord(DictWordData **toHeap, int *toNum, heap_comp toSaving,
			    DictWordData **fromHeap, int *fromNum, heap_comp fromSaving,
			    int *num_trans, double freqs_trans[], int *mem, double *total,
			    int *count)
{
  /* Transfer the word at the top of the keep heap to the
     discard heap. */
  DictWordData *word = fromHeap[0];
  int idx = KIND (word);
  heap_deletehead (fromHeap, sizeof (word), fromNum, fromSaving);
  toHeap[*toNum] = word;
  heap_additem (toHeap, sizeof (word), toNum, toSaving);
  freqs_trans[idx] -= word->documents();
  *mem = (*mem) - sizeof (u_char *) + word->word[0];
  *num_trans += 1;
  *total += word->saving;
  *count += 1;
#ifdef DEBUG
  fprintf (stderr,
	   "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
	   *mem, word->documents(), word->char_bit_cost,
	   word->saving, word2str (word->word));
#endif
}

inline void m3_storeWord(DictWordData **wordHeap, int wordNum, int num_trans, 
			 double freqs_trans[], u_long escape[], heap_comp hc)
{
  DictWordData *word = wordHeap[0];
  int idx = KIND (word);
  float wbc;
#ifdef DEBUG1
  fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
	   word->saving);
#endif
  wbc = -log2 (word->documents() / (freqs_trans[idx] + escape[idx])) *
    word->documents();
  word->saving = (word->char_bit_cost - wbc) /
    (sizeof (char *) + word->word[0]);
#ifdef DEBUG1
  fprintf (stderr, " %.2f\n", word->saving);
#endif
  word->num_trans = num_trans;
  heap_changedhead (wordHeap, sizeof (word), wordNum, hc);
}


static void 
Method3 (int k)
{
  int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
  int keep_to_discard = 0;
  int discard_to_keep = 0;
  int recalcs = 0;
  double freqs_trans[2], total;
  u_long escape[2];
  DictWordData **keep_heap, **discard_heap;

  freqs_trans[0] = freqs_trans[1] = 0;
  num_trans = 0;

  discard_num = Num[0] + Num[1];
  discard_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));

  keep_num = 0;
  keep_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));


  for (i = 0; (unsigned int)i < Num[0]; i++)
    discard_heap[i] = Words[0] + i;
  for (i = 0; (unsigned int)i < Num[1]; i++)
    discard_heap[i + Num[0]] = Words[1] + i;

  heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);

  mem = 0;
  while (k > mem && discard_num)
    {
      DictWordData *word = discard_heap[0];
      mem += sizeof (u_char *) + word->word[0];
      heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
      keep_heap[keep_num++] = word;
      freqs_trans[KIND (word)] += word->documents();
      num_trans++;
    }

  CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
	       freqs_trans, escape, num_trans);
  heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
  heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);

  total = 0;
  recalc_reqd = 0;
  while (keep_num && discard_num)
    {
      if ((keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) ||
	  (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans))
	{
	  if (keep_num && keep_heap[0]->num_trans < (unsigned)num_trans)
	    {
	      m3_storeWord(keep_heap, keep_num, num_trans, freqs_trans, escape, SmallSaving);
	    }

	  if (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans)
	    {
	      m3_storeWord(discard_heap, discard_num, num_trans, freqs_trans, escape, BigSaving);
	    }
	}
      else if (keep_heap[0]->saving < discard_heap[0]->saving)
	{
	  assert (keep_num && discard_num);
	  if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > (unsigned)k)
	    {
	      /* Transfer the word at the top of the keep heap to the
	         discard heap. */
	      m3_transferWord(discard_heap, &discard_num, BigSaving,
			      keep_heap, &keep_num, SmallSaving,
			      &num_trans, freqs_trans, &mem, &total, &keep_to_discard);
	    }
	  else
	    {
	      /* Transfer the word at the top of the discard heap to the
	         keep heap. */
	      m3_transferWord(keep_heap, &keep_num, SmallSaving,
			      discard_heap, &discard_num, BigSaving,
			      &num_trans, freqs_trans, &mem, &total, &discard_to_keep);
	    }

	  recalc_reqd = 1;

	}
      else
	{
	  if (recalc_reqd == 0)
	    break;
#ifdef DEBUG1
	  fprintf (stderr, "--------------\n");
#endif
	  if (recalcs == MAX_RECALCULATIONS)
	    break;
	  CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
		       freqs_trans, escape, num_trans);
	  heap_build (discard_heap, sizeof (*discard_heap),
		      discard_num, BigSaving);
	  heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
	  recalc_reqd = 0;
	  recalcs++;
	}
    }

  Alloc_keep_discard ();

  for (i = 0; i < discard_num; i++)
    {
      DictWordData *word = discard_heap[i];
      int idx = KIND (word);
      assert (IsWord (word) || IsNonWord (word));
      discard[idx].wd[discard[idx].num_wds++] = word;
    }
  for (i = 0; i < keep_num; i++)
    {
      DictWordData *word = keep_heap[i];
      int idx = KIND (word);
      assert (IsWord (word) || IsNonWord (word));
      keep[idx].wd[keep[idx].num_wds++] = word;
    }
  SortAndCount_DictData (&keep[0]);
  SortAndCount_DictData (&keep[1]);
  SortAndCount_DictData (&discard[0]);
  SortAndCount_DictData (&discard[1]);

  assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
  assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
  Message ("Keep -> Discard        : %8d", keep_to_discard);
  Message ("Discard -> Keep        : %8d", discard_to_keep);
  Message ("Huffman Recalculations : %8d", recalcs);
  if (recalcs == MAX_RECALCULATIONS)
    Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
}


/****************************************************************************
 *****									*****
 *****			Dictionary saving code				*****
 *****									*****
 ****************************************************************************/


static void 
Write_cdh (FILE * f, compression_dict_header * cdh)
{
  /* [RPAP - Jan 97: Endian Ordering] */
  int i;
  compression_dict_header tmp = *cdh;
  HTONUL(tmp.dict_type);
  HTONUL(tmp.novel_method);
  for (i = 0; i < TEXT_PARAMS; i++)
    HTONUL(tmp.params[i]);
  HTONUL(tmp.num_words[0]);
  HTONUL(tmp.num_words[1]);
  HTONUL(tmp.num_word_chars[0]);
  HTONUL(tmp.num_word_chars[1]);
  HTONUL(tmp.lookback);

  fwrite (&tmp, sizeof (tmp), 1, f);
}


static void 
Write_words (FILE * f, DictData * dd)
{
  unsigned int i;
  u_char *curr, *prev = NULL;
  for (i = 0; i < dd->num_wds; i++)
    {
      int len;
      curr = dd->wd[i]->word;
      if (prev)
	/* look for prefix match with prev string */
	len = prefixlen (prev, curr);
      else
	len = 0;
      fputc ((len << 4) + (curr[0] - len), f);
      fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
      prev = curr;
    }

}


static int 
Uncompressed_size (DictData * dd)
{
  unsigned int i, us;
  for (us = i = 0; i < dd->num_wds; i++)
    us += dd->wd[i]->word[0];
  return us;
}


static u_long 
Write_data (FILE * f, DictData * dd, int lookback)
{
  u_long mem_reqd;
  huff_data *hd;
  int i; 
  u_long us = dd->num_wds;
  long *freqs;
  u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
  u_long lencounts[MAX_HUFFCODE_LEN + 1];
  u_char *lastword[MAX_HUFFCODE_LEN + 1];

  if (!(freqs = new long [dd->num_wds]))
    FatalError (1, "Unable to allocate memory for freqs");

  for (i = 0; (unsigned)i < dd->num_wds; i++)
    {
      freqs[i] = dd->wd[i]->documents();
      us += dd->wd[i]->word[0];
    }

  if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
    FatalError (1, "Unable to allocate memory for huffman data");

  delete (freqs);
  freqs = NULL;

  if (Write_Huffman_Data (f, hd) == -1)
    FatalError (1, "Unable to write huffman data");

  HTONUL(us);  /* [RPAP - Jan 97: Endian Ordering] */
  fwrite (&us, sizeof (us), 1, f);
  NTOHUL(us);  /* [RPAP - Jan 97: Endian Ordering] */


/* Calculate the amount of memory that will be required to store the text for
   each different huffman code len. Every 1<<lookback words for each different
   codelen length will not be prefixed by previous strings. */


  bzero ((char *) &huff_words_size, sizeof (huff_words_size));
  bzero ((char *) &lencounts, sizeof (lencounts));

  mem_reqd = 0;

  for (i = 0; (unsigned)i < dd->num_wds; i++)
    {
      int codelen = hd->clens[i];
      u_char *word = dd->wd[i]->word;

      if (!codelen)
	FatalError (1, "The length of a code for a word was zero");

      huff_words_size[codelen] += word[0] + 1;
      mem_reqd += word[0] + (lookback != 0);
#if 0
      if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
	lastword[codelen] = word;
      else
	huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
#else
      if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
	{
	  int save = prefixlen (lastword[codelen], word);
	  mem_reqd -= save;
	  huff_words_size[codelen] -= save;
	}
      else
	{
	  mem_reqd += sizeof (u_char *);
	}
      lastword[codelen] = word;
#endif
      lencounts[codelen]++;
    }

  /* [RPAP - Jan 97: Endian Ordering] */
  for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
    HTONUL(huff_words_size[i]);

  fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
	  hd->maxcodelen - hd->mincodelen + 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
    NTOHUL(huff_words_size[i]);

  Write_words (f, dd);

  delete hd->clens;
  delete hd;

  return mem_reqd;
}


static void 
Write_charfreqs (FILE * f, DictData * dd, int words,
		 int zero_freq_permitted)
{
  unsigned int j;
  long freqs[256];
  DictWordData **wd = dd->wd;
  huff_data *hd;

  bzero ((char *) freqs, sizeof (freqs));

  for (j = 0; j < dd->num_wds; j++, wd++)
    {
      u_char *buf = (*wd)->word;
      int len = *buf++;
      for (; len; len--, buf++)
	freqs[(u_long) (*buf)] += (*wd)->documents();
    }

  if (!zero_freq_permitted)
    for (j = 0; j < 256; j++)
      if (!freqs[j] && PESINAWORD (j) == words)
	freqs[j] = 1;

  if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
    FatalError (1, "Unable to allocate memory for huffman data");

  if (Write_Huffman_Data (f, hd) == -1)
    FatalError (1, "Unable to write huffman data");

  delete hd->clens;
  delete hd;
}


static void 
Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
{
  unsigned int j;
  long freqs[16];
  DictWordData **wd = dd->wd;
  huff_data *hd;

  bzero ((char *) freqs, sizeof (freqs));

  for (j = 0; j < dd->num_wds; j++, wd++)
    freqs[(*wd)->word[0]] += (*wd)->documents();

  if (!zero_freq_permitted)
    for (j = 0; j < 16; j++)
      if (!freqs[j])
	freqs[j] = 1;

  if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
    FatalError (1, "Unable to allocate memory for huffman data");

  if (Write_Huffman_Data (f, hd) == -1)
    FatalError (1, "Unable to write huffman data");


  delete hd->clens;
  delete hd;
}


static u_long 
WriteOutWords (char *file_name, u_long type, int lookback)
{
  FILE *f;
  int i;
  u_long mem_reqd = 0;

  compression_dict_header cdh;
  f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
		   MAGIC_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  bzero((char *) &cdh, sizeof(cdh));

  cdh.dict_type = type;
  cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
    MG_NOVEL_HUFFMAN_CHARS;

  cdh.num_words[1] = keep[1].num_wds;
  cdh.num_words[0] = keep[0].num_wds;
  cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
  cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
  cdh.lookback = lookback;

  Write_cdh (f, &cdh);

  for (i = 0; i < 2; i++)
    switch (type)
      {
      case MG_COMPLETE_DICTIONARY:
	{
	  mem_reqd += Write_data (f, &keep[i], lookback);
	}
	break;
      case MG_PARTIAL_DICTIONARY:
	{
	  if (keep[i].num_wds)
	    {
	      int j;
	      DictWordData esc;
	      esc.docCount = 0;
	      esc.word = (u_char *) "";
	      keep[i].wd[keep[i].num_wds++] = &esc;
	      for (j = 0; (unsigned)j < discard[i].num_wds; j++)
		esc.docCount += discard[i].wd[j]->documents();
	      if (!esc.docCount)
		esc.docCount++;
	      mem_reqd += Write_data (f, &keep[i], lookback);
	    }
	  Write_charfreqs (f, &discard[i], i, 1);
	  Write_wordlens (f, &discard[i], 1);
	}
	break;
      case MG_SEED_DICTIONARY:
	{
	  if (keep[i].num_wds)
	    {
	      int j;
	      DictWordData esc;
	      esc.docCount = 0;
	      esc.word = (u_char *) "";
	      keep[i].wd[keep[i].num_wds++] = &esc;
	      for (j = 0; (unsigned)j < all[i].num_wds; j++)
		if (all[i].wd[j]->documents() == 1)
		  esc.docCount++;
	      if (!esc.docCount)
		esc.docCount++;
	      mem_reqd += Write_data (f, &keep[i], lookback);
	    }
	  switch (novel_method)
	    {
	    case MG_NOVEL_HUFFMAN_CHARS:
	      Write_charfreqs (f, &all[i], i, 0);
	      Write_wordlens (f, &all[i], 0);
	      break;
	    case MG_NOVEL_DELTA:
	      break;
	    case MG_NOVEL_HYBRID:
	      break;
	    default:
	      FatalError (1, "Bad novel method");
	    }
	}
	break;
      }
  fclose (f);
  return mem_reqd;
}