/**************************************************************************
 *
 * mg_fast_comp_dict.cpp -- Program to generate a fast compression dictionary
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mg_fast_comp_dict.cpp 860 2000-01-18 03:53:24Z rjmcnab $
 *
 **************************************************************************/


/*
   $Log$
   Revision 1.2  2000/01/18 03:53:23  rjmcnab
   Fixed a couple of bugs and made building silent if needed.

   Revision 1.1  2000/01/14 02:26:13  sjboddie
   Rodgers new C++ mg

   Revision 1.2  1999/10/17 23:43:26  cs025
   Changes to eradicate Xmalloc

   Revision 1.1  1999/10/11 02:57:50  cs025
   Base install of MG-PP

   Revision 1.1  1999/08/10 21:18:06  sjboddie
   renamed mg-1.3d directory mg

   Revision 1.2  1998/11/25 07:55:44  rjmcnab

   Modified mg to that you can specify the stemmer you want
   to use via a command line option. You specify it to
   mg_passes during the build process. The number of the
   stemmer that you used is stored within the inverted
   dictionary header and the stemmed dictionary header so
   the correct stemmer is used in later stages of building
   and querying.

   Revision 1.1  1998/11/17 09:34:57  rjmcnab
   *** empty log message ***

   * Revision 1.3  1994/10/20  03:56:55  tes
   * I have rewritten the boolean query optimiser and abstracted out the
   * components of the boolean query.
   *
   * Revision 1.2  1994/09/20  04:41:47  tes
   * For version 1.1
   *
 */

#include "sysfuncs.h"

#include "huffman.h"
#include "messages.h"
#include "memlib.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */

#include "local_strings.h"
#include "mg.h"
#include "text.h"
#include "invf.h"
#include "mg_files.h"
#include "locallib.h"
#include "words.h"


#define ALIGN_SIZE(p,t) (((p) + (sizeof(t)-1)) & (~(sizeof(t)-1)))

#define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*))
#define FIXUP(p) (fixup[WORDNO(p,buffer)/8] |= (1<<(WORDNO(p, buffer) & 7)))

#define IS_FIXUP(p) ((fixup[WORDNO(p,buffer)/8] & (1<<(WORDNO(p, buffer) & 7))) != 0)

#define FIXUP_VALS(vals) do {						\
	int i;								\
	for (i=0; i < MAX_HUFFCODE_LEN+1; i++)				\
	  FIXUP(&vals[i]);						\
      } while(0)


static u_long mem_for_comp_dict (char *filename);
static void load_comp_dict (char *filename);
static void save_fast_dict (char *filename);
static void unfixup_buffer (void);


static void *buffer;
static void *cur;
static u_char *fixup;
static u_long mem, fixup_mem;

int main (int argc, char **argv)
{
  int ch;
  char *filename = "";
  opterr = 0;
  msg_prefix = argv[0];
  while ((ch = getopt (argc, argv, "f:d:h")) != -1)
    switch (ch)
      {
      case 'f':		/* input file */
	filename = optarg;
	break;
      case 'd':
	set_basepath (optarg);
	break;
      case 'h':
      case '?':
	fprintf (stderr, "usage: %s [-f input_file] [-d data directory] [-h]\n",
		 argv[0]);
	exit (1);
      }

  mem = mem_for_comp_dict (filename);

  fixup_mem = (ALIGN_SIZE (mem, u_char *) / sizeof (u_char *) + 7) / 8;

  cur = buffer = Xmalloc (mem);
  bzero (buffer, mem);
  fixup = (u_char *) Xmalloc (fixup_mem);
  bzero (fixup, fixup_mem);

#ifndef SILENT
  Message ("Estimated memory for fast_dict %u", mem);
  Message ("Estimated memory for fixups %u", fixup_mem);
#endif

  load_comp_dict (filename);

#ifndef SILENT
  Message ("Actual memory for fast_dict %u", (char *) cur - (char *) buffer);
#endif

  if ((u_long) cur > (u_long) buffer + mem)
    FatalError (1, "The buffer was not big enough for the dictionary");

  {
    /* [RPAP - Jan 97: Endian Ordering] */
    compression_dict *cd = (compression_dict *) buffer;
    int i, which;

    /* cfh */
    for (which = 0; which <= 1; which++)
      {
	int j;
	
	HTONSI(cd->cfh[which]->hd.num_codes);
	HTONSI(cd->cfh[which]->hd.mincodelen);
	HTONSI(cd->cfh[which]->hd.maxcodelen);
	for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
	  {
	    HTONSI(cd->cfh[which]->hd.lencount[j]);
	    HTONUL(cd->cfh[which]->hd.min_code[j]);
	  }
	HTONUL(cd->cfh[which]->uncompressed_size);
	for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++)
	  HTONUL(cd->cfh[which]->huff_words_size[j]);
      }
    HTONUL(cd->MemForCompDict);
    /* ad */
    if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
	cd->cdh.novel_method == MG_NOVEL_HYBRID)
      for (which = 0; which <= 1; which++)
	{
	  int j;
	  
	  HTONUL(cd->ad->afh[which].num_frags);
	  HTONUL(cd->ad->afh[which].mem_for_frags);
	  for (j = 0; j < 33; j++)
	    {
	      HTONSI(cd->ad->blk_start[which][j]);
	      HTONSI(cd->ad->blk_end[which][j]);
	    }
	}
    /* cdh */
    HTONUL(cd->cdh.dict_type);
    HTONUL(cd->cdh.novel_method);
    for (i = 0; i < TEXT_PARAMS; i++)
      HTONUL(cd->cdh.params[which]);
    HTONUL(cd->cdh.num_words[0]);
    HTONUL(cd->cdh.num_words[1]);
    HTONUL(cd->cdh.num_word_chars[0]);
    HTONUL(cd->cdh.num_word_chars[1]);
    HTONUL(cd->cdh.lookback);
    HTONSI(cd->fast_loaded);
  }

  unfixup_buffer ();

  save_fast_dict (filename);

  exit (0);
}


static void 
unfixup_buffer ()
{
  u_long *p;
  for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++)
    {
      if (IS_FIXUP (p))
	*p = *p - (u_long) buffer;
    }
}


static u_long 
mem_for_aux_dict (compression_dict_header * /*cdh*/, char *filename)
{
  int i;
  u_long mem = sizeof (auxiliary_dict);
  FILE *aux;

  aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb",
		   MAGIC_AUX_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  for (i = 0; i <= 1; i++)
    {
      aux_frags_header afh;
      fread (&afh, sizeof (afh), 1, aux);
      NTOHUL(afh.num_frags);  /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(afh.mem_for_frags);  /* [RPAP - Jan 97: Endian Ordering] */
      mem += afh.num_frags * sizeof (u_char *);
      mem = ALIGN_SIZE (mem + afh.mem_for_frags, u_char *);
      fseek (aux, afh.mem_for_frags, SEEK_CUR);
    }
  fclose (aux);

  return mem;
}


static u_long 
mem_for_words (FILE * dict, compression_dict_header * cdh,
	       comp_frags_header * cfh)
{
  u_long mem = 0;
  long i, lookback;
  int ptrs_reqd = 0;
  int mem_reqd = 0;

  lookback = cdh->lookback;

  for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
    {
      ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
      mem_reqd += cfh->huff_words_size[i];
    }

  mem += ptrs_reqd * sizeof (u_char *);
  mem += (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **);
  mem += mem_reqd;

  for (i = 0; i < cfh->hd.num_codes; i++)
    {
      register int val;
      for (val = getc (dict) & 0xf; val; val--)
	getc (dict);
    }
  return ALIGN_SIZE (mem, u_char *);
}

static u_long mem_skip_hd(FILE *dict, u_long mem)
{ huff_data hd;

  mem += sizeof (hd);
  Read_Huffman_Data (dict, &hd, NULL, NULL);
  if (hd.clens)
    delete hd.clens;
  mem += hd.num_codes * sizeof (unsigned long);
  mem += (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *);
  return mem;
}

static u_long mem_read_cfh(FILE *dict, compression_dict_header *cdh, comp_frags_header *cfh, 
			   u_long mem)
{
  mem += sizeof (comp_frags_header);
  Read_cfh (dict, cfh, NULL, NULL);

  /* Don't need to count the space for the clens of the huffman data */

  mem += mem_for_words (dict, cdh, cfh);
  if (cfh->hd.clens)
    delete cfh->hd.clens;

  return mem;
}


static u_long 
mem_for_comp_dict (char *filename)
{
  u_long mem = sizeof (compression_dict);
  compression_dict_header cdh;
  comp_frags_header cfh;
  FILE *dict;
  int which;
  int i;  /* [RPAP - Jan 97: Endian Ordering] */

  dict = open_file (filename, TEXT_DICT_SUFFIX, "rb",
		    MAGIC_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  fread (&cdh, sizeof (cdh), 1, dict);
  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL(cdh.dict_type);
  NTOHUL(cdh.novel_method);
  for (i = 0; i < TEXT_PARAMS; i++)
    NTOHUL(cdh.params[i]);
  NTOHUL(cdh.num_words[0]);
  NTOHUL(cdh.num_words[1]);
  NTOHUL(cdh.num_word_chars[0]);
  NTOHUL(cdh.num_word_chars[1]);
  NTOHUL(cdh.lookback);

  for (which = 0; which < 2; which++)
    switch (cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
	{
	  mem = mem_read_cfh(dict, &cdh, &cfh, mem);
	}
	break;
      case MG_PARTIAL_DICTIONARY:
	{
	  // huff_data hd;
	  if (cdh.num_words[which])
	    {
	      mem = mem_read_cfh(dict, &cdh, &cfh, mem);
	    }

	  mem = mem_skip_hd(dict, mem);
	  mem = mem_skip_hd(dict, mem);
	}
	break;
      case MG_SEED_DICTIONARY:
	{
	  // huff_data hd;
	  if (cdh.num_words[which])
	    {
	      mem = mem_read_cfh(dict, &cdh, &cfh, mem);
	    }
	  switch (cdh.novel_method)
	    {
	    case MG_NOVEL_HUFFMAN_CHARS:
	      mem = mem_skip_hd(dict, mem);
	      mem = mem_skip_hd(dict, mem);
	      break;
	    case MG_NOVEL_DELTA:
	      break;
	    case MG_NOVEL_HYBRID:
	      break;
	    }
	  break;
	}
      }
  fclose (dict);

  if (cdh.novel_method == MG_NOVEL_DELTA ||
      cdh.novel_method == MG_NOVEL_HYBRID)
    mem += mem_for_aux_dict (&cdh, filename);

  return ALIGN_SIZE (mem, u_char *);
}


void *
getmem (u_long size, int align)
{
  void *res;
  cur = (void *) (((u_long) cur + (align - 1)) & (~(align - 1)));
  res = cur;
  cur = (char *) cur + size;
  if ((u_long) cur > (u_long)buffer + mem)
    FatalError (1, "The buffer was not big enough for the dictionary");
  return res;
}


static auxiliary_dict *
LoadAuxDict (compression_dict_header * cdh,
	     char *filename)
{
  auxiliary_dict *ad;
  int i;
  FILE *aux;

  aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb",
		   MAGIC_AUX_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  ad = (auxiliary_dict *) getmem (sizeof (auxiliary_dict), sizeof (u_char *));

  for (i = 0; i <= 1; i++)
    {
      unsigned int j;
      u_char *pos;

      fread (&ad->afh[i], sizeof (aux_frags_header), 1, aux);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(ad->afh[i].num_frags);
      NTOHUL(ad->afh[i].mem_for_frags);

      ad->word_data[i] = (u_char *) getmem (ad->afh[i].mem_for_frags, 1);
      FIXUP (&ad->word_data[i]);

      ad->words[i] = (u_char **) getmem (ad->afh[i].num_frags * sizeof (u_char *),
			     sizeof (u_char *));
      FIXUP (&ad->words[i]);

      fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char), aux);

      pos = ad->word_data[i];
      for (j = 0; j < ad->afh[i].num_frags; j++)
	{
	  ad->words[i][j] = pos;
	  FIXUP (&ad->words[i][j]);
	  pos += *pos + 1;
	}
      if (cdh->novel_method == MG_NOVEL_HYBRID)
	{
	  int num;
	  num = 1;
	  ad->blk_start[i][0] = 0;
	  ad->blk_end[i][0] = cdh->num_words[i] - 1;
	  while (num < 33)
	    {
	      ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1;
	      ad->blk_end[i][num] = ad->blk_start[i][num] +
		(ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2;
	      num++;
	    }
	}
    }
  fclose (aux);
  return (ad);
}


static u_char ***
ReadInWords (FILE * dict, compression_dict * cd,
	     comp_frags_header * cfh, u_char ** escape)
{
  int i, lookback;
  int ptrs_reqd = 0;
  int mem_reqd = 0;
  int num_set[MAX_HUFFCODE_LEN + 1];
  u_char *next_word[MAX_HUFFCODE_LEN + 1];
  u_char **vals;
  u_char ***values;
  u_char word[MAXWORDLEN + 1];
  u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1];

  lookback = cd->cdh.lookback;

  for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++)
    {
      ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback;
      mem_reqd += cfh->huff_words_size[i];
    }

  vals = (u_char **) getmem (ptrs_reqd * sizeof (*vals), sizeof (u_char *));

  values = (u_char ***) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (u_char **), sizeof (u_char **));

  next_word[0] = (u_char *) getmem (mem_reqd, sizeof (char));

  cd->MemForCompDict += ptrs_reqd * sizeof (*vals) +
    (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) +
    mem_reqd;

  values[0] = vals;
  FIXUP (&values[0]);
  values[0][0] = next_word[0];
  FIXUP (&values[0][0]);
  for (i = 1; i <= cfh->hd.maxcodelen; i++)
    {
      int next_start = (values[i - 1] - vals) +
      ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback);
      values[i] = &vals[next_start];
      FIXUP (&values[i]);
      next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1];
      values[i][0] = next_word[i];
      FIXUP (&values[i][0]);
    }

  bzero ((char *) num_set, sizeof (num_set));

  for (i = 0; i < cfh->hd.num_codes; i++)
    {
      register int val, copy;
      register int len = cfh->hd.clens[i];
      val = getc (dict);
      copy = (val >> 4) & 0xf;
      val &= 0xf;

      fread (word + copy + 1, sizeof (u_char), val, dict);
      *word = val + copy;

      if ((num_set[len] & ((1 << lookback) - 1)) == 0)
	{
	  values[len][num_set[len] >> lookback] = next_word[len];
	  FIXUP (&values[len][num_set[len] >> lookback]);
	  memcpy (next_word[len], word, *word + 1);
	  if (escape && i == cfh->hd.num_codes - 1)
	    {
	      *escape = next_word[len];
	      FIXUP (escape);
	    }
	  next_word[len] += *word + 1;
	}
      else
	{
	  copy = prefixlen (last_word[len], word);
	  memcpy (next_word[len] + 1, word + copy + 1, *word - copy);
	  *next_word[len] = (copy << 4) + (*word - copy);
	  if (escape && i == cfh->hd.num_codes - 1)
	    {
	      *escape = next_word[len];
	      FIXUP (escape);
	    }
	  next_word[len] += (*word - copy) + 1;
	}
      memcpy (last_word[len], word, *word + 1);
      num_set[len]++;
    }
  if (cfh->hd.clens)
    delete cfh->hd.clens;
  cfh->hd.clens = NULL;
  return values;
}


static unsigned long **
Generate_Fast_Huffman_Vals (huff_data * data, u_long * mem)
{
  int i;
  unsigned long *fcode[MAX_HUFFCODE_LEN + 1];
  unsigned long **values;
  unsigned long *vals;

  if (!data)
    return (NULL);
  vals = (unsigned long *) getmem (data->num_codes * sizeof (*vals), sizeof (long *));
  values = (unsigned long **) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *),
		   sizeof (long *));

  bzero ((char *) values, (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *));

  if (mem)
    *mem += data->num_codes * sizeof (*vals) +
      (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *);

  fcode[0] = values[0] = &vals[0];
  FIXUP (&values[0]);
  for (i = 1; i <= data->maxcodelen; i++)
    {
      fcode[i] = values[i] = &vals[(values[i - 1] - vals) + data->lencount[i - 1]];
      FIXUP (&values[i]);
    }

  for (i = 0; i < data->num_codes; i++)
    if (data->clens[i])
      *fcode[(int) (data->clens[i])]++ = i;
  return (values);
}


static void load_read_huffman(FILE *dict, compression_dict *cd, int which)
{
  huff_data *hd;
  u_long   **vals;

  hd = (huff_data *) getmem (sizeof (huff_data), sizeof (char *));
  cd->MemForCompDict += sizeof (huff_data);
  Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL);
  vals = Generate_Fast_Huffman_Vals (hd, &cd->MemForCompDict);
  cd->chars_huff[which] = hd;
  FIXUP (&cd->chars_huff[which]);
  cd->chars_vals[which] = vals;
  FIXUP (&cd->chars_vals[which]);
  if (hd->clens)
    delete hd->clens;
  hd->clens = NULL;
}

static void load_read_words(FILE *dict, compression_dict *cd, int which, int fixescape)
{
  cd->cfh[which] = (comp_frags_header *) getmem (sizeof (*cd->cfh[which]), sizeof (u_char *));
  cd->MemForCompDict += sizeof (*cd->cfh[which]);
  Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL);

  cd->values[which] = ReadInWords (dict, cd, cd->cfh[which], NULL);
  FIXUP (&cd->cfh[which]);
  FIXUP (&cd->values[which]);
  if (fixescape)
    { FIXUP(&cd->escape[which]);
    }
  else
    {
      cd->escape[which] = NULL;
    }
}

static void 
load_comp_dict (char *filename)
{
  FILE *dict;
  int which;
  compression_dict *cd;

  cd = (compression_dict *) getmem (sizeof (compression_dict), sizeof (u_char *));
  cd->fast_loaded = 1;

  dict = open_file (filename, TEXT_DICT_SUFFIX, "rb",
		    MAGIC_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  Read_cdh (dict, &cd->cdh, NULL, NULL);

  for (which = 0; which < 2; which++)
    switch (cd->cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
	{
	  load_read_words(dict, cd, which, 0);
	}
	break;
      case MG_PARTIAL_DICTIONARY:
	{
	  if (cd->cdh.num_words[which])
	    {
	      load_read_words(dict, cd, which, 1);
	    }

	  load_read_huffman(dict, cd, which);
	  load_read_huffman(dict, cd, which);
	}
	break;
      case MG_SEED_DICTIONARY:
	{
	  if (cd->cdh.num_words[which])
	    {
	      load_read_words(dict, cd, which, 1);
	    }
	  switch (cd->cdh.novel_method)
	    {
	    case MG_NOVEL_HUFFMAN_CHARS:
	      load_read_huffman(dict, cd, which);
	      load_read_huffman(dict, cd, which);
	      break;
	    case MG_NOVEL_DELTA:
	      break;
	    case MG_NOVEL_HYBRID:
	      break;
	    }
	  break;
	}
      }
  fclose (dict);


  if (cd->cdh.novel_method == MG_NOVEL_DELTA ||
      cd->cdh.novel_method == MG_NOVEL_HYBRID)
    {
      cd->ad = LoadAuxDict (&cd->cdh, filename);
      FIXUP (&cd->ad);
    }
}


static void 
save_fast_dict (char *filename)
{
  FILE *fdict;

  fdict = create_file (filename, TEXT_DICT_FAST_SUFFIX, "wb",
		       MAGIC_FAST_DICT, MG_ABORT);  /* [RPAP - Feb 97: WIN32 Port] */

  {
    u_long *p;
    for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++)
      {
	if (IS_FIXUP (p))
	  HTONUL(*p);
      }
  }

  /* [RPAP - Jan 97: Endian Ordering] */
  HTONUL(mem);
  HTONUL(fixup_mem);

  fwrite (&mem, sizeof (mem), 1, fdict);
  fwrite (&fixup_mem, sizeof (fixup_mem), 1, fdict);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL(mem);
  NTOHUL(fixup_mem);

  fwrite (buffer, sizeof (u_char), mem, fdict);
  fwrite (fixup, sizeof (u_char), fixup_mem, fdict);

  fclose (fdict);
}