/**************************************************************************
 *
 * bool_query.c -- Boolean query evaluation
 * Copyright (C) 1994  Neil Sharman & Tim Shimmin
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *
 **************************************************************************/

#include "sysfuncs.h"

#include "memlib.h"
#include "messages.h"
#include "filestats.h"
#include "timing.h"
#include "sptree.h"


#include "mg.h"
#include "text.h"
#include "invf.h"
#include "lists.h"
#include "term_lists.h"
#include "backend.h"
#include "words.h"
#include "stemmer.h"
#include "stem_search.h"
#include "invf_get.h"
#include "text_get.h"
#include "read_line.h"
#include "bool_tree.h"
#include "bool_parser.h"
#include "bool_optimiser.h"
#include "environment.h"


/**********************
  [RPAP - Feb 97: Term Frequency]

  Shifted to bool_parser.y

  *********************/

/* =========================================================================
 * Function: ReadInTermInfo
 * Description: 
 * Input: 
 * Output: 
 * ========================================================================= */
/**************************
static void 
ReadInTermInfo (query_data * qd)
{
  int i;
  TermList *tl = qd->TL;

  for (i = 0; i < tl->num; i++)
    {
      TermEntry *te = &(tl->TE[i]);
      WordEntry *we = &(te->WE);

       we->word_num = FindWord (qd->sd, te->Word,
			       &we->count, &we->doc_count,
			       &we->invf_ptr, &we->invf_len);
      if (we->word_num == -1)
	we->count = we->doc_count = 0;
    }
}

***********************/


/* =========================================================================
 * Function: FindNoneTerms 
 * Description: 
 *      Set any terms which are not in dictionary to N_none (i.e. false)
 *      Assumes a prior call to ReadInTermInfo
 * Input: 
 * Output: 
 * ========================================================================= */

static void 
FindNoneTerms (query_data * qd, bool_tree_node * tree)
{
  if (BOOL_HAS_CHILD (tree))
    {
      bool_tree_node *child = BOOL_CHILD (tree);
      while (child)
	{
	  FindNoneTerms (qd, child);
	  child = BOOL_SIBLING (child);
	}
    }
  else if (BOOL_TAG (tree) == N_term)
    {
      int doc_count = qd->TL->TE[BOOL_TERM (tree)].WE.doc_count;
      if (!doc_count)
	BOOL_TAG (tree) = N_none;
    }

}				/*FindNoneTerms */

/* =========================================================================
 * Function: ReduceList
 * Description: 
 *      Go thru doc-set and get rid of all docs which are unmarked
 *      The motivation for this routine is in the case of CNF queries.
 * Input: 
 * Output: 
 * ========================================================================= */

void
ReduceList (DocList * cset)
{
  DocEntry *src;
  DocEntry *dest;

  src = dest = cset->DE;
  for (src = dest = cset->DE; src < cset->DE + cset->num; src++)
    {
      if (src->or_included)
	{
	  src->or_included = 0;
	  *dest++ = *src;
	}
    }
  cset->num = dest - cset->DE;

}				/*ReduceList */

/* =========================================================================
 * Function: BooleanGet
 * Description: 
 * Input: 
 * Output: 
 * ========================================================================= */
static DocList *
BooleanGet (query_data * qd, bool_tree_node * n,
	    BooleanQueryInfo * bqi)
{
  if (n)
    {
      DocList *res = NULL;
      switch (BOOL_TAG (n))
	{
	case N_all:
	  Message ("Unexpected \"all\" node in the parse tree.");
	  res = MakeDocList (0);
	  break;
	case N_not:
	  Message ("Unexpected \"not\" node in the parse tree.");
	  res = MakeDocList (0);
	  break;
	case N_none:
	  {
	    res = MakeDocList (0);
	    break;
	  }
	case N_diff:
	  {
	    res = BooleanGet (qd, BOOL_CHILD (n), bqi);
	    n = BOOL_SIBLING (BOOL_CHILD (n));
	    while (n)
	      {
		bool_tree_node *next = BOOL_SIBLING (n);
		if (BOOL_TAG (n) == N_term)
		  {
		    WordEntry *we = GetNthWE (qd->TL, BOOL_TERM (n));
		    res = GetDocsOp (qd, we, op_diff_terms, res);
		  }
		else
		  res = DiffLists (res, BooleanGet (qd, n, bqi));
		n = next;
	      }
	    break;
	  }
	case N_and:
	  {
	    res = BooleanGet (qd, BOOL_CHILD (n), bqi);		/* initial fill of c-set */
	    n = BOOL_SIBLING (BOOL_CHILD (n));
	    while (n)
	      {
		bool_tree_node *next = BOOL_SIBLING (n);
		switch (BOOL_TAG (n))
		  {
		  case N_term:
		    {
		      WordEntry *we = GetNthWE (qd->TL, BOOL_TERM (n));
		      res = GetDocsOp (qd, we, op_and_terms, res);
		    }
		    break;
		  case N_or_terms:
		    {
		      bool_tree_node *child = BOOL_CHILD (n);
		      /* note: all children are terms - predetermined */
		      while (child)
			{
			  WordEntry *we = GetNthWE (qd->TL, BOOL_TERM (child));
			  res = GetDocsOp (qd, we, op_and_or_terms, res);
			  child = BOOL_SIBLING (child);
			}
		      ReduceList (res);
		    }
		    break;
		  default:
		    res = IntersectLists (res, BooleanGet (qd, n, bqi));
		  }		/*switch */
		n = next;
	      }
	    break;
	  }
	case N_or:
	case N_or_terms:
	  {
	    res = BooleanGet (qd, BOOL_CHILD (n), bqi);
	    n = BOOL_SIBLING (BOOL_CHILD (n));
	    while (n)
	      {
		bool_tree_node *next = BOOL_SIBLING (n);
		res = MergeLists (res, BooleanGet (qd, n, bqi));
		n = next;
	      }
	    break;
	  }
	case N_term:
	  {
	    res = GetDocsOp (qd, GetNthWE (qd->TL, BOOL_TERM (n)), op_term, NULL);
	    break;
	  }
	}
      return (res ? res : MakeDocList (0));
    }
  else
    return MakeDocList (0);
}

/* =========================================================================
 * Function: AdjustParaDocs
 * Description: 
 * Input: 
 * Output: 
 * ========================================================================= */
static int 
DE_comp (void *a, void *b)
{
  return ((DocEntry *) a)->SeekPos - ((DocEntry *) b)->SeekPos;
}

static void
AdjustParaDocs (query_data * qd, DocList * Docs)
{
  DocEntry *DE = Docs->DE;
  int s, d;
  Splay_Tree *SP;


  /* Make the doc-list contain only references to unique
   * real documents. So can get rid of cases where there
   * are multiple paragraphs for a real document.
   */

  /* Note: [TS: 29/Sep/94] */
  /* reads in seek-pos for docnums so can compare docs */
  /* seek-pos used as real doc identifier */
  /* whereas doc-num entries refer to paragraphs -- I think */
  for (s = 0; s < Docs->num; s++)
    FetchDocStart (qd, DE[s].DocNum, &DE[s].SeekPos, &DE[s].Len);

  Message ("Original Number = %d\n", Docs->num);
  SP = SP_createset (DE_comp);
  d = 0;
  for (s = 0; s < Docs->num; s++)
    if (!SP_member (&DE[s], SP))
      {
	DE[d] = DE[s];
	SP_insert (&DE[d], SP);
	d++;
      }
  SP_freeset (SP);
  Docs->num = d;
  Message ("Final Number = %d\n", Docs->num);
}

/* =========================================================================
 * Function: BooleanQuery 
 * Description: 
 *      The main control function for the boolean query.
 * Input: 
 * Output: 
 *      qd->DL
 * ========================================================================= */

void 
BooleanQuery (query_data * qd, char *Query, BooleanQueryInfo * bqi,
	      int stem_method)
{
  bool_tree_node *tree;
  DocList *Docs;
  int res = 0;

  tree = ParseBool (Query, MAXLINEBUFFERLEN, &(qd->TL), 
		    qd->sd->sdh.stemmer_num, stem_method, &res,
		    qd->sd, qd->sd->sdh.indexed,         /* [RPAP - Jan 97: Stem Index Change] */
		    &(qd->QTL));                         /* [RPAP - Feb 97: Term Frequency] */

  /* [TS:Aug/94] no longer tries to continue after parse failure */
  if (res != 0)			/* failed the parse */
    {
      qd->DL = NULL;		/* have empty doc-list */
      return;
    }

  /* [RPAP - Feb 97: Term Frequency] */
  /*  ReadInTermInfo (qd); */

  FindNoneTerms (qd, tree);

  OptimiseBoolTree (tree, qd->TL, atoi (GetEnv ("optimise_type")));

  qd->hops_taken = qd->num_of_ptrs = 0;
  qd->num_of_terms = 0;

  Docs = BooleanGet (qd, tree, bqi);

/* [RPAP - Feb 97: NZDL Specifics] */
#if !defined(PARABOOLEANMATCHES) && !defined(NZDL)
  if (qd->id->ifh.InvfLevel == 3)
    {
      AdjustParaDocs (qd, Docs);
    }
#endif

  if (bqi->MaxDocsToRetrieve != -1 && Docs->num > bqi->MaxDocsToRetrieve)
    Docs->num = bqi->MaxDocsToRetrieve;

  FreeQueryDocs (qd);

  qd->DL = Docs;
}