Context Navigation

← Previous Change
Next Change →

Changeset 3791 for trunk/indexers/mg

Timestamp:

2003-03-05T13:45:43+13:00 (21 years ago)

Author:

mdewsnip

Message:

"Some" queries are now performed as ranked queries rather than boolean OR queries. Equivalent terms for each of the query terms are also recorded.

Location:

trunk/indexers/mg/jni

Files:

3 edited

MGWrapperImpl.c (modified) (14 diffs)
MGWrapperImpl.h (modified) (2 diffs)
org_greenstone_mg_MGWrapper.h (modified) (1 diff)

Legend:

(leading space): Unmodified
+ : Added
- : Removed

trunk/indexers/mg/jni/MGWrapperImpl.c

-              r3743
+              r3791
 jmethodID MID_addDoc = NULL; /* MGQueryResult.addDoc() */
 jmethodID MID_addTerm = NULL; /* MGQueryResult.addTerm() */
+jmethodID MID_addEquivTerm = NULL; /* MGQueryResult.addEquivTerm() */
 jmethodID MID_setTotalDocs = NULL; /* MGQueryResult.setTotalDocs() */
 jmethodID MID_clearResult = NULL; /* MGQueryResult.clear() */
 …
   assert(MID_addDoc != NULL);
   /* addTerm(String term, String tag, int stem, long match, long freq, String[] equivs) */
   MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;Ljava/lang/String;IJJ[Ljava/lang/String;)V");
+  /* addTerm(String term, int stem) */
+  MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;I)V");
   assert(MID_addTerm != NULL);
+  /* addEquivTerm(String term, String equivTerm, long match, long freq) */
+  MID_addEquivTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addEquivTerm", "(Ljava/lang/String;Ljava/lang/String;JJ)V");
+  assert(MID_addEquivTerm != NULL);
   /* setTotalDocs(long) */
 …
   data->queryInfo->maxDocs = 50;
   data->queryInfo->needTermFreqs = 1;
-  data->queryInfo->sortByRank = 1;
   /* Save the object on the Java side */
 …
   /* Load the appropriate index for satisfying this request */
+  printf("Document retrieval, index path: %s\n", index_path);
   qd = loadIndexData((char*) base_dir, (char*) index_path, (char*) text_path);
 …
   Fread (c_buffer, 1, len, qd->td->TextFile);
   /* Decode (?) the document text into another buffer, and terminate it */
+  /* Decompress the document text into another buffer, and terminate it */
   DecodeText (qd->cd, c_buffer, len, uc_buffer, &ULen);
   uc_buffer[ULen] = '\0';
 …
    do a query
  *******************************************/
-/* Comparison function for sorting documents by their weight (decreasing order) */
-int
-document_weight_comp(const void *A, const void *B)
+{
-  const DocEntry *a = A;
-  const DocEntry *b = B;
-  /* Compare on weight, highest wins */
-  if (a->Weight < b->Weight)
-    return  1;
-  if (a->Weight > b->Weight)
-    return -1;
-  return 0;
+}
 /* do the actual query - the results are written to query_result held on the Java side */
 …
   jobject result_ptr;
   char* query;
-  BooleanQueryInfo bqi;
   int i, j;
 …
+  }
   /* Boolean OR ("some") queries: must manually insert OR ("|") tokens */
+  /* "Some" queries are done as ranked queries */
   if (data->defaultBoolCombine == 0) {
+    int in_space = 0;
+    for (i = 0; i < strlen(query); i++) {
+      if (in_space) {
+    if (query[i] == '|') /* OR character already inserted, so remove others */
+      query[i] = ' ';
+    else if (!isspace(query[i])) /* moving out of a space region */
+      in_space = 0;
+      }
+      else if (!in_space && isspace(query[i])) { /* moving into a space region */
+    in_space = 1;
+    query[i] = '|'; /* insert an OR character */
+      }
+    }
+    printf("Boolean OR query string: %s\n", query);
+  }
+  /* If the documents need to be sorted by rank, all of them must be retrieved */
+  bqi.MaxDocsToRetrieve = ((data->queryInfo->sortByRank) ? -1 : data->queryInfo->maxDocs);
+  /* Perform query */
+  /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */
+  BooleanQuery(qd, (char*) query, &bqi, data->defaultStemMethod);
+    RankedQueryInfo rqi;
+    rqi.QueryFreqs = 1;  /* Use the frequency of each query term in the query - OK? */
+    rqi.Exact = 1;  /* Perform exact ranking */
+    rqi.MaxDocsToRetrieve = data->queryInfo->maxDocs;  /* Get only the desired number */
+    rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve;  /* OK? */
+    rqi.Sort = 1;  /* Sort the query terms by frequency before ranking */
+    rqi.AccumMethod = 'L';  /* Use a list when accumulating (has bugs though...) */
+    /* rqi.MaxAccums = -1;  /* Use as many accumulators as necessary - CRASHES with list */
+    rqi.MaxAccums = 100000;
+    rqi.MaxTerms = -1;  /* Use all the query terms */
+    /* rqi.StopAtMaxAccum = 0;  /* Don't care (using as many accumulators as necessary) */
+    rqi.StopAtMaxAccum = 1;
+    rqi.HashTblSize = 1000;  /* Don't care (not using a hash table) */
+    rqi.skip_dump = NULL;  /* Don't dump skip information */
+    /* RankedQuery() reads 'casefold' and 'stem' parameters from the environment */
+    SetEnv("casefold", ((data->defaultStemMethod & 1) ? "on" : "off"), NULL);
+    SetEnv("stem", ((data->defaultStemMethod & 2) ? "on" : "off"), NULL);
+    RankedQuery(qd, query, &rqi);
+  }
+  /* "All" queries are done as boolean queries */
+  else {
+    BooleanQueryInfo bqi;
+    bqi.MaxDocsToRetrieve = data->queryInfo->maxDocs;
+    /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */
+    BooleanQuery(qd, query, &bqi, data->defaultStemMethod);
+  }
   /* Finished with the C query string */
 …
     (*j_env)->ExceptionDescribe(j_env);
     return;
+  }
-  /* Sort the documents by rank, if desired */
-  if (data->queryInfo->sortByRank) {
-    qsort(qd->DL->DE, qd->DL->num, sizeof(DocEntry), document_weight_comp);
+  }
 …
   /* Record the term information, if desired */
   if (data->queryInfo->needTermFreqs) {
+    /* The following code is a lot more complicated than it could be, but it is necessary
+       to compensate for an oddity in MG. */
+    unsigned char** stemmed_terms = malloc(sizeof(unsigned char*) * qd->TL->num);
     printf("Number of terms: %d\n", qd->TL->num);
     printf("Number of query terms: %d\n", qd->QTL->num);
+    /* Find each query term in the term list, and grab its frequency */
+    /* Generate the stemmed form of each of the relevant terms */
+    for (i = 0; i < qd->TL->num; i++) {
+      u_char* raw_term = qd->TL->TE[i].Word;
+      unsigned int term_length = raw_term[0];
+      u_char* raw_stemmed_term = malloc(term_length + 1);
+      unsigned int stemmed_term_length;
+      /* Copy the term, and stem it */
+      for (j = 0; j <= term_length; j++)
+    raw_stemmed_term[j] = raw_term[j];
+      stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_term);
+      /* Allocate memory to store the stemmed term, and fill it */
+      stemmed_term_length = raw_stemmed_term[0];
+      stemmed_terms[i] = malloc(stemmed_term_length + 1);
+      assert(stemmed_terms[i] != NULL);
+      strncpy(stemmed_terms[i], &(raw_stemmed_term[1]), stemmed_term_length);
+      stemmed_terms[i][stemmed_term_length] = '\0';
+    }
+    /* Record every query term, along with their equivalent terms */
     for (i = 0; i < qd->QTL->num; i++) {
+      unsigned int query_term_length = (unsigned int) qd->QTL->QTE[i].Term[0];
+      u_char* raw_query_term = qd->QTL->QTE[i].Term;
+      unsigned int query_term_length = raw_query_term[0];
       unsigned char* query_term;
       jstring j_query_term;
+      jint stem = qd->QTL->QTE[i].stem_method;
+      jlong match;
+      jlong freq;
+      /* Allocate memory to store this query term, and fill it */
+      query_term = (unsigned char*) malloc(query_term_length + 1);
+      u_char* raw_stemmed_query_term = malloc(query_term_length + 1);
+      unsigned int stemmed_query_term_length;
+      unsigned char* stemmed_query_term;
+      /* Allocate memory to store the query term, and fill it */
+      query_term = malloc(query_term_length + 1);
       assert(query_term != NULL);
       strncpy(query_term, &(qd->QTL->QTE[i].Term[1]), query_term_length);
+      strncpy(query_term, &(raw_query_term[1]), query_term_length);
       query_term[query_term_length] = '\0';
-      printf("Query term: %s\n", query_term);
       /* Allocate a new jstring for the query term */
 …
       assert(j_query_term != NULL);
+      /* Find the matching term in the term list */
+      /* Call the addTerm function (Java side) to record the query term */
+      (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm,
+                   j_query_term, (jint) data->defaultStemMethod);
+      exc = (*j_env)->ExceptionOccurred(j_env);
+      if (exc) {
+    (*j_env)->ExceptionDescribe(j_env);
+    return;
+      }
+      /* Copy the query term, and stem it */
+      for (j = 0; j <= query_term_length; j++)
+    raw_stemmed_query_term[j] = raw_query_term[j];
+      stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_query_term);
+      /* Allocate memory to store the stemmed query term, and fill it */
+      stemmed_query_term_length = raw_stemmed_query_term[0];
+      stemmed_query_term = malloc(stemmed_query_term_length + 1);
+      assert(stemmed_query_term != NULL);
+      strncpy(stemmed_query_term, &(raw_stemmed_query_term[1]), stemmed_query_term_length);
+      stemmed_query_term[stemmed_query_term_length] = '\0';
+      /* Find all the terms equivalent to the query term */
       for (j = 0; j < qd->TL->num; j++) {
+    unsigned int term_length = (unsigned int) qd->TL->TE[j].Word[0];
+    /* Stemming and case-folding mean both comparisons are necessary */
+    if ((strncmp(query_term, &(qd->TL->TE[j].Word[1]), term_length) == 0) &&
+        (strncmp(query_term, &(qd->TL->TE[j].Word[1]), query_term_length) == 0)) {
+      /* Get the document count and total frequency of the term */
+      match = qd->TL->TE[j].WE.doc_count;
+      freq = qd->TL->TE[j].WE.count;
+      /* Call the addTerm function (Java side) to record term information */
+      (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm,
+                   j_query_term, NULL, stem, match, freq, NULL);
+    /* Check if the stemmed query term matches the stemmed term */
+    if (strcmp(stemmed_query_term, stemmed_terms[j]) == 0) {
+      u_char* raw_term = qd->TL->TE[j].Word;
+      unsigned int term_length = raw_term[0];
+      unsigned char* term;
+      jstring j_term;
+      /* Allocate memory to store the query term, and fill it */
+      term = malloc(term_length + 1);
+      assert(term != NULL);
+      strncpy(term, &(raw_term[1]), term_length);
+      term[term_length] = '\0';
+      /* Allocate a new jstring for the query term */
+      j_term = (*j_env)->NewStringUTF(j_env, term);
+      assert(j_term != NULL);
+      /* Call the addEquivTerm function (Java side) to record the equivalent term */
+      (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addEquivTerm,
+                   j_query_term, j_term,
+                   (jlong) qd->TL->TE[j].WE.doc_count,
+                   (jlong) qd->TL->TE[j].WE.count);
       exc = (*j_env)->ExceptionOccurred(j_env);
       if (exc) {
 …
         return;
+      }
-      /* There can only be one match */
-      break;
+    }
+      }
-      /* Finished with this query term */
-      free(query_term);
+    }
+  }
 …
-/* Turn sorting by rank on or off */
-JNIEXPORT void JNICALL
-Java_org_greenstone_mg_MGWrapper_setSortByRank(JNIEnv *j_env, jobject j_obj,
-                                               jboolean j_on)
+{
-  MGWrapperData* data = (MGWrapperData*) (*j_env)->GetIntField(j_env, j_obj, FID_mg_data);
-  data->queryInfo->sortByRank = j_on;
+}
 /* Turn term frequency recording on or off */
 JNIEXPORT void JNICALL
 …
   /* Print the data to a character array */
   sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\norder by rank\t%d\nquery type\t%s\nmax docs\t%d\n",
+  sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\nquery type\t%s\nmax docs\t%d\n",
       (data->queryInfo->index == NULL ? "<none loaded>" : data->queryInfo->index),
       (data->defaultStemMethod & 1),
       (data->defaultStemMethod & 2),
-      (data->queryInfo->sortByRank),
       (data->defaultBoolCombine == 1 ? "all" : "some"),
       (data->queryInfo->maxDocs));

trunk/indexers/mg/jni/MGWrapperImpl.h

-              r3743
+              r3791
  - These data structures are based on the MGPP ones but are modified
    slightly to reflect the different capabilities of the MG system.
+   to reflect the different capabilities of the MG system.
  *************************************************************************/
 …
   /* Maximum number of documents to retrieve */
   unsigned long maxDocs;
-  /* Whether to sort the matching documents by weight (boolean value) */
-  int sortByRank;
   /* Whether term frequency information is desired (boolean value) */
   int needTermFreqs;

trunk/indexers/mg/jni/org_greenstone_mg_MGWrapper.h

-              r3743
+              r3791
 /*
  * Class:     org_greenstone_mg_MGWrapper
- * Method:    setSortByRank
- * Signature: (Z)V
- */
-JNIEXPORT void JNICALL Java_org_greenstone_mg_MGWrapper_setSortByRank
-  (JNIEnv *, jobject, jboolean);
-/*
- * Class:     org_greenstone_mg_MGWrapper
  * Method:    setReturnTerms
  * Signature: (Z)V

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: