Changeset 3791 for trunk/indexers/mg


Ignore:
Timestamp:
2003-03-05T13:45:43+13:00 (21 years ago)
Author:
mdewsnip
Message:

"Some" queries are now performed as ranked queries rather than boolean OR queries. Equivalent terms for each of the query terms are also recorded.

Location:
trunk/indexers/mg/jni
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/indexers/mg/jni/MGWrapperImpl.c

    r3743 r3791  
    7878jmethodID MID_addDoc = NULL; /* MGQueryResult.addDoc() */
    7979jmethodID MID_addTerm = NULL; /* MGQueryResult.addTerm() */
     80jmethodID MID_addEquivTerm = NULL; /* MGQueryResult.addEquivTerm() */
    8081jmethodID MID_setTotalDocs = NULL; /* MGQueryResult.setTotalDocs() */
    8182jmethodID MID_clearResult = NULL; /* MGQueryResult.clear() */
     
    113114  assert(MID_addDoc != NULL);
    114115
    115   /* addTerm(String term, String tag, int stem, long match, long freq, String[] equivs) */
    116   MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;Ljava/lang/String;IJJ[Ljava/lang/String;)V");
     116  /* addTerm(String term, int stem) */
     117  MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;I)V");
    117118  assert(MID_addTerm != NULL);
     119
     120  /* addEquivTerm(String term, String equivTerm, long match, long freq) */
     121  MID_addEquivTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addEquivTerm", "(Ljava/lang/String;Ljava/lang/String;JJ)V");
     122  assert(MID_addEquivTerm != NULL);
    118123
    119124  /* setTotalDocs(long) */
     
    149154  data->queryInfo->maxDocs = 50;
    150155  data->queryInfo->needTermFreqs = 1;
    151   data->queryInfo->sortByRank = 1;
    152156
    153157  /* Save the object on the Java side */
     
    291295
    292296  /* Load the appropriate index for satisfying this request */
     297  printf("Document retrieval, index path: %s\n", index_path);
    293298  qd = loadIndexData((char*) base_dir, (char*) index_path, (char*) text_path);
    294299
     
    315320  Fread (c_buffer, 1, len, qd->td->TextFile);
    316321
    317   /* Decode (?) the document text into another buffer, and terminate it */
     322  /* Decompress the document text into another buffer, and terminate it */
    318323  DecodeText (qd->cd, c_buffer, len, uc_buffer, &ULen);
    319324  uc_buffer[ULen] = '\0';
     
    335340   do a query
    336341 *******************************************/
    337 
    338 /* Comparison function for sorting documents by their weight (decreasing order) */
    339 int
    340 document_weight_comp(const void *A, const void *B)
    341 {
    342   const DocEntry *a = A;
    343   const DocEntry *b = B;
    344 
    345   /* Compare on weight, highest wins */
    346   if (a->Weight < b->Weight)
    347     return  1;
    348   if (a->Weight > b->Weight)
    349     return -1;
    350   return 0;
    351 }
    352 
    353342
    354343/* do the actual query - the results are written to query_result held on the Java side */
     
    367356  jobject result_ptr;
    368357  char* query;
    369   BooleanQueryInfo bqi;
    370358  int i, j;
    371359
     
    423411  }
    424412
    425   /* Boolean OR ("some") queries: must manually insert OR ("|") tokens */
     413  /* "Some" queries are done as ranked queries */
    426414  if (data->defaultBoolCombine == 0) {
    427     int in_space = 0;
    428     for (i = 0; i < strlen(query); i++) {
    429       if (in_space) {
    430     if (query[i] == '|') /* OR character already inserted, so remove others */
    431       query[i] = ' ';
    432     else if (!isspace(query[i])) /* moving out of a space region */
    433       in_space = 0;
    434       }
    435       else if (!in_space && isspace(query[i])) { /* moving into a space region */
    436     in_space = 1;
    437     query[i] = '|'; /* insert an OR character */
    438       }
    439     }
    440     printf("Boolean OR query string: %s\n", query);
    441   }
    442 
    443   /* If the documents need to be sorted by rank, all of them must be retrieved */
    444   bqi.MaxDocsToRetrieve = ((data->queryInfo->sortByRank) ? -1 : data->queryInfo->maxDocs);
    445 
    446   /* Perform query */
    447   /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */
    448   BooleanQuery(qd, (char*) query, &bqi, data->defaultStemMethod);
     415    RankedQueryInfo rqi;
     416    rqi.QueryFreqs = 1;  /* Use the frequency of each query term in the query - OK? */
     417    rqi.Exact = 1;  /* Perform exact ranking */
     418    rqi.MaxDocsToRetrieve = data->queryInfo->maxDocs;  /* Get only the desired number */
     419    rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve;  /* OK? */
     420    rqi.Sort = 1;  /* Sort the query terms by frequency before ranking */
     421    rqi.AccumMethod = 'L';  /* Use a list when accumulating (has bugs though...) */
     422    /* rqi.MaxAccums = -1;  /* Use as many accumulators as necessary - CRASHES with list */
     423    rqi.MaxAccums = 100000;
     424    rqi.MaxTerms = -1;  /* Use all the query terms */
     425    /* rqi.StopAtMaxAccum = 0;  /* Don't care (using as many accumulators as necessary) */
     426    rqi.StopAtMaxAccum = 1;
     427    rqi.HashTblSize = 1000;  /* Don't care (not using a hash table) */
     428    rqi.skip_dump = NULL;  /* Don't dump skip information */
     429
     430    /* RankedQuery() reads 'casefold' and 'stem' parameters from the environment */
     431    SetEnv("casefold", ((data->defaultStemMethod & 1) ? "on" : "off"), NULL);
     432    SetEnv("stem", ((data->defaultStemMethod & 2) ? "on" : "off"), NULL);
     433
     434    RankedQuery(qd, query, &rqi);
     435  }
     436  /* "All" queries are done as boolean queries */
     437  else {
     438    BooleanQueryInfo bqi;
     439    bqi.MaxDocsToRetrieve = data->queryInfo->maxDocs;
     440
     441    /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */
     442    BooleanQuery(qd, query, &bqi, data->defaultStemMethod);
     443  }
    449444
    450445  /* Finished with the C query string */
     
    462457    (*j_env)->ExceptionDescribe(j_env);
    463458    return;
    464   }
    465 
    466   /* Sort the documents by rank, if desired */
    467   if (data->queryInfo->sortByRank) {
    468     qsort(qd->DL->DE, qd->DL->num, sizeof(DocEntry), document_weight_comp);
    469459  }
    470460
     
    487477  /* Record the term information, if desired */
    488478  if (data->queryInfo->needTermFreqs) {
     479    /* The following code is a lot more complicated than it could be, but it is necessary
     480       to compensate for an oddity in MG. */
     481    unsigned char** stemmed_terms = malloc(sizeof(unsigned char*) * qd->TL->num);
     482
    489483    printf("Number of terms: %d\n", qd->TL->num);
    490484    printf("Number of query terms: %d\n", qd->QTL->num);
    491485
    492     /* Find each query term in the term list, and grab its frequency */
     486    /* Generate the stemmed form of each of the relevant terms */
     487    for (i = 0; i < qd->TL->num; i++) {
     488      u_char* raw_term = qd->TL->TE[i].Word;
     489      unsigned int term_length = raw_term[0];
     490
     491      u_char* raw_stemmed_term = malloc(term_length + 1);
     492      unsigned int stemmed_term_length;
     493
     494      /* Copy the term, and stem it */
     495      for (j = 0; j <= term_length; j++)
     496    raw_stemmed_term[j] = raw_term[j];
     497      stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_term);
     498
     499      /* Allocate memory to store the stemmed term, and fill it */
     500      stemmed_term_length = raw_stemmed_term[0];
     501      stemmed_terms[i] = malloc(stemmed_term_length + 1);
     502      assert(stemmed_terms[i] != NULL);
     503      strncpy(stemmed_terms[i], &(raw_stemmed_term[1]), stemmed_term_length);
     504      stemmed_terms[i][stemmed_term_length] = '\0';
     505    }
     506
     507    /* Record every query term, along with their equivalent terms */
    493508    for (i = 0; i < qd->QTL->num; i++) {
    494       unsigned int query_term_length = (unsigned int) qd->QTL->QTE[i].Term[0];
     509      u_char* raw_query_term = qd->QTL->QTE[i].Term;
     510      unsigned int query_term_length = raw_query_term[0];
    495511      unsigned char* query_term;
    496512      jstring j_query_term;
    497       jint stem = qd->QTL->QTE[i].stem_method;
    498       jlong match;
    499       jlong freq;
    500 
    501       /* Allocate memory to store this query term, and fill it */
    502       query_term = (unsigned char*) malloc(query_term_length + 1);
     513
     514      u_char* raw_stemmed_query_term = malloc(query_term_length + 1);
     515      unsigned int stemmed_query_term_length;
     516      unsigned char* stemmed_query_term;
     517
     518      /* Allocate memory to store the query term, and fill it */
     519      query_term = malloc(query_term_length + 1);
    503520      assert(query_term != NULL);
    504       strncpy(query_term, &(qd->QTL->QTE[i].Term[1]), query_term_length);
     521      strncpy(query_term, &(raw_query_term[1]), query_term_length);
    505522      query_term[query_term_length] = '\0';
    506       printf("Query term: %s\n", query_term);
    507523
    508524      /* Allocate a new jstring for the query term */
     
    510526      assert(j_query_term != NULL);
    511527
    512       /* Find the matching term in the term list */
     528      /* Call the addTerm function (Java side) to record the query term */
     529      (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm,
     530                   j_query_term, (jint) data->defaultStemMethod);
     531      exc = (*j_env)->ExceptionOccurred(j_env);
     532      if (exc) {
     533    (*j_env)->ExceptionDescribe(j_env);
     534    return;
     535      }
     536
     537      /* Copy the query term, and stem it */
     538      for (j = 0; j <= query_term_length; j++)
     539    raw_stemmed_query_term[j] = raw_query_term[j];
     540      stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_query_term);
     541
     542      /* Allocate memory to store the stemmed query term, and fill it */
     543      stemmed_query_term_length = raw_stemmed_query_term[0];
     544      stemmed_query_term = malloc(stemmed_query_term_length + 1);
     545      assert(stemmed_query_term != NULL);
     546      strncpy(stemmed_query_term, &(raw_stemmed_query_term[1]), stemmed_query_term_length);
     547      stemmed_query_term[stemmed_query_term_length] = '\0';
     548
     549      /* Find all the terms equivalent to the query term */
    513550      for (j = 0; j < qd->TL->num; j++) {
    514     unsigned int term_length = (unsigned int) qd->TL->TE[j].Word[0];
    515 
    516     /* Stemming and case-folding mean both comparisons are necessary */
    517     if ((strncmp(query_term, &(qd->TL->TE[j].Word[1]), term_length) == 0) &&
    518         (strncmp(query_term, &(qd->TL->TE[j].Word[1]), query_term_length) == 0)) {
    519       /* Get the document count and total frequency of the term */
    520       match = qd->TL->TE[j].WE.doc_count;
    521       freq = qd->TL->TE[j].WE.count;
    522 
    523       /* Call the addTerm function (Java side) to record term information */
    524       (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm,
    525                    j_query_term, NULL, stem, match, freq, NULL);
     551    /* Check if the stemmed query term matches the stemmed term */
     552    if (strcmp(stemmed_query_term, stemmed_terms[j]) == 0) {
     553      u_char* raw_term = qd->TL->TE[j].Word;
     554      unsigned int term_length = raw_term[0];
     555      unsigned char* term;
     556      jstring j_term;
     557
     558      /* Allocate memory to store the equivalent term, and fill it */
     559      term = malloc(term_length + 1);
     560      assert(term != NULL);
     561      strncpy(term, &(raw_term[1]), term_length);
     562      term[term_length] = '\0';
     563
     564      /* Allocate a new jstring for the equivalent term */
     565      j_term = (*j_env)->NewStringUTF(j_env, term);
     566      assert(j_term != NULL);
     567
     568      /* Call the addEquivTerm function (Java side) to record the equivalent term */
     569      (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addEquivTerm,
     570                   j_query_term, j_term,
     571                   (jlong) qd->TL->TE[j].WE.doc_count,
     572                   (jlong) qd->TL->TE[j].WE.count);
    526573      exc = (*j_env)->ExceptionOccurred(j_env);
    527574      if (exc) {
     
    529576        return;
    530577      }
    531 
    532       /* There can only be one match */
    533       break;
    534578    }
    535579      }
    536 
    537       /* Finished with this query term */
    538       free(query_term);
    539580    }
    540581  }
     
    586627
    587628
    588 /* Turn sorting by rank on or off */
    589 JNIEXPORT void JNICALL
    590 Java_org_greenstone_mg_MGWrapper_setSortByRank(JNIEnv *j_env, jobject j_obj,
    591                                                jboolean j_on)
    592 {
    593   MGWrapperData* data = (MGWrapperData*) (*j_env)->GetIntField(j_env, j_obj, FID_mg_data); 
    594   data->queryInfo->sortByRank = j_on;
    595 }
    596 
    597 
    598629/* Turn term frequency recording on or off */
    599630JNIEXPORT void JNICALL
     
    650681
    651682  /* Print the data to a character array */
    652   sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\norder by rank\t%d\nquery type\t%s\nmax docs\t%d\n",
     683  sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\nquery type\t%s\nmax docs\t%d\n",
    653684      (data->queryInfo->index == NULL ? "<none loaded>" : data->queryInfo->index),
    654685      (data->defaultStemMethod & 1),
    655686      (data->defaultStemMethod & 2),
    656       (data->queryInfo->sortByRank),
    657687      (data->defaultBoolCombine == 1 ? "all" : "some"),
    658688      (data->queryInfo->maxDocs));
  • trunk/indexers/mg/jni/MGWrapperImpl.h

    r3743 r3791  
    2626
    2727 - These data structures are based on the MGPP ones but are modified
    28    slightly to reflect the different capabilities of the MG system.
     28   to reflect the different capabilities of the MG system.
    2929
    3030 *************************************************************************/
     
    3737  /* Maximum number of documents to retrieve */
    3838  unsigned long maxDocs;
    39   /* Whether to sort the matching documents by weight (boolean value) */
    40   int sortByRank;
    41 
    4239  /* Whether term frequency information is desired (boolean value) */
    4340  int needTermFreqs;
  • trunk/indexers/mg/jni/org_greenstone_mg_MGWrapper.h

    r3743 r3791  
    6666/*
    6767 * Class:     org_greenstone_mg_MGWrapper
    68  * Method:    setSortByRank
    69  * Signature: (Z)V
    70  */
    71 JNIEXPORT void JNICALL Java_org_greenstone_mg_MGWrapper_setSortByRank
    72   (JNIEnv *, jobject, jboolean);
    73 
    74 /*
    75  * Class:     org_greenstone_mg_MGWrapper
    7668 * Method:    setReturnTerms
    7769 * Signature: (Z)V
Note: See TracChangeset for help on using the changeset viewer.