Ignore:
Timestamp:
2003-03-05T13:45:43+13:00 (21 years ago)
Author:
mdewsnip
Message:

"Some" queries are now performed as ranked queries rather than boolean OR queries. Equivalent terms for each of the query terms are also recorded.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/packages/mg/jni/MGWrapperImpl.c

    r3743 r3791  
    7878jmethodID MID_addDoc = NULL; /* MGQueryResult.addDoc() */
    7979jmethodID MID_addTerm = NULL; /* MGQueryResult.addTerm() */
     80jmethodID MID_addEquivTerm = NULL; /* MGQueryResult.addEquivTerm() */
    8081jmethodID MID_setTotalDocs = NULL; /* MGQueryResult.setTotalDocs() */
    8182jmethodID MID_clearResult = NULL; /* MGQueryResult.clear() */
     
    113114  assert(MID_addDoc != NULL);
    114115
    115   /* addTerm(String term, String tag, int stem, long match, long freq, String[] equivs) */
    116   MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;Ljava/lang/String;IJJ[Ljava/lang/String;)V");
     116  /* addTerm(String term, int stem) */
     117  MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;I)V");
    117118  assert(MID_addTerm != NULL);
     119
     120  /* addEquivTerm(String term, String equivTerm, long match, long freq) */
     121  MID_addEquivTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addEquivTerm", "(Ljava/lang/String;Ljava/lang/String;JJ)V");
     122  assert(MID_addEquivTerm != NULL);
    118123
    119124  /* setTotalDocs(long) */
     
    149154  data->queryInfo->maxDocs = 50;
    150155  data->queryInfo->needTermFreqs = 1;
    151   data->queryInfo->sortByRank = 1;
    152156
    153157  /* Save the object on the Java side */
     
    291295
    292296  /* Load the appropriate index for satisfying this request */
     297  printf("Document retrieval, index path: %s\n", index_path);
    293298  qd = loadIndexData((char*) base_dir, (char*) index_path, (char*) text_path);
    294299
     
    315320  Fread (c_buffer, 1, len, qd->td->TextFile);
    316321
    317   /* Decode (?) the document text into another buffer, and terminate it */
     322  /* Decompress the document text into another buffer, and terminate it */
    318323  DecodeText (qd->cd, c_buffer, len, uc_buffer, &ULen);
    319324  uc_buffer[ULen] = '\0';
     
    335340   do a query
    336341 *******************************************/
    337 
    338 /* Comparison function for sorting documents by their weight (decreasing order) */
    339 int
    340 document_weight_comp(const void *A, const void *B)
    341 {
    342   const DocEntry *a = A;
    343   const DocEntry *b = B;
    344 
    345   /* Compare on weight, highest wins */
    346   if (a->Weight < b->Weight)
    347     return  1;
    348   if (a->Weight > b->Weight)
    349     return -1;
    350   return 0;
    351 }
    352 
    353342
    354343/* do the actual query - the results are written to query_result held on the Java side */
     
    367356  jobject result_ptr;
    368357  char* query;
    369   BooleanQueryInfo bqi;
    370358  int i, j;
    371359
     
    423411  }
    424412
    425   /* Boolean OR ("some") queries: must manually insert OR ("|") tokens */
     413  /* "Some" queries are done as ranked queries */
    426414  if (data->defaultBoolCombine == 0) {
    427     int in_space = 0;
    428     for (i = 0; i < strlen(query); i++) {
    429       if (in_space) {
    430     if (query[i] == '|') /* OR character already inserted, so remove others */
    431       query[i] = ' ';
    432     else if (!isspace(query[i])) /* moving out of a space region */
    433       in_space = 0;
    434       }
    435       else if (!in_space && isspace(query[i])) { /* moving into a space region */
    436     in_space = 1;
    437     query[i] = '|'; /* insert an OR character */
    438       }
    439     }
    440     printf("Boolean OR query string: %s\n", query);
    441   }
    442 
    443   /* If the documents need to be sorted by rank, all of them must be retrieved */
    444   bqi.MaxDocsToRetrieve = ((data->queryInfo->sortByRank) ? -1 : data->queryInfo->maxDocs);
    445 
    446   /* Perform query */
    447   /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */
    448   BooleanQuery(qd, (char*) query, &bqi, data->defaultStemMethod);
     415    RankedQueryInfo rqi;
     416    rqi.QueryFreqs = 1;  /* Use the frequency of each query term in the query - OK? */
     417    rqi.Exact = 1;  /* Perform exact ranking */
     418    rqi.MaxDocsToRetrieve = data->queryInfo->maxDocs;  /* Get only the desired number */
     419    rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve;  /* OK? */
     420    rqi.Sort = 1;  /* Sort the query terms by frequency before ranking */
     421    rqi.AccumMethod = 'L';  /* Use a list when accumulating (has bugs though...) */
     422    /* rqi.MaxAccums = -1;  /* Use as many accumulators as necessary - CRASHES with list */
     423    rqi.MaxAccums = 100000;
     424    rqi.MaxTerms = -1;  /* Use all the query terms */
     425    /* rqi.StopAtMaxAccum = 0;  /* Don't care (using as many accumulators as necessary) */
     426    rqi.StopAtMaxAccum = 1;
     427    rqi.HashTblSize = 1000;  /* Don't care (not using a hash table) */
     428    rqi.skip_dump = NULL;  /* Don't dump skip information */
     429
     430    /* RankedQuery() reads 'casefold' and 'stem' parameters from the environment */
     431    SetEnv("casefold", ((data->defaultStemMethod & 1) ? "on" : "off"), NULL);
     432    SetEnv("stem", ((data->defaultStemMethod & 2) ? "on" : "off"), NULL);
     433
     434    RankedQuery(qd, query, &rqi);
     435  }
     436  /* "All" queries are done as boolean queries */
     437  else {
     438    BooleanQueryInfo bqi;
     439    bqi.MaxDocsToRetrieve = data->queryInfo->maxDocs;
     440
     441    /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */
     442    BooleanQuery(qd, query, &bqi, data->defaultStemMethod);
     443  }
    449444
    450445  /* Finished with the C query string */
     
    462457    (*j_env)->ExceptionDescribe(j_env);
    463458    return;
    464   }
    465 
    466   /* Sort the documents by rank, if desired */
    467   if (data->queryInfo->sortByRank) {
    468     qsort(qd->DL->DE, qd->DL->num, sizeof(DocEntry), document_weight_comp);
    469459  }
    470460
     
    487477  /* Record the term information, if desired */
    488478  if (data->queryInfo->needTermFreqs) {
     479    /* The following code is a lot more complicated than it could be, but it is necessary
     480       to compensate for an oddity in MG. */
     481    unsigned char** stemmed_terms = malloc(sizeof(unsigned char*) * qd->TL->num);
     482
    489483    printf("Number of terms: %d\n", qd->TL->num);
    490484    printf("Number of query terms: %d\n", qd->QTL->num);
    491485
    492     /* Find each query term in the term list, and grab its frequency */
     486    /* Generate the stemmed form of each of the relevant terms */
     487    for (i = 0; i < qd->TL->num; i++) {
     488      u_char* raw_term = qd->TL->TE[i].Word;
     489      unsigned int term_length = raw_term[0];
     490
     491      u_char* raw_stemmed_term = malloc(term_length + 1);
     492      unsigned int stemmed_term_length;
     493
     494      /* Copy the term, and stem it */
     495      for (j = 0; j <= term_length; j++)
     496    raw_stemmed_term[j] = raw_term[j];
     497      stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_term);
     498
     499      /* Allocate memory to store the stemmed term, and fill it */
     500      stemmed_term_length = raw_stemmed_term[0];
     501      stemmed_terms[i] = malloc(stemmed_term_length + 1);
     502      assert(stemmed_terms[i] != NULL);
     503      strncpy(stemmed_terms[i], &(raw_stemmed_term[1]), stemmed_term_length);
     504      stemmed_terms[i][stemmed_term_length] = '\0';
     505    }
     506
     507    /* Record every query term, along with their equivalent terms */
    493508    for (i = 0; i < qd->QTL->num; i++) {
    494       unsigned int query_term_length = (unsigned int) qd->QTL->QTE[i].Term[0];
     509      u_char* raw_query_term = qd->QTL->QTE[i].Term;
     510      unsigned int query_term_length = raw_query_term[0];
    495511      unsigned char* query_term;
    496512      jstring j_query_term;
    497       jint stem = qd->QTL->QTE[i].stem_method;
    498       jlong match;
    499       jlong freq;
    500 
    501       /* Allocate memory to store this query term, and fill it */
    502       query_term = (unsigned char*) malloc(query_term_length + 1);
     513
     514      u_char* raw_stemmed_query_term = malloc(query_term_length + 1);
     515      unsigned int stemmed_query_term_length;
     516      unsigned char* stemmed_query_term;
     517
     518      /* Allocate memory to store the query term, and fill it */
     519      query_term = malloc(query_term_length + 1);
    503520      assert(query_term != NULL);
    504       strncpy(query_term, &(qd->QTL->QTE[i].Term[1]), query_term_length);
     521      strncpy(query_term, &(raw_query_term[1]), query_term_length);
    505522      query_term[query_term_length] = '\0';
    506       printf("Query term: %s\n", query_term);
    507523
    508524      /* Allocate a new jstring for the query term */
     
    510526      assert(j_query_term != NULL);
    511527
    512       /* Find the matching term in the term list */
     528      /* Call the addTerm function (Java side) to record the query term */
     529      (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm,
     530                   j_query_term, (jint) data->defaultStemMethod);
     531      exc = (*j_env)->ExceptionOccurred(j_env);
     532      if (exc) {
     533    (*j_env)->ExceptionDescribe(j_env);
     534    return;
     535      }
     536
     537      /* Copy the query term, and stem it */
     538      for (j = 0; j <= query_term_length; j++)
     539    raw_stemmed_query_term[j] = raw_query_term[j];
     540      stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_query_term);
     541
     542      /* Allocate memory to store the stemmed query term, and fill it */
     543      stemmed_query_term_length = raw_stemmed_query_term[0];
     544      stemmed_query_term = malloc(stemmed_query_term_length + 1);
     545      assert(stemmed_query_term != NULL);
     546      strncpy(stemmed_query_term, &(raw_stemmed_query_term[1]), stemmed_query_term_length);
     547      stemmed_query_term[stemmed_query_term_length] = '\0';
     548
     549      /* Find all the terms equivalent to the query term */
    513550      for (j = 0; j < qd->TL->num; j++) {
    514     unsigned int term_length = (unsigned int) qd->TL->TE[j].Word[0];
    515 
    516     /* Stemming and case-folding mean both comparisons are necessary */
    517     if ((strncmp(query_term, &(qd->TL->TE[j].Word[1]), term_length) == 0) &&
    518         (strncmp(query_term, &(qd->TL->TE[j].Word[1]), query_term_length) == 0)) {
    519       /* Get the document count and total frequency of the term */
    520       match = qd->TL->TE[j].WE.doc_count;
    521       freq = qd->TL->TE[j].WE.count;
    522 
    523       /* Call the addTerm function (Java side) to record term information */
    524       (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm,
    525                    j_query_term, NULL, stem, match, freq, NULL);
     551    /* Check if the stemmed query term matches the stemmed term */
     552    if (strcmp(stemmed_query_term, stemmed_terms[j]) == 0) {
     553      u_char* raw_term = qd->TL->TE[j].Word;
     554      unsigned int term_length = raw_term[0];
     555      unsigned char* term;
     556      jstring j_term;
     557
     558      /* Allocate memory to store the equivalent term, and fill it */
     559      term = malloc(term_length + 1);
     560      assert(term != NULL);
     561      strncpy(term, &(raw_term[1]), term_length);
     562      term[term_length] = '\0';
     563
     564      /* Allocate a new jstring for the equivalent term */
     565      j_term = (*j_env)->NewStringUTF(j_env, term);
     566      assert(j_term != NULL);
     567
     568      /* Call the addEquivTerm function (Java side) to record the equivalent term */
     569      (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addEquivTerm,
     570                   j_query_term, j_term,
     571                   (jlong) qd->TL->TE[j].WE.doc_count,
     572                   (jlong) qd->TL->TE[j].WE.count);
    526573      exc = (*j_env)->ExceptionOccurred(j_env);
    527574      if (exc) {
     
    529576        return;
    530577      }
    531 
    532       /* There can only be one match */
    533       break;
    534578    }
    535579      }
    536 
    537       /* Finished with this query term */
    538       free(query_term);
    539580    }
    540581  }
     
    586627
    587628
    588 /* Turn sorting by rank on or off */
    589 JNIEXPORT void JNICALL
    590 Java_org_greenstone_mg_MGWrapper_setSortByRank(JNIEnv *j_env, jobject j_obj,
    591                                                jboolean j_on)
    592 {
    593   MGWrapperData* data = (MGWrapperData*) (*j_env)->GetIntField(j_env, j_obj, FID_mg_data); 
    594   data->queryInfo->sortByRank = j_on;
    595 }
    596 
    597 
    598629/* Turn term frequency recording on or off */
    599630JNIEXPORT void JNICALL
     
    650681
    651682  /* Print the data to a character array */
    652   sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\norder by rank\t%d\nquery type\t%s\nmax docs\t%d\n",
     683  sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\nquery type\t%s\nmax docs\t%d\n",
    653684      (data->queryInfo->index == NULL ? "<none loaded>" : data->queryInfo->index),
    654685      (data->defaultStemMethod & 1),
    655686      (data->defaultStemMethod & 2),
    656       (data->queryInfo->sortByRank),
    657687      (data->defaultBoolCombine == 1 ? "all" : "some"),
    658688      (data->queryInfo->maxDocs));
Note: See TracChangeset for help on using the changeset viewer.