Changeset 3791 for trunk/indexers/mg
- Timestamp: 2003-03-05T13:45:43+13:00 (21 years ago)
- Location: trunk/indexers/mg/jni
- Files: 3 edited
Legend:
- Unmodified
- Added
- Removed
trunk/indexers/mg/jni/MGWrapperImpl.c
r3743 r3791 78 78 jmethodID MID_addDoc = NULL; /* MGQueryResult.addDoc() */ 79 79 jmethodID MID_addTerm = NULL; /* MGQueryResult.addTerm() */ 80 jmethodID MID_addEquivTerm = NULL; /* MGQueryResult.addEquivTerm() */ 80 81 jmethodID MID_setTotalDocs = NULL; /* MGQueryResult.setTotalDocs() */ 81 82 jmethodID MID_clearResult = NULL; /* MGQueryResult.clear() */ … … 113 114 assert(MID_addDoc != NULL); 114 115 115 /* addTerm(String term, String tag, int stem, long match, long freq, String[] equivs) */116 MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String; Ljava/lang/String;IJJ[Ljava/lang/String;)V");116 /* addTerm(String term, int stem) */ 117 MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;I)V"); 117 118 assert(MID_addTerm != NULL); 119 120 /* addEquivTerm(String term, String equivTerm, long match, long freq) */ 121 MID_addEquivTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addEquivTerm", "(Ljava/lang/String;Ljava/lang/String;JJ)V"); 122 assert(MID_addEquivTerm != NULL); 118 123 119 124 /* setTotalDocs(long) */ … … 149 154 data->queryInfo->maxDocs = 50; 150 155 data->queryInfo->needTermFreqs = 1; 151 data->queryInfo->sortByRank = 1;152 156 153 157 /* Save the object on the Java side */ … … 291 295 292 296 /* Load the appropriate index for satisfying this request */ 297 printf("Document retrieval, index path: %s\n", index_path); 293 298 qd = loadIndexData((char*) base_dir, (char*) index_path, (char*) text_path); 294 299 … … 315 320 Fread (c_buffer, 1, len, qd->td->TextFile); 316 321 317 /* Deco de (?)the document text into another buffer, and terminate it */322 /* Decompress the document text into another buffer, and terminate it */ 318 323 DecodeText (qd->cd, c_buffer, len, uc_buffer, &ULen); 319 324 uc_buffer[ULen] = '\0'; … … 335 340 do a query 336 341 *******************************************/ 337 338 /* Comparison function for sorting documents by their weight (decreasing 
order) */339 int340 document_weight_comp(const void *A, const void *B)341 {342 const DocEntry *a = A;343 const DocEntry *b = B;344 345 /* Compare on weight, highest wins */346 if (a->Weight < b->Weight)347 return 1;348 if (a->Weight > b->Weight)349 return -1;350 return 0;351 }352 353 342 354 343 /* do the actual query - the results are written to query_result held on the Java side */ … … 367 356 jobject result_ptr; 368 357 char* query; 369 BooleanQueryInfo bqi;370 358 int i, j; 371 359 … … 423 411 } 424 412 425 /* Boolean OR ("some") queries: must manually insert OR ("|") tokens */413 /* "Some" queries are done as ranked queries */ 426 414 if (data->defaultBoolCombine == 0) { 427 int in_space = 0; 428 for (i = 0; i < strlen(query); i++) { 429 if (in_space) { 430 if (query[i] == '|') /* OR character already inserted, so remove others */ 431 query[i] = ' '; 432 else if (!isspace(query[i])) /* moving out of a space region */ 433 in_space = 0; 434 } 435 else if (!in_space && isspace(query[i])) { /* moving into a space region */ 436 in_space = 1; 437 query[i] = '|'; /* insert an OR character */ 438 } 439 } 440 printf("Boolean OR query string: %s\n", query); 441 } 442 443 /* If the documents need to be sorted by rank, all of them must be retrieved */ 444 bqi.MaxDocsToRetrieve = ((data->queryInfo->sortByRank) ? -1 : data->queryInfo->maxDocs); 445 446 /* Perform query */ 447 /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */ 448 BooleanQuery(qd, (char*) query, &bqi, data->defaultStemMethod); 415 RankedQueryInfo rqi; 416 rqi.QueryFreqs = 1; /* Use the frequency of each query term in the query - OK? */ 417 rqi.Exact = 1; /* Perform exact ranking */ 418 rqi.MaxDocsToRetrieve = data->queryInfo->maxDocs; /* Get only the desired number */ 419 rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve; /* OK? 
*/ 420 rqi.Sort = 1; /* Sort the query terms by frequency before ranking */ 421 rqi.AccumMethod = 'L'; /* Use a list when accumulating (has bugs though...) */ 422 /* rqi.MaxAccums = -1; /* Use as many accumulators as necessary - CRASHES with list */ 423 rqi.MaxAccums = 100000; 424 rqi.MaxTerms = -1; /* Use all the query terms */ 425 /* rqi.StopAtMaxAccum = 0; /* Don't care (using as many accumulators as necessary) */ 426 rqi.StopAtMaxAccum = 1; 427 rqi.HashTblSize = 1000; /* Don't care (not using a hash table) */ 428 rqi.skip_dump = NULL; /* Don't dump skip information */ 429 430 /* RankedQuery() reads 'casefold' and 'stem' parameters from the environment */ 431 SetEnv("casefold", ((data->defaultStemMethod & 1) ? "on" : "off"), NULL); 432 SetEnv("stem", ((data->defaultStemMethod & 2) ? "on" : "off"), NULL); 433 434 RankedQuery(qd, query, &rqi); 435 } 436 /* "All" queries are done as boolean queries */ 437 else { 438 BooleanQueryInfo bqi; 439 bqi.MaxDocsToRetrieve = data->queryInfo->maxDocs; 440 441 /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */ 442 BooleanQuery(qd, query, &bqi, data->defaultStemMethod); 443 } 449 444 450 445 /* Finished with the C query string */ … … 462 457 (*j_env)->ExceptionDescribe(j_env); 463 458 return; 464 }465 466 /* Sort the documents by rank, if desired */467 if (data->queryInfo->sortByRank) {468 qsort(qd->DL->DE, qd->DL->num, sizeof(DocEntry), document_weight_comp);469 459 } 470 460 … … 487 477 /* Record the term information, if desired */ 488 478 if (data->queryInfo->needTermFreqs) { 479 /* The following code is a lot more complicated than it could be, but it is necessary 480 to compensate for an oddity in MG. 
*/ 481 unsigned char** stemmed_terms = malloc(sizeof(unsigned char*) * qd->TL->num); 482 489 483 printf("Number of terms: %d\n", qd->TL->num); 490 484 printf("Number of query terms: %d\n", qd->QTL->num); 491 485 492 /* Find each query term in the term list, and grab its frequency */ 486 /* Generate the stemmed form of each of the relevant terms */ 487 for (i = 0; i < qd->TL->num; i++) { 488 u_char* raw_term = qd->TL->TE[i].Word; 489 unsigned int term_length = raw_term[0]; 490 491 u_char* raw_stemmed_term = malloc(term_length + 1); 492 unsigned int stemmed_term_length; 493 494 /* Copy the term, and stem it */ 495 for (j = 0; j <= term_length; j++) 496 raw_stemmed_term[j] = raw_term[j]; 497 stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_term); 498 499 /* Allocate memory to store the stemmed term, and fill it */ 500 stemmed_term_length = raw_stemmed_term[0]; 501 stemmed_terms[i] = malloc(stemmed_term_length + 1); 502 assert(stemmed_terms[i] != NULL); 503 strncpy(stemmed_terms[i], &(raw_stemmed_term[1]), stemmed_term_length); 504 stemmed_terms[i][stemmed_term_length] = '\0'; 505 } 506 507 /* Record every query term, along with their equivalent terms */ 493 508 for (i = 0; i < qd->QTL->num; i++) { 494 unsigned int query_term_length = (unsigned int) qd->QTL->QTE[i].Term[0]; 509 u_char* raw_query_term = qd->QTL->QTE[i].Term; 510 unsigned int query_term_length = raw_query_term[0]; 495 511 unsigned char* query_term; 496 512 jstring j_query_term; 497 jint stem = qd->QTL->QTE[i].stem_method; 498 jlong match; 499 jlong freq; 500 501 /* Allocate memory to store this query term, and fill it */ 502 query_term = (unsigned char*) malloc(query_term_length + 1); 513 514 u_char* raw_stemmed_query_term = malloc(query_term_length + 1); 515 unsigned int stemmed_query_term_length; 516 unsigned char* stemmed_query_term; 517 518 /* Allocate memory to store the query term, and fill it */ 519 query_term = malloc(query_term_length + 1); 503 520 assert(query_term != 
NULL); 504 strncpy(query_term, &( qd->QTL->QTE[i].Term[1]), query_term_length);521 strncpy(query_term, &(raw_query_term[1]), query_term_length); 505 522 query_term[query_term_length] = '\0'; 506 printf("Query term: %s\n", query_term);507 523 508 524 /* Allocate a new jstring for the query term */ … … 510 526 assert(j_query_term != NULL); 511 527 512 /* Find the matching term in the term list */ 528 /* Call the addTerm function (Java side) to record the query term */ 529 (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm, 530 j_query_term, (jint) data->defaultStemMethod); 531 exc = (*j_env)->ExceptionOccurred(j_env); 532 if (exc) { 533 (*j_env)->ExceptionDescribe(j_env); 534 return; 535 } 536 537 /* Copy the query term, and stem it */ 538 for (j = 0; j <= query_term_length; j++) 539 raw_stemmed_query_term[j] = raw_query_term[j]; 540 stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_query_term); 541 542 /* Allocate memory to store the stemmed query term, and fill it */ 543 stemmed_query_term_length = raw_stemmed_query_term[0]; 544 stemmed_query_term = malloc(stemmed_query_term_length + 1); 545 assert(stemmed_query_term != NULL); 546 strncpy(stemmed_query_term, &(raw_stemmed_query_term[1]), stemmed_query_term_length); 547 stemmed_query_term[stemmed_query_term_length] = '\0'; 548 549 /* Find all the terms equivalent to the query term */ 513 550 for (j = 0; j < qd->TL->num; j++) { 514 unsigned int term_length = (unsigned int) qd->TL->TE[j].Word[0]; 515 516 /* Stemming and case-folding mean both comparisons are necessary */ 517 if ((strncmp(query_term, &(qd->TL->TE[j].Word[1]), term_length) == 0) && 518 (strncmp(query_term, &(qd->TL->TE[j].Word[1]), query_term_length) == 0)) { 519 /* Get the document count and total frequency of the term */ 520 match = qd->TL->TE[j].WE.doc_count; 521 freq = qd->TL->TE[j].WE.count; 522 523 /* Call the addTerm function (Java side) to record term information */ 524 (*j_env)->CallVoidMethod(j_env, result_ptr, 
MID_addTerm, 525 j_query_term, NULL, stem, match, freq, NULL); 551 /* Check if the stemmed query term matches the stemmed term */ 552 if (strcmp(stemmed_query_term, stemmed_terms[j]) == 0) { 553 u_char* raw_term = qd->TL->TE[j].Word; 554 unsigned int term_length = raw_term[0]; 555 unsigned char* term; 556 jstring j_term; 557 558 /* Allocate memory to store the query term, and fill it */ 559 term = malloc(term_length + 1); 560 assert(term != NULL); 561 strncpy(term, &(raw_term[1]), term_length); 562 term[term_length] = '\0'; 563 564 /* Allocate a new jstring for the query term */ 565 j_term = (*j_env)->NewStringUTF(j_env, term); 566 assert(j_term != NULL); 567 568 /* Call the addEquivTerm function (Java side) to record the equivalent term */ 569 (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addEquivTerm, 570 j_query_term, j_term, 571 (jlong) qd->TL->TE[j].WE.doc_count, 572 (jlong) qd->TL->TE[j].WE.count); 526 573 exc = (*j_env)->ExceptionOccurred(j_env); 527 574 if (exc) { … … 529 576 return; 530 577 } 531 532 /* There can only be one match */533 break;534 578 } 535 579 } 536 537 /* Finished with this query term */538 free(query_term);539 580 } 540 581 } … … 586 627 587 628 588 /* Turn sorting by rank on or off */589 JNIEXPORT void JNICALL590 Java_org_greenstone_mg_MGWrapper_setSortByRank(JNIEnv *j_env, jobject j_obj,591 jboolean j_on)592 {593 MGWrapperData* data = (MGWrapperData*) (*j_env)->GetIntField(j_env, j_obj, FID_mg_data);594 data->queryInfo->sortByRank = j_on;595 }596 597 598 629 /* Turn term frequency recording on or off */ 599 630 JNIEXPORT void JNICALL … … 650 681 651 682 /* Print the data to a character array */ 652 sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\n order by rank\t%d\nquery type\t%s\nmax docs\t%d\n",683 sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\nquery type\t%s\nmax docs\t%d\n", 653 684 (data->queryInfo->index == NULL ? 
"<none loaded>" : data->queryInfo->index), 654 685 (data->defaultStemMethod & 1), 655 686 (data->defaultStemMethod & 2), 656 (data->queryInfo->sortByRank),657 687 (data->defaultBoolCombine == 1 ? "all" : "some"), 658 688 (data->queryInfo->maxDocs)); -
trunk/indexers/mg/jni/MGWrapperImpl.h
r3743 r3791 26 26 27 27 - These data structures are based on the MGPP ones but are modified 28 slightlyto reflect the different capabilities of the MG system.28 to reflect the different capabilities of the MG system. 29 29 30 30 *************************************************************************/ … … 37 37 /* Maximum number of documents to retrieve */ 38 38 unsigned long maxDocs; 39 /* Whether to sort the matching documents by weight (boolean value) */40 int sortByRank;41 42 39 /* Whether term frequency information is desired (boolean value) */ 43 40 int needTermFreqs; -
trunk/indexers/mg/jni/org_greenstone_mg_MGWrapper.h
r3743 r3791 66 66 /* 67 67 * Class: org_greenstone_mg_MGWrapper 68 * Method: setSortByRank69 * Signature: (Z)V70 */71 JNIEXPORT void JNICALL Java_org_greenstone_mg_MGWrapper_setSortByRank72 (JNIEnv *, jobject, jboolean);73 74 /*75 * Class: org_greenstone_mg_MGWrapper76 68 * Method: setReturnTerms 77 69 * Signature: (Z)V
Note: See TracChangeset for help on using the changeset viewer.