Changeset 3791 for trunk/gsdl3/src/packages/mg/jni/MGWrapperImpl.c
- Timestamp: 2003-03-05T13:45:43+13:00 (21 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/packages/mg/jni/MGWrapperImpl.c
r3743 r3791 78 78 jmethodID MID_addDoc = NULL; /* MGQueryResult.addDoc() */ 79 79 jmethodID MID_addTerm = NULL; /* MGQueryResult.addTerm() */ 80 jmethodID MID_addEquivTerm = NULL; /* MGQueryResult.addEquivTerm() */ 80 81 jmethodID MID_setTotalDocs = NULL; /* MGQueryResult.setTotalDocs() */ 81 82 jmethodID MID_clearResult = NULL; /* MGQueryResult.clear() */ … … 113 114 assert(MID_addDoc != NULL); 114 115 115 /* addTerm(String term, String tag, int stem, long match, long freq, String[] equivs) */116 MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String; Ljava/lang/String;IJJ[Ljava/lang/String;)V");116 /* addTerm(String term, int stem) */ 117 MID_addTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addTerm", "(Ljava/lang/String;I)V"); 117 118 assert(MID_addTerm != NULL); 119 120 /* addEquivTerm(String term, String equivTerm, long match, long freq) */ 121 MID_addEquivTerm = (*j_env)->GetMethodID(j_env, JC_MGQueryResult, "addEquivTerm", "(Ljava/lang/String;Ljava/lang/String;JJ)V"); 122 assert(MID_addEquivTerm != NULL); 118 123 119 124 /* setTotalDocs(long) */ … … 149 154 data->queryInfo->maxDocs = 50; 150 155 data->queryInfo->needTermFreqs = 1; 151 data->queryInfo->sortByRank = 1;152 156 153 157 /* Save the object on the Java side */ … … 291 295 292 296 /* Load the appropriate index for satisfying this request */ 297 printf("Document retrieval, index path: %s\n", index_path); 293 298 qd = loadIndexData((char*) base_dir, (char*) index_path, (char*) text_path); 294 299 … … 315 320 Fread (c_buffer, 1, len, qd->td->TextFile); 316 321 317 /* Deco de (?)the document text into another buffer, and terminate it */322 /* Decompress the document text into another buffer, and terminate it */ 318 323 DecodeText (qd->cd, c_buffer, len, uc_buffer, &ULen); 319 324 uc_buffer[ULen] = '\0'; … … 335 340 do a query 336 341 *******************************************/ 337 338 /* Comparison function for sorting documents by their weight (decreasing 
order) */339 int340 document_weight_comp(const void *A, const void *B)341 {342 const DocEntry *a = A;343 const DocEntry *b = B;344 345 /* Compare on weight, highest wins */346 if (a->Weight < b->Weight)347 return 1;348 if (a->Weight > b->Weight)349 return -1;350 return 0;351 }352 353 342 354 343 /* do the actual query - the results are written to query_result held on the Java side */ … … 367 356 jobject result_ptr; 368 357 char* query; 369 BooleanQueryInfo bqi;370 358 int i, j; 371 359 … … 423 411 } 424 412 425 /* Boolean OR ("some") queries: must manually insert OR ("|") tokens */413 /* "Some" queries are done as ranked queries */ 426 414 if (data->defaultBoolCombine == 0) { 427 int in_space = 0; 428 for (i = 0; i < strlen(query); i++) { 429 if (in_space) { 430 if (query[i] == '|') /* OR character already inserted, so remove others */ 431 query[i] = ' '; 432 else if (!isspace(query[i])) /* moving out of a space region */ 433 in_space = 0; 434 } 435 else if (!in_space && isspace(query[i])) { /* moving into a space region */ 436 in_space = 1; 437 query[i] = '|'; /* insert an OR character */ 438 } 439 } 440 printf("Boolean OR query string: %s\n", query); 441 } 442 443 /* If the documents need to be sorted by rank, all of them must be retrieved */ 444 bqi.MaxDocsToRetrieve = ((data->queryInfo->sortByRank) ? -1 : data->queryInfo->maxDocs); 445 446 /* Perform query */ 447 /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */ 448 BooleanQuery(qd, (char*) query, &bqi, data->defaultStemMethod); 415 RankedQueryInfo rqi; 416 rqi.QueryFreqs = 1; /* Use the frequency of each query term in the query - OK? */ 417 rqi.Exact = 1; /* Perform exact ranking */ 418 rqi.MaxDocsToRetrieve = data->queryInfo->maxDocs; /* Get only the desired number */ 419 rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve; /* OK? 
*/ 420 rqi.Sort = 1; /* Sort the query terms by frequency before ranking */ 421 rqi.AccumMethod = 'L'; /* Use a list when accumulating (has bugs though...) */ 422 /* rqi.MaxAccums = -1; /* Use as many accumulators as necessary - CRASHES with list */ 423 rqi.MaxAccums = 100000; 424 rqi.MaxTerms = -1; /* Use all the query terms */ 425 /* rqi.StopAtMaxAccum = 0; /* Don't care (using as many accumulators as necessary) */ 426 rqi.StopAtMaxAccum = 1; 427 rqi.HashTblSize = 1000; /* Don't care (not using a hash table) */ 428 rqi.skip_dump = NULL; /* Don't dump skip information */ 429 430 /* RankedQuery() reads 'casefold' and 'stem' parameters from the environment */ 431 SetEnv("casefold", ((data->defaultStemMethod & 1) ? "on" : "off"), NULL); 432 SetEnv("stem", ((data->defaultStemMethod & 2) ? "on" : "off"), NULL); 433 434 RankedQuery(qd, query, &rqi); 435 } 436 /* "All" queries are done as boolean queries */ 437 else { 438 BooleanQueryInfo bqi; 439 bqi.MaxDocsToRetrieve = data->queryInfo->maxDocs; 440 441 /* Had to add "words$o" to LIB_OBJS in mg/src/text/Makefile and recompile mg for this */ 442 BooleanQuery(qd, query, &bqi, data->defaultStemMethod); 443 } 449 444 450 445 /* Finished with the C query string */ … … 462 457 (*j_env)->ExceptionDescribe(j_env); 463 458 return; 464 }465 466 /* Sort the documents by rank, if desired */467 if (data->queryInfo->sortByRank) {468 qsort(qd->DL->DE, qd->DL->num, sizeof(DocEntry), document_weight_comp);469 459 } 470 460 … … 487 477 /* Record the term information, if desired */ 488 478 if (data->queryInfo->needTermFreqs) { 479 /* The following code is a lot more complicated than it could be, but it is necessary 480 to compensate for an oddity in MG. 
*/ 481 unsigned char** stemmed_terms = malloc(sizeof(unsigned char*) * qd->TL->num); 482 489 483 printf("Number of terms: %d\n", qd->TL->num); 490 484 printf("Number of query terms: %d\n", qd->QTL->num); 491 485 492 /* Find each query term in the term list, and grab its frequency */ 486 /* Generate the stemmed form of each of the relevant terms */ 487 for (i = 0; i < qd->TL->num; i++) { 488 u_char* raw_term = qd->TL->TE[i].Word; 489 unsigned int term_length = raw_term[0]; 490 491 u_char* raw_stemmed_term = malloc(term_length + 1); 492 unsigned int stemmed_term_length; 493 494 /* Copy the term, and stem it */ 495 for (j = 0; j <= term_length; j++) 496 raw_stemmed_term[j] = raw_term[j]; 497 stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_term); 498 499 /* Allocate memory to store the stemmed term, and fill it */ 500 stemmed_term_length = raw_stemmed_term[0]; 501 stemmed_terms[i] = malloc(stemmed_term_length + 1); 502 assert(stemmed_terms[i] != NULL); 503 strncpy(stemmed_terms[i], &(raw_stemmed_term[1]), stemmed_term_length); 504 stemmed_terms[i][stemmed_term_length] = '\0'; 505 } 506 507 /* Record every query term, along with their equivalent terms */ 493 508 for (i = 0; i < qd->QTL->num; i++) { 494 unsigned int query_term_length = (unsigned int) qd->QTL->QTE[i].Term[0]; 509 u_char* raw_query_term = qd->QTL->QTE[i].Term; 510 unsigned int query_term_length = raw_query_term[0]; 495 511 unsigned char* query_term; 496 512 jstring j_query_term; 497 jint stem = qd->QTL->QTE[i].stem_method; 498 jlong match; 499 jlong freq; 500 501 /* Allocate memory to store this query term, and fill it */ 502 query_term = (unsigned char*) malloc(query_term_length + 1); 513 514 u_char* raw_stemmed_query_term = malloc(query_term_length + 1); 515 unsigned int stemmed_query_term_length; 516 unsigned char* stemmed_query_term; 517 518 /* Allocate memory to store the query term, and fill it */ 519 query_term = malloc(query_term_length + 1); 503 520 assert(query_term != 
NULL); 504 strncpy(query_term, &( qd->QTL->QTE[i].Term[1]), query_term_length);521 strncpy(query_term, &(raw_query_term[1]), query_term_length); 505 522 query_term[query_term_length] = '\0'; 506 printf("Query term: %s\n", query_term);507 523 508 524 /* Allocate a new jstring for the query term */ … … 510 526 assert(j_query_term != NULL); 511 527 512 /* Find the matching term in the term list */ 528 /* Call the addTerm function (Java side) to record the query term */ 529 (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addTerm, 530 j_query_term, (jint) data->defaultStemMethod); 531 exc = (*j_env)->ExceptionOccurred(j_env); 532 if (exc) { 533 (*j_env)->ExceptionDescribe(j_env); 534 return; 535 } 536 537 /* Copy the query term, and stem it */ 538 for (j = 0; j <= query_term_length; j++) 539 raw_stemmed_query_term[j] = raw_query_term[j]; 540 stemmer(data->defaultStemMethod, qd->sd->sdh.stemmer_num, raw_stemmed_query_term); 541 542 /* Allocate memory to store the stemmed query term, and fill it */ 543 stemmed_query_term_length = raw_stemmed_query_term[0]; 544 stemmed_query_term = malloc(stemmed_query_term_length + 1); 545 assert(stemmed_query_term != NULL); 546 strncpy(stemmed_query_term, &(raw_stemmed_query_term[1]), stemmed_query_term_length); 547 stemmed_query_term[stemmed_query_term_length] = '\0'; 548 549 /* Find all the terms equivalent to the query term */ 513 550 for (j = 0; j < qd->TL->num; j++) { 514 unsigned int term_length = (unsigned int) qd->TL->TE[j].Word[0]; 515 516 /* Stemming and case-folding mean both comparisons are necessary */ 517 if ((strncmp(query_term, &(qd->TL->TE[j].Word[1]), term_length) == 0) && 518 (strncmp(query_term, &(qd->TL->TE[j].Word[1]), query_term_length) == 0)) { 519 /* Get the document count and total frequency of the term */ 520 match = qd->TL->TE[j].WE.doc_count; 521 freq = qd->TL->TE[j].WE.count; 522 523 /* Call the addTerm function (Java side) to record term information */ 524 (*j_env)->CallVoidMethod(j_env, result_ptr, 
MID_addTerm, 525 j_query_term, NULL, stem, match, freq, NULL); 551 /* Check if the stemmed query term matches the stemmed term */ 552 if (strcmp(stemmed_query_term, stemmed_terms[j]) == 0) { 553 u_char* raw_term = qd->TL->TE[j].Word; 554 unsigned int term_length = raw_term[0]; 555 unsigned char* term; 556 jstring j_term; 557 558 /* Allocate memory to store the query term, and fill it */ 559 term = malloc(term_length + 1); 560 assert(term != NULL); 561 strncpy(term, &(raw_term[1]), term_length); 562 term[term_length] = '\0'; 563 564 /* Allocate a new jstring for the query term */ 565 j_term = (*j_env)->NewStringUTF(j_env, term); 566 assert(j_term != NULL); 567 568 /* Call the addEquivTerm function (Java side) to record the equivalent term */ 569 (*j_env)->CallVoidMethod(j_env, result_ptr, MID_addEquivTerm, 570 j_query_term, j_term, 571 (jlong) qd->TL->TE[j].WE.doc_count, 572 (jlong) qd->TL->TE[j].WE.count); 526 573 exc = (*j_env)->ExceptionOccurred(j_env); 527 574 if (exc) { … … 529 576 return; 530 577 } 531 532 /* There can only be one match */533 break;534 578 } 535 579 } 536 537 /* Finished with this query term */538 free(query_term);539 580 } 540 581 } … … 586 627 587 628 588 /* Turn sorting by rank on or off */589 JNIEXPORT void JNICALL590 Java_org_greenstone_mg_MGWrapper_setSortByRank(JNIEnv *j_env, jobject j_obj,591 jboolean j_on)592 {593 MGWrapperData* data = (MGWrapperData*) (*j_env)->GetIntField(j_env, j_obj, FID_mg_data);594 data->queryInfo->sortByRank = j_on;595 }596 597 598 629 /* Turn term frequency recording on or off */ 599 630 JNIEXPORT void JNICALL … … 650 681 651 682 /* Print the data to a character array */ 652 sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\n order by rank\t%d\nquery type\t%s\nmax docs\t%d\n",683 sprintf(result, "Query params:\nindex\t\t%s\ncasefold\t%d\nstem\t\t%d\nquery type\t%s\nmax docs\t%d\n", 653 684 (data->queryInfo->index == NULL ? 
"<none loaded>" : data->queryInfo->index), 654 685 (data->defaultStemMethod & 1), 655 686 (data->defaultStemMethod & 2), 656 (data->queryInfo->sortByRank),657 687 (data->defaultBoolCombine == 1 ? "all" : "some"), 658 688 (data->queryInfo->maxDocs));
Note: See TracChangeset for help on using the changeset viewer.