/*
 * Native (JNI) side of Extractor.getKW, plus the helpers it uses to build a
 * tree of candidate words and rank them by an "interest value".
 *
 * NOTE(review): this text has several definitions on each physical line and
 * appears to have lost characters in places -- confirm against a pristine
 * copy before building:
 *   - seven #include directives below carry no header name;
 *   - getIntVector() breaks off after "if(docToRead" and runs straight into
 *     what is presumably the tail of appendString();
 *   - work() breaks off after "for(c=0;c" and runs straight into what is
 *     presumably the loop of fillAllVectors();
 *   - the loop headers of constructWordList() are incomplete.
 * NOTE(review): a "//" comment runs to the end of its physical line, so with
 * this layout every statement that follows a "//" on the same line is
 * lexically part of that comment.
 *
 * Contents of the next physical line:
 *
 * slash
 *   Separator character passed to the sprintf() calls in work() that build
 *   the index file names.
 *
 * Java_Extractor_getKW(env, thisObj, collection, arr)
 *   Allocates a 20000-byte buffer, copies the elements of arr into a local
 *   int docs[500], obtains the UTF chars of collection, calls mediate() and
 *   returns the buffer contents as a new Java string. If GetStringUTFChars
 *   returns NULL it prints "Returning null" to stderr and returns NULL.
 *   NOTE(review): the copy loop runs to len with no check against the 500
 *   elements of docs.
 *   NOTE(review): buffer is never freed here, and the NULL return path
 *   releases neither buffer nor body.
 *   NOTE(review): work() receives buffer by value and reassigns its own copy
 *   from appendString(), which may realloc; this function goes on using the
 *   pointer it allocated -- confirm that cannot go stale.
 *
 * mediate(docs, length, buffer, collection)
 *   Forwards its arguments unchanged to work().
 *
 * getIntVector(docToRead, docTableFp, dtNdxFp, maxDoc, tableLen, recCount)
 *   Visible part: seeks dtNdxFp to (docToRead-1)*sizeof(size_t), reads one
 *   size_t into docInf (forced to 0 when docToRead is 1; the fread result is
 *   not checked) and seeks docTableFp to docInf. The remainder is missing.
 *
 * [fragment, presumably the tail of appendString()]
 *   Grows the size held in cursize in steps of 256 until len more bytes plus
 *   a terminator fit, reallocs buffer, copies len bytes of data to offset
 *   curptr, advances curptr, NUL-terminates and returns buffer.
 *   NOTE(review): the realloc result is assigned straight to buffer without
 *   a NULL check.
 *
 * work(docNums, numDocs, buffer, coll)
 *   Only the start of its local declarations is on this line; the body
 *   continues on the next physical line.
 */
#include #include #include #include #include #include #include #include "Extractor.h" #include "getKeywords.h" char slash='/'; JNIEXPORT jstring JNICALL Java_Extractor_getKW(JNIEnv * env,jobject thisObj, jstring collection, jintArray arr) { int i; jstring jresult; char * buffer = (char *) malloc(20000); const jbyte * coll_byte; char * coll; int docs[500]; jsize len = (*env)->GetArrayLength(env, arr); jint *body = (*env)->GetIntArrayElements(env, arr, 0); coll_byte = (*env)->GetStringUTFChars(env, collection, NULL); if( coll_byte == NULL ) { fprintf(stderr,"Returning null"); return NULL; } coll = coll_byte; for( i = 0; i < len; i++ ) docs[i] = body[i]; mediate(docs,len,buffer, coll); jresult = (*env)->NewStringUTF(env, buffer); (*env)->ReleaseStringUTFChars(env, collection, coll_byte); (*env)->ReleaseIntArrayElements(env,arr,body,0); return jresult; } void mediate(int* docs, int length, char *buffer, char *collection) { work(docs,length,buffer,collection); return; } DocData *getIntVector(size_t docToRead,FILE *docTableFp, FILE *dtNdxFp, size_t maxDoc, size_t tableLen, size_t *recCount) { size_t recSize, docInf,docNext,c; DocData *data; recSize=sizeof(size_t)+sizeof(short); fseek(dtNdxFp,(docToRead-1)*sizeof(size_t),SEEK_SET); fread(&docInf,sizeof(size_t),1,dtNdxFp); if(docToRead==1) docInf=0; fseek(docTableFp,docInf,SEEK_SET); if(docToRead *cursize) { while (len + *curptr + 1 > *cursize) *cursize += 256; buffer = (char *) realloc ((void *) buffer, *cursize); } strncpy (buffer + *curptr, data, len); *curptr += len; buffer[*curptr] = '\0'; return (buffer); } void work(int *docNums, int numDocs, char * buffer, char *coll) { DocRec *allVectors; InterestItem *itemArray; char bbuf[40]; int cursize = 20000; int curptr = 0; size_t arraySize; size_t maxDoc; FILE *index; FILE *data; FILE *words; FILE *binndx; FILE *freqs; char *dir, indexName[STRING_BUFF_LEN],dataName[STRING_BUFF_LEN],wordsName[STRING_BUFF_LEN], binndxName[STRING_BUFF_LEN],freqName[STRING_BUFF_LEN], 
/*
 * Contents of the next physical line:
 *
 * work(), continued
 *   - callocs allVectors (MAX_DOCUMENTS entries) and itemArray (MAX_IWORDS
 *     entries), asserting both.
 *   - builds indexDir from the GSDL3HOME environment variable and coll using
 *     the format "%s%cweb/sites/localsite/collect/%s/cw_index/", then the
 *     names of doctable.ndx, doctable.data, newinterest.txt,
 *     newbnndexFn.ndx and newdocFreq.dat inside it.
 *     NOTE(review): the getenv() result is passed to sprintf() with no NULL
 *     check, and the sprintf() calls are unbounded writes into
 *     STRING_BUFF_LEN arrays.
 *   - opens the five files and asserts each handle.
 *   - wordCount = fileSize(binndx) / sizeof(size_t); reads maxDoc as the
 *     first size_t of the index file; mallocs freqList (wordCount entries).
 *   - after the first "//": reads freqList from freqs, closes freqs, calls
 *     fillAllVectors(), constructWordList(), fillInterestArray() and
 *     vapeTree(), empties buffer and appends the item count to it through
 *     appendString().
 *     NOTE(review): n, wordCount and arraySize are size_t but are formatted
 *     with "%d".
 *   - the function then breaks off at "for(c=0;c"; its output loop and
 *     cleanup are not present in this text.
 *
 * [fragment, presumably the loop of fillAllVectors()]
 *   While vectorCount is below both length and MAX_DOCUMENTS and maxDoc is
 *   non-zero (the first operand of the condition is missing): takes
 *   docs[vectorCount] and, when it is positive, stores the getIntVector()
 *   result, recCount and the document number in allVectors[vectorCount] and
 *   increments vectorCount. Returns vectorCount.
 *   NOTE(review): vectorCount only advances when docNum > 0; whether the
 *   loop can still terminate on a non-positive entry depends on the missing
 *   part of the condition.
 *
 * The trailing "HjTreePtr" is the return type of constructWordList(), whose
 * name starts the following physical line.
 */
buff[STRING_BUFF_LEN], indexDir[STRING_BUFF_LEN]; size_t *freqList,wordCount,c; size_t n,vectorCount, offset; HjTreePtr tree; allVectors = calloc(MAX_DOCUMENTS,sizeof(DocRec)); assert(allVectors); itemArray = calloc(MAX_IWORDS,sizeof(InterestItem)); assert(itemArray); dir = "\0"; dir = getenv("GSDL3HOME"); sprintf(indexDir,"%s%cweb/sites/localsite/collect/%s/cw_index/",dir,slash,coll); dir = getenv("GSDL3HOME"); sprintf(indexName,"%s%cdoctable.ndx",indexDir,slash); sprintf(dataName,"%s%cdoctable.data",indexDir,slash); sprintf(wordsName,"%s%cnewinterest.txt",indexDir,slash); sprintf(binndxName,"%s%cnewbnndexFn.ndx",indexDir,slash); sprintf(freqName,"%s%cnewdocFreq.dat",indexDir,slash); index = fopen(indexName,"rb"); assert(index); data = fopen(dataName,"rb"); assert(data); words=fopen(wordsName,"r"); assert(words); binndx=fopen(binndxName,"rb"); assert(binndx); freqs=fopen(freqName,"rb"); assert(freqs); wordCount=fileSize(binndx)/sizeof(size_t); fread(&maxDoc,sizeof(size_t),1,index); freqList=malloc(wordCount*sizeof(size_t)); assert(freqList); // reads at most wordCount elements of size size_t into freqList n=fread(freqList,sizeof(size_t),wordCount,freqs); if(n != wordCount) fprintf(stderr,"n %d wordCount %d\n",n,wordCount); assert(n==wordCount); fclose(freqs); vectorCount=fillAllVectors(allVectors, docNums, numDocs, data,index,maxDoc); tree=constructWordList(freqList, allVectors, vectorCount); arraySize = fillInterestArray(tree, MAX_IWORDS, 0,itemArray); vapeTree(tree); *buffer = '\0'; // print interesting words n= sprintf(bbuf,"%d\n",arraySize); buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf)); for(c=0;c 0 && vectorCount < length && vectorCount < MAX_DOCUMENTS && maxDoc!=0) { docNum = docs[vectorCount]; if(docNum>0) { allVectors[vectorCount].data=getIntVector(docNum,docTableFp,dtNdxFp, maxDoc, tableLen, &recCount); allVectors[vectorCount].count=recCount; allVectors[vectorCount].docNum=docNum; vectorCount++; } } return vectorCount; } HjTreePtr 
/*
 * Contents of the next physical line:
 *
 * constructWordList(freqList, allVectors, vectorSize)
 *   The loop headers are incomplete in this text. The visible body inserts
 *   records allVectors[c].data[e] and allVectors[c].data[d] into the tree
 *   through buildHjTree(), passing freqList[pos] of each record as its
 *   frequency; in the e==0 branch an odd n also inserts data[++d] and then
 *   breaks. After the "//" it calls calcInterestVal() on the tree and
 *   returns the tree produced by treeByInterestVal().
 *
 * buildHjTree(tree, item, freq)
 *   Walks down from tree. An empty slot receives a newly malloc'ed node
 *   (dj = freq, hj = 1, pos = item->pos, interestVal = 0.0, no children). A
 *   node whose pos equals item->pos has its hj incremented instead.
 *   Otherwise the walk continues into "more" when the node's dj is less
 *   than freq and into "less" when it is not. Returns the root, which is the
 *   new node when the tree passed in was NULL.
 *
 * calcInterestVal(tree, vectorSize)
 *   For every node (recursing into "more", iterating along "less") sets
 *   interestVal = (hj / dj) * hj * log(vectorSize / hj), computed in double.
 *
 * getLeaf(tree)
 *   Descends, preferring "more" over "less", to a node with no children,
 *   sets the slot that pointed at it to NULL and returns the node.
 *
 * insertByInterestVal(tree, node)
 *   Only the signature is on this line; the body is on the following
 *   physical line.
 */
constructWordList(size_t *freqList,DocRec *allVectors, size_t vectorSize) { HjTreePtr tree=NULL,newTree=NULL; size_t dj, c,d, e, m,n,pos; for(c=0;c0) { pos=allVectors[c].data[e].pos; dj=freqList[pos]; tree=buildHjTree(tree,&(allVectors[c].data[e]), dj); pos=allVectors[c].data[d].pos; dj=freqList[pos]; tree=buildHjTree(tree,&(allVectors[c].data[d]), dj); } if(e==0) { pos=allVectors[c].data[e].pos; dj=freqList[pos]; tree=buildHjTree(tree,&(allVectors[c].data[e]), dj); pos=allVectors[c].data[d].pos; dj=freqList[pos]; tree=buildHjTree(tree,&(allVectors[c].data[d]), dj); if( n % 2 == 1 ) { pos=allVectors[c].data[++d].pos; dj=freqList[pos]; tree=buildHjTree(tree,&(allVectors[c].data[d]), dj); break; } } } } // get candidate keyword weights calcInterestVal(tree,vectorSize); newTree=treeByInterestVal(&tree,newTree); return newTree; } HjTreePtr buildHjTree(HjTreePtr tree, DocData *item, size_t freq) { HjTreePtr root=tree, /*lastNode,*/ *treePtr; treePtr = &tree; while(1) { if(*treePtr==NULL) { *treePtr=malloc(sizeof(HjTree)); assert (*treePtr); (*treePtr)->less=NULL; (*treePtr)->more=NULL; (*treePtr)->dj=freq; (*treePtr)->hj=1; (*treePtr)->pos=item->pos; (*treePtr)->interestVal=0.0; if (root==NULL) root=(*treePtr); tree = (*treePtr); break; } else { if(tree->pos==item->pos) { tree->hj++; break; } else if((*treePtr)->dj < freq) { treePtr = &(tree->more); tree = (*treePtr); } else { treePtr= &(tree->less); tree = (*treePtr); } } } return root; } void calcInterestVal(HjTreePtr tree,size_t vectorSize) { size_t hj,dj; while(tree) { hj=tree->hj; dj=tree->dj; if(tree->more) calcInterestVal(tree->more, vectorSize); tree->interestVal=((double)hj/(double)dj)*hj*log((double)vectorSize/hj); tree=tree->less; } } HjTreePtr getLeaf(HjTreePtr *tree) { HjTreePtr result; if((*tree)->more) return getLeaf(&((*tree)->more)); if((*tree)->less) return getLeaf(&((*tree)->less)); else result=*tree; *tree=NULL; return result; } HjTreePtr insertByInterestVal(HjTreePtr tree,HjTreePtr node) { 
/*
 * Contents of the next physical line:
 *
 * insertByInterestVal(), body
 *   An empty tree becomes node. Otherwise node is inserted recursively into
 *   "more" when the current node's interestVal is less than node's, and
 *   into "less" when it is not (equal values go to "less"). Returns the
 *   subtree root.
 *
 * treeByInterestVal(tree, newTree)
 *   Repeatedly detaches a leaf from *tree with getLeaf() and inserts it into
 *   newTree with insertByInterestVal() until *tree is NULL. Returns newTree.
 *
 * fillInterestArray(tree, max, count, itemArray)
 *   Visits the "more" subtree first, then the node, then continues along
 *   "less", copying interestVal, pos (as itemPos) and dj into
 *   itemArray[count] and incrementing count. Stops as soon as count reaches
 *   max. Returns the resulting count.
 *
 * vapeTree(tree)
 *   Frees the whole tree: both subtrees first, then the node itself.
 */
if(tree==NULL) tree=node; else if(tree->interestVal < node->interestVal) tree->more=insertByInterestVal(tree->more,node); else tree->less=insertByInterestVal(tree->less,node); return tree; } HjTreePtr treeByInterestVal(HjTreePtr *tree, HjTreePtr newTree) { HjTreePtr node; while(*tree) { node=getLeaf(tree); if(node)newTree=insertByInterestVal(newTree, node); } return newTree; } size_t fillInterestArray(HjTreePtr tree, size_t max, size_t count, InterestItem *itemArray) { while(tree) { if(count==max) return count; if(tree->more) count=fillInterestArray(tree->more,max, count,itemArray ); if(count==max) return count; itemArray[count].interestVal = tree->interestVal; itemArray[count].itemPos = tree->pos; itemArray[count].dj = tree->dj; count++; tree=tree->less; } return count; } void vapeTree(HjTreePtr tree) { if(!tree)return; vapeTree(tree->more); vapeTree(tree->less); free(tree); }