1 | #include <jni.h>
|
---|
2 | #include <stdio.h>
|
---|
3 | #include <stdlib.h>
|
---|
4 | #include <string.h>
|
---|
5 | #include <assert.h>
|
---|
6 | #include <math.h>
|
---|
7 | #include <ctype.h>
|
---|
8 | #include "Extractor.h"
|
---|
9 | #include "getKeywords.h"
|
---|
10 |
|
---|
11 | char slash='/';
|
---|
12 |
|
---|
13 | JNIEXPORT jstring JNICALL Java_Extractor_getKW(JNIEnv * env,jobject thisObj, jstring collection, jintArray arr)
|
---|
14 | {
|
---|
15 |
|
---|
16 | int i;
|
---|
17 | jstring jresult;
|
---|
18 | char * buffer = (char *) malloc(20000);
|
---|
19 | const jbyte * coll_byte;
|
---|
20 | char * coll;
|
---|
21 | int docs[500];
|
---|
22 |
|
---|
23 | jsize len = (*env)->GetArrayLength(env, arr);
|
---|
24 | jint *body = (*env)->GetIntArrayElements(env, arr, 0);
|
---|
25 |
|
---|
26 | coll_byte = (*env)->GetStringUTFChars(env, collection, NULL);
|
---|
27 | if( coll_byte == NULL )
|
---|
28 | {
|
---|
29 | fprintf(stderr,"Returning null");
|
---|
30 | return NULL;
|
---|
31 | }
|
---|
32 |
|
---|
33 | coll = coll_byte;
|
---|
34 |
|
---|
35 | for( i = 0; i < len; i++ ) docs[i] = body[i];
|
---|
36 |
|
---|
37 | mediate(docs,len,buffer, coll);
|
---|
38 | jresult = (*env)->NewStringUTF(env, buffer);
|
---|
39 |
|
---|
40 | (*env)->ReleaseStringUTFChars(env, collection, coll_byte);
|
---|
41 | (*env)->ReleaseIntArrayElements(env,arr,body,0);
|
---|
42 | return jresult;
|
---|
43 | }
|
---|
44 |
|
---|
45 |
|
---|
46 | void mediate(int* docs, int length, char *buffer, char *collection)
|
---|
47 | {
|
---|
48 | work(docs,length,buffer,collection);
|
---|
49 | return;
|
---|
50 | }
|
---|
51 |
|
---|
52 |
|
---|
53 | DocData *getIntVector(size_t docToRead,FILE *docTableFp,
|
---|
54 | FILE *dtNdxFp, size_t maxDoc, size_t tableLen,
|
---|
55 | size_t *recCount)
|
---|
56 | {
|
---|
57 | size_t recSize, docInf,docNext,c;
|
---|
58 | DocData *data;
|
---|
59 | recSize=sizeof(size_t)+sizeof(short);
|
---|
60 |
|
---|
61 | fseek(dtNdxFp,(docToRead-1)*sizeof(size_t),SEEK_SET);
|
---|
62 | fread(&docInf,sizeof(size_t),1,dtNdxFp);
|
---|
63 |
|
---|
64 | if(docToRead==1)
|
---|
65 | docInf=0;
|
---|
66 |
|
---|
67 | fseek(docTableFp,docInf,SEEK_SET);
|
---|
68 |
|
---|
69 | if(docToRead<maxDoc)fread(&docNext,sizeof(size_t),1,dtNdxFp);
|
---|
70 | else docNext=tableLen;
|
---|
71 |
|
---|
72 | *recCount=(docNext-docInf)/recSize;
|
---|
73 | data=(DocData *)malloc(*recCount*sizeof(DocData));
|
---|
74 | assert(data);
|
---|
75 | for(c=0;c < *recCount;c++)
|
---|
76 | {
|
---|
77 | fread(&data[c],recSize,1,docTableFp);
|
---|
78 | if(feof(docTableFp)) break;
|
---|
79 | }
|
---|
80 | return data;
|
---|
81 | }
|
---|
82 |
|
---|
83 | char * appendString(char * buffer, int * cursize, int * curptr, char *data, int len)
|
---|
84 | {
|
---|
85 | if (!len) return (buffer);
|
---|
86 | if (len + *curptr + 1 > *cursize) {
|
---|
87 | while (len + *curptr + 1 > *cursize) *cursize += 256;
|
---|
88 | buffer = (char *) realloc ((void *) buffer, *cursize);
|
---|
89 | }
|
---|
90 | strncpy (buffer + *curptr, data, len);
|
---|
91 | *curptr += len;
|
---|
92 | buffer[*curptr] = '\0';
|
---|
93 | return (buffer);
|
---|
94 | }
|
---|
95 |
|
---|
96 | void work(int *docNums, int numDocs, char * buffer, char *coll)
|
---|
97 | {
|
---|
98 | DocRec *allVectors;
|
---|
99 | InterestItem *itemArray;
|
---|
100 |
|
---|
101 | char bbuf[40];
|
---|
102 | int cursize = 20000;
|
---|
103 | int curptr = 0;
|
---|
104 |
|
---|
105 | size_t arraySize;
|
---|
106 | size_t maxDoc;
|
---|
107 | FILE *index;
|
---|
108 | FILE *data;
|
---|
109 | FILE *words;
|
---|
110 | FILE *binndx;
|
---|
111 | FILE *freqs;
|
---|
112 | char *dir, indexName[STRING_BUFF_LEN],dataName[STRING_BUFF_LEN],wordsName[STRING_BUFF_LEN],
|
---|
113 | binndxName[STRING_BUFF_LEN],freqName[STRING_BUFF_LEN], buff[STRING_BUFF_LEN],
|
---|
114 | indexDir[STRING_BUFF_LEN];
|
---|
115 |
|
---|
116 | size_t *freqList,wordCount,c;
|
---|
117 | size_t n,vectorCount, offset;
|
---|
118 | HjTreePtr tree;
|
---|
119 |
|
---|
120 | allVectors = calloc(MAX_DOCUMENTS,sizeof(DocRec));
|
---|
121 | assert(allVectors);
|
---|
122 | itemArray = calloc(MAX_IWORDS,sizeof(InterestItem));
|
---|
123 | assert(itemArray);
|
---|
124 |
|
---|
125 | dir = "\0";
|
---|
126 | dir = getenv("GSDL3HOME");
|
---|
127 |
|
---|
128 | sprintf(indexDir,"%s%cweb/sites/localsite/collect/%s/cw_index/",dir,slash,coll);
|
---|
129 | dir = getenv("GSDL3HOME");
|
---|
130 |
|
---|
131 | sprintf(indexName,"%s%cdoctable.ndx",indexDir,slash);
|
---|
132 | sprintf(dataName,"%s%cdoctable.data",indexDir,slash);
|
---|
133 | sprintf(wordsName,"%s%cnewinterest.txt",indexDir,slash);
|
---|
134 | sprintf(binndxName,"%s%cnewbnndexFn.ndx",indexDir,slash);
|
---|
135 | sprintf(freqName,"%s%cnewdocFreq.dat",indexDir,slash);
|
---|
136 |
|
---|
137 | index = fopen(indexName,"rb");
|
---|
138 | assert(index);
|
---|
139 | data = fopen(dataName,"rb");
|
---|
140 | assert(data);
|
---|
141 | words=fopen(wordsName,"r");
|
---|
142 | assert(words);
|
---|
143 | binndx=fopen(binndxName,"rb");
|
---|
144 | assert(binndx);
|
---|
145 | freqs=fopen(freqName,"rb");
|
---|
146 | assert(freqs);
|
---|
147 |
|
---|
148 | wordCount=fileSize(binndx)/sizeof(size_t);
|
---|
149 | fread(&maxDoc,sizeof(size_t),1,index);
|
---|
150 | freqList=malloc(wordCount*sizeof(size_t));
|
---|
151 | assert(freqList);
|
---|
152 |
|
---|
153 | // reads at most wordCount elements of size size_t into freqList
|
---|
154 | n=fread(freqList,sizeof(size_t),wordCount,freqs);
|
---|
155 | if(n != wordCount) fprintf(stderr,"n %d wordCount %d\n",n,wordCount);
|
---|
156 | assert(n==wordCount);
|
---|
157 | fclose(freqs);
|
---|
158 |
|
---|
159 | vectorCount=fillAllVectors(allVectors, docNums, numDocs, data,index,maxDoc);
|
---|
160 |
|
---|
161 | tree=constructWordList(freqList, allVectors, vectorCount);
|
---|
162 | arraySize = fillInterestArray(tree, MAX_IWORDS, 0,itemArray);
|
---|
163 | vapeTree(tree);
|
---|
164 |
|
---|
165 | *buffer = '\0';
|
---|
166 |
|
---|
167 | // print interesting words
|
---|
168 | n= sprintf(bbuf,"%d\n",arraySize);
|
---|
169 | buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
|
---|
170 |
|
---|
171 | for(c=0;c<arraySize;c++)
|
---|
172 | {
|
---|
173 | fseek(binndx,itemArray[c].itemPos*sizeof(size_t),SEEK_SET);
|
---|
174 | fread(&offset,sizeof(size_t),1,binndx);
|
---|
175 | fseek(words,offset,SEEK_SET);
|
---|
176 | fgets(buff,STRING_BUFF_LEN,words);
|
---|
177 | buffer = appendString(buffer,&cursize,&curptr,buff,strlen(buff));
|
---|
178 | }
|
---|
179 |
|
---|
180 | for(c=0;c<vectorCount;c++)
|
---|
181 | {
|
---|
182 | free(allVectors[c].data);
|
---|
183 | }
|
---|
184 |
|
---|
185 | fclose(words);
|
---|
186 | fclose(index);
|
---|
187 | fclose(binndx);
|
---|
188 | fclose(data);
|
---|
189 |
|
---|
190 | free(allVectors);
|
---|
191 | free(freqList);
|
---|
192 | }
|
---|
193 |
|
---|
194 |
|
---|
195 | size_t getDocNum(FILE *fp, char *buff)
|
---|
196 | {
|
---|
197 | char *p;
|
---|
198 | int c;
|
---|
199 | size_t result;
|
---|
200 | do
|
---|
201 | {
|
---|
202 | c = fgetc(fp);
|
---|
203 |
|
---|
204 | } while (!isdigit(c)&& !feof(fp));
|
---|
205 |
|
---|
206 | p=buff;
|
---|
207 |
|
---|
208 | while (isdigit(c)&& !feof(fp))
|
---|
209 | {
|
---|
210 | *(p++) = c;
|
---|
211 | c = fgetc(fp);
|
---|
212 | }
|
---|
213 | *p=0;
|
---|
214 | result= atoi(buff);
|
---|
215 | if(result)
|
---|
216 | {
|
---|
217 | for (c=0;c<DESCRIPTION_SIZE && !feof(fp);c++)
|
---|
218 | {
|
---|
219 | buff[c]=fgetc(fp);
|
---|
220 | if(buff[c]==2)
|
---|
221 | {
|
---|
222 | buff[c]=0;
|
---|
223 | break;
|
---|
224 | }
|
---|
225 |
|
---|
226 | }
|
---|
227 | }
|
---|
228 | return result;
|
---|
229 | }
|
---|
230 |
|
---|
231 | size_t fileSize(FILE *fp)
|
---|
232 | {
|
---|
233 | size_t len;
|
---|
234 | fseek(fp,0,SEEK_END);
|
---|
235 | len=ftell(fp);
|
---|
236 | rewind(fp);
|
---|
237 | return len;
|
---|
238 | }
|
---|
239 |
|
---|
240 | size_t fillAllVectors(DocRec *allVectors, int* docs, int length, FILE *docTableFp,FILE *dtNdxFp, size_t maxDoc)
|
---|
241 | {
|
---|
242 | size_t tableLen, recCount, vectorCount=0,docNum=1;
|
---|
243 |
|
---|
244 | tableLen=fileSize(docTableFp);
|
---|
245 |
|
---|
246 | while (docNum > 0 && vectorCount < length && vectorCount < MAX_DOCUMENTS && maxDoc!=0)
|
---|
247 | {
|
---|
248 | docNum = docs[vectorCount];
|
---|
249 | if(docNum>0)
|
---|
250 | {
|
---|
251 | allVectors[vectorCount].data=getIntVector(docNum,docTableFp,dtNdxFp, maxDoc, tableLen, &recCount);
|
---|
252 | allVectors[vectorCount].count=recCount;
|
---|
253 | allVectors[vectorCount].docNum=docNum;
|
---|
254 | vectorCount++;
|
---|
255 | }
|
---|
256 |
|
---|
257 | }
|
---|
258 | return vectorCount;
|
---|
259 | }
|
---|
260 |
|
---|
261 | HjTreePtr constructWordList(size_t *freqList,DocRec *allVectors, size_t vectorSize)
|
---|
262 | {
|
---|
263 | HjTreePtr tree=NULL,newTree=NULL;
|
---|
264 | size_t dj, c,d, e, m,n,pos;
|
---|
265 |
|
---|
266 | for(c=0;c<vectorSize;c++)
|
---|
267 | {
|
---|
268 | n=allVectors[c].count;
|
---|
269 | m=n/2;
|
---|
270 | for(d=m,e=m-1;d<n;d++,e--)
|
---|
271 | {
|
---|
272 |
|
---|
273 | //allVectors[c].data[d].pos;
|
---|
274 | //this is an insane device to get a flatter tree from already ordered data by
|
---|
275 | //jumbling up the order that it is added to the tree
|
---|
276 |
|
---|
277 | if(e>0)
|
---|
278 | {
|
---|
279 | pos=allVectors[c].data[e].pos;
|
---|
280 | dj=freqList[pos];
|
---|
281 | tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
|
---|
282 |
|
---|
283 | pos=allVectors[c].data[d].pos;
|
---|
284 | dj=freqList[pos];
|
---|
285 | tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
|
---|
286 | }
|
---|
287 | if(e==0)
|
---|
288 | {
|
---|
289 | pos=allVectors[c].data[e].pos;
|
---|
290 | dj=freqList[pos];
|
---|
291 | tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
|
---|
292 |
|
---|
293 | pos=allVectors[c].data[d].pos;
|
---|
294 | dj=freqList[pos];
|
---|
295 | tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
|
---|
296 |
|
---|
297 | if( n % 2 == 1 )
|
---|
298 | {
|
---|
299 | pos=allVectors[c].data[++d].pos;
|
---|
300 | dj=freqList[pos];
|
---|
301 | tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
|
---|
302 | break;
|
---|
303 | }
|
---|
304 |
|
---|
305 |
|
---|
306 | }
|
---|
307 |
|
---|
308 | }
|
---|
309 | }
|
---|
310 |
|
---|
311 | // get candidate keyword weights
|
---|
312 | calcInterestVal(tree,vectorSize);
|
---|
313 | newTree=treeByInterestVal(&tree,newTree);
|
---|
314 | return newTree;
|
---|
315 |
|
---|
316 | }
|
---|
317 |
|
---|
318 | HjTreePtr buildHjTree(HjTreePtr tree, DocData *item, size_t freq)
|
---|
319 | {
|
---|
320 | HjTreePtr root=tree, /*lastNode,*/ *treePtr;
|
---|
321 | treePtr = &tree;
|
---|
322 | while(1)
|
---|
323 | {
|
---|
324 | if(*treePtr==NULL)
|
---|
325 | {
|
---|
326 | *treePtr=malloc(sizeof(HjTree));
|
---|
327 | assert (*treePtr);
|
---|
328 | (*treePtr)->less=NULL;
|
---|
329 | (*treePtr)->more=NULL;
|
---|
330 | (*treePtr)->dj=freq;
|
---|
331 | (*treePtr)->hj=1;
|
---|
332 | (*treePtr)->pos=item->pos;
|
---|
333 | (*treePtr)->interestVal=0.0;
|
---|
334 | if (root==NULL) root=(*treePtr);
|
---|
335 | tree = (*treePtr);
|
---|
336 | break;
|
---|
337 | }
|
---|
338 | else
|
---|
339 | {
|
---|
340 | if(tree->pos==item->pos)
|
---|
341 | {
|
---|
342 | tree->hj++;
|
---|
343 | break;
|
---|
344 | }
|
---|
345 | else if((*treePtr)->dj < freq)
|
---|
346 | {
|
---|
347 | treePtr = &(tree->more);
|
---|
348 | tree = (*treePtr);
|
---|
349 | }
|
---|
350 | else
|
---|
351 | {
|
---|
352 | treePtr= &(tree->less);
|
---|
353 | tree = (*treePtr);
|
---|
354 | }
|
---|
355 | }
|
---|
356 | }
|
---|
357 | return root;
|
---|
358 | }
|
---|
359 |
|
---|
360 | void calcInterestVal(HjTreePtr tree,size_t vectorSize)
|
---|
361 | {
|
---|
362 | size_t hj,dj;
|
---|
363 |
|
---|
364 | while(tree)
|
---|
365 | {
|
---|
366 | hj=tree->hj;
|
---|
367 | dj=tree->dj;
|
---|
368 | if(tree->more)
|
---|
369 | calcInterestVal(tree->more, vectorSize);
|
---|
370 | tree->interestVal=((double)hj/(double)dj)*hj*log((double)vectorSize/hj);
|
---|
371 | tree=tree->less;
|
---|
372 | }
|
---|
373 | }
|
---|
374 |
|
---|
375 | HjTreePtr getLeaf(HjTreePtr *tree)
|
---|
376 | {
|
---|
377 | HjTreePtr result;
|
---|
378 | if((*tree)->more) return getLeaf(&((*tree)->more));
|
---|
379 | if((*tree)->less) return getLeaf(&((*tree)->less));
|
---|
380 | else result=*tree;
|
---|
381 | *tree=NULL;
|
---|
382 | return result;
|
---|
383 | }
|
---|
384 |
|
---|
385 | HjTreePtr insertByInterestVal(HjTreePtr tree,HjTreePtr node)
|
---|
386 | {
|
---|
387 | if(tree==NULL)
|
---|
388 | tree=node;
|
---|
389 | else if(tree->interestVal < node->interestVal)
|
---|
390 | tree->more=insertByInterestVal(tree->more,node);
|
---|
391 | else
|
---|
392 | tree->less=insertByInterestVal(tree->less,node);
|
---|
393 | return tree;
|
---|
394 | }
|
---|
395 |
|
---|
396 | HjTreePtr treeByInterestVal(HjTreePtr *tree, HjTreePtr newTree)
|
---|
397 | {
|
---|
398 | HjTreePtr node;
|
---|
399 | while(*tree)
|
---|
400 | {
|
---|
401 | node=getLeaf(tree);
|
---|
402 | if(node)newTree=insertByInterestVal(newTree, node);
|
---|
403 | }
|
---|
404 | return newTree;
|
---|
405 | }
|
---|
406 |
|
---|
407 | size_t fillInterestArray(HjTreePtr tree, size_t max, size_t count,
|
---|
408 | InterestItem *itemArray)
|
---|
409 | {
|
---|
410 | while(tree)
|
---|
411 | {
|
---|
412 | if(count==max) return count;
|
---|
413 | if(tree->more)
|
---|
414 | count=fillInterestArray(tree->more,max, count,itemArray );
|
---|
415 | if(count==max) return count;
|
---|
416 | itemArray[count].interestVal = tree->interestVal;
|
---|
417 | itemArray[count].itemPos = tree->pos;
|
---|
418 | itemArray[count].dj = tree->dj;
|
---|
419 | count++;
|
---|
420 | tree=tree->less;
|
---|
421 | }
|
---|
422 | return count;
|
---|
423 | }
|
---|
424 |
|
---|
425 | void vapeTree(HjTreePtr tree)
|
---|
426 | {
|
---|
427 | if(!tree)return;
|
---|
428 | vapeTree(tree->more);
|
---|
429 | vapeTree(tree->less);
|
---|
430 | free(tree);
|
---|
431 | }
|
---|