source: trunk/greenstone3-extensions/vishnu/src/cluster/getKeywords.c@ 8189

Last change on this file since 8189 was 8189, checked in by kjdon, 20 years ago

first version of Imperial College's Visualiser code

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1#include <jni.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <string.h>
5#include <assert.h>
6#include <math.h>
7#include <ctype.h>
8#include "Extractor.h"
9#include "getKeywords.h"
10
11char slash='/';
12
13JNIEXPORT jstring JNICALL Java_Extractor_getKW(JNIEnv * env,jobject thisObj, jstring collection, jintArray arr)
14{
15
16 int i;
17 jstring jresult;
18 char * buffer = (char *) malloc(20000);
19 const jbyte * coll_byte;
20 char * coll;
21 int docs[500];
22
23 jsize len = (*env)->GetArrayLength(env, arr);
24 jint *body = (*env)->GetIntArrayElements(env, arr, 0);
25
26 coll_byte = (*env)->GetStringUTFChars(env, collection, NULL);
27 if( coll_byte == NULL )
28 {
29 fprintf(stderr,"Returning null");
30 return NULL;
31 }
32
33 coll = coll_byte;
34
35 for( i = 0; i < len; i++ ) docs[i] = body[i];
36
37 mediate(docs,len,buffer, coll);
38 jresult = (*env)->NewStringUTF(env, buffer);
39
40 (*env)->ReleaseStringUTFChars(env, collection, coll_byte);
41 (*env)->ReleaseIntArrayElements(env,arr,body,0);
42 return jresult;
43}
44
45
46void mediate(int* docs, int length, char *buffer, char *collection)
47{
48 work(docs,length,buffer,collection);
49 return;
50}
51
52
53DocData *getIntVector(size_t docToRead,FILE *docTableFp,
54 FILE *dtNdxFp, size_t maxDoc, size_t tableLen,
55 size_t *recCount)
56{
57 size_t recSize, docInf,docNext,c;
58 DocData *data;
59 recSize=sizeof(size_t)+sizeof(short);
60
61 fseek(dtNdxFp,(docToRead-1)*sizeof(size_t),SEEK_SET);
62 fread(&docInf,sizeof(size_t),1,dtNdxFp);
63
64 if(docToRead==1)
65 docInf=0;
66
67 fseek(docTableFp,docInf,SEEK_SET);
68
69 if(docToRead<maxDoc)fread(&docNext,sizeof(size_t),1,dtNdxFp);
70 else docNext=tableLen;
71
72 *recCount=(docNext-docInf)/recSize;
73 data=(DocData *)malloc(*recCount*sizeof(DocData));
74 assert(data);
75 for(c=0;c < *recCount;c++)
76 {
77 fread(&data[c],recSize,1,docTableFp);
78 if(feof(docTableFp)) break;
79 }
80 return data;
81}
82
83char * appendString(char * buffer, int * cursize, int * curptr, char *data, int len)
84{
85 if (!len) return (buffer);
86 if (len + *curptr + 1 > *cursize) {
87 while (len + *curptr + 1 > *cursize) *cursize += 256;
88 buffer = (char *) realloc ((void *) buffer, *cursize);
89 }
90 strncpy (buffer + *curptr, data, len);
91 *curptr += len;
92 buffer[*curptr] = '\0';
93 return (buffer);
94}
95
96void work(int *docNums, int numDocs, char * buffer, char *coll)
97{
98 DocRec *allVectors;
99 InterestItem *itemArray;
100
101 char bbuf[40];
102 int cursize = 20000;
103 int curptr = 0;
104
105 size_t arraySize;
106 size_t maxDoc;
107 FILE *index;
108 FILE *data;
109 FILE *words;
110 FILE *binndx;
111 FILE *freqs;
112 char *dir, indexName[STRING_BUFF_LEN],dataName[STRING_BUFF_LEN],wordsName[STRING_BUFF_LEN],
113 binndxName[STRING_BUFF_LEN],freqName[STRING_BUFF_LEN], buff[STRING_BUFF_LEN],
114 indexDir[STRING_BUFF_LEN];
115
116 size_t *freqList,wordCount,c;
117 size_t n,vectorCount, offset;
118 HjTreePtr tree;
119
120 allVectors = calloc(MAX_DOCUMENTS,sizeof(DocRec));
121 assert(allVectors);
122 itemArray = calloc(MAX_IWORDS,sizeof(InterestItem));
123 assert(itemArray);
124
125 dir = "\0";
126 dir = getenv("GSDL3HOME");
127
128 sprintf(indexDir,"%s%cweb/sites/localsite/collect/%s/cw_index/",dir,slash,coll);
129 dir = getenv("GSDL3HOME");
130
131 sprintf(indexName,"%s%cdoctable.ndx",indexDir,slash);
132 sprintf(dataName,"%s%cdoctable.data",indexDir,slash);
133 sprintf(wordsName,"%s%cnewinterest.txt",indexDir,slash);
134 sprintf(binndxName,"%s%cnewbnndexFn.ndx",indexDir,slash);
135 sprintf(freqName,"%s%cnewdocFreq.dat",indexDir,slash);
136
137 index = fopen(indexName,"rb");
138 assert(index);
139 data = fopen(dataName,"rb");
140 assert(data);
141 words=fopen(wordsName,"r");
142 assert(words);
143 binndx=fopen(binndxName,"rb");
144 assert(binndx);
145 freqs=fopen(freqName,"rb");
146 assert(freqs);
147
148 wordCount=fileSize(binndx)/sizeof(size_t);
149 fread(&maxDoc,sizeof(size_t),1,index);
150 freqList=malloc(wordCount*sizeof(size_t));
151 assert(freqList);
152
153 // reads at most wordCount elements of size size_t into freqList
154 n=fread(freqList,sizeof(size_t),wordCount,freqs);
155 if(n != wordCount) fprintf(stderr,"n %d wordCount %d\n",n,wordCount);
156 assert(n==wordCount);
157 fclose(freqs);
158
159 vectorCount=fillAllVectors(allVectors, docNums, numDocs, data,index,maxDoc);
160
161 tree=constructWordList(freqList, allVectors, vectorCount);
162 arraySize = fillInterestArray(tree, MAX_IWORDS, 0,itemArray);
163 vapeTree(tree);
164
165 *buffer = '\0';
166
167 // print interesting words
168 n= sprintf(bbuf,"%d\n",arraySize);
169 buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
170
171 for(c=0;c<arraySize;c++)
172 {
173 fseek(binndx,itemArray[c].itemPos*sizeof(size_t),SEEK_SET);
174 fread(&offset,sizeof(size_t),1,binndx);
175 fseek(words,offset,SEEK_SET);
176 fgets(buff,STRING_BUFF_LEN,words);
177 buffer = appendString(buffer,&cursize,&curptr,buff,strlen(buff));
178 }
179
180 for(c=0;c<vectorCount;c++)
181 {
182 free(allVectors[c].data);
183 }
184
185 fclose(words);
186 fclose(index);
187 fclose(binndx);
188 fclose(data);
189
190 free(allVectors);
191 free(freqList);
192}
193
194
195size_t getDocNum(FILE *fp, char *buff)
196{
197 char *p;
198 int c;
199 size_t result;
200 do
201 {
202 c = fgetc(fp);
203
204 } while (!isdigit(c)&& !feof(fp));
205
206 p=buff;
207
208 while (isdigit(c)&& !feof(fp))
209 {
210 *(p++) = c;
211 c = fgetc(fp);
212 }
213 *p=0;
214 result= atoi(buff);
215 if(result)
216 {
217 for (c=0;c<DESCRIPTION_SIZE && !feof(fp);c++)
218 {
219 buff[c]=fgetc(fp);
220 if(buff[c]==2)
221 {
222 buff[c]=0;
223 break;
224 }
225
226 }
227 }
228 return result;
229}
230
231size_t fileSize(FILE *fp)
232{
233 size_t len;
234 fseek(fp,0,SEEK_END);
235 len=ftell(fp);
236 rewind(fp);
237 return len;
238}
239
240size_t fillAllVectors(DocRec *allVectors, int* docs, int length, FILE *docTableFp,FILE *dtNdxFp, size_t maxDoc)
241{
242 size_t tableLen, recCount, vectorCount=0,docNum=1;
243
244 tableLen=fileSize(docTableFp);
245
246 while (docNum > 0 && vectorCount < length && vectorCount < MAX_DOCUMENTS && maxDoc!=0)
247 {
248 docNum = docs[vectorCount];
249 if(docNum>0)
250 {
251 allVectors[vectorCount].data=getIntVector(docNum,docTableFp,dtNdxFp, maxDoc, tableLen, &recCount);
252 allVectors[vectorCount].count=recCount;
253 allVectors[vectorCount].docNum=docNum;
254 vectorCount++;
255 }
256
257 }
258 return vectorCount;
259}
260
261HjTreePtr constructWordList(size_t *freqList,DocRec *allVectors, size_t vectorSize)
262{
263 HjTreePtr tree=NULL,newTree=NULL;
264 size_t dj, c,d, e, m,n,pos;
265
266 for(c=0;c<vectorSize;c++)
267 {
268 n=allVectors[c].count;
269 m=n/2;
270 for(d=m,e=m-1;d<n;d++,e--)
271 {
272
273 //allVectors[c].data[d].pos;
274 //this is an insane device to get a flatter tree from already ordered data by
275 //jumbling up the order that it is added to the tree
276
277 if(e>0)
278 {
279 pos=allVectors[c].data[e].pos;
280 dj=freqList[pos];
281 tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
282
283 pos=allVectors[c].data[d].pos;
284 dj=freqList[pos];
285 tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
286 }
287 if(e==0)
288 {
289 pos=allVectors[c].data[e].pos;
290 dj=freqList[pos];
291 tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
292
293 pos=allVectors[c].data[d].pos;
294 dj=freqList[pos];
295 tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
296
297 if( n % 2 == 1 )
298 {
299 pos=allVectors[c].data[++d].pos;
300 dj=freqList[pos];
301 tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
302 break;
303 }
304
305
306 }
307
308 }
309 }
310
311 // get candidate keyword weights
312 calcInterestVal(tree,vectorSize);
313 newTree=treeByInterestVal(&tree,newTree);
314 return newTree;
315
316}
317
318HjTreePtr buildHjTree(HjTreePtr tree, DocData *item, size_t freq)
319{
320 HjTreePtr root=tree, /*lastNode,*/ *treePtr;
321 treePtr = &tree;
322 while(1)
323 {
324 if(*treePtr==NULL)
325 {
326 *treePtr=malloc(sizeof(HjTree));
327 assert (*treePtr);
328 (*treePtr)->less=NULL;
329 (*treePtr)->more=NULL;
330 (*treePtr)->dj=freq;
331 (*treePtr)->hj=1;
332 (*treePtr)->pos=item->pos;
333 (*treePtr)->interestVal=0.0;
334 if (root==NULL) root=(*treePtr);
335 tree = (*treePtr);
336 break;
337 }
338 else
339 {
340 if(tree->pos==item->pos)
341 {
342 tree->hj++;
343 break;
344 }
345 else if((*treePtr)->dj < freq)
346 {
347 treePtr = &(tree->more);
348 tree = (*treePtr);
349 }
350 else
351 {
352 treePtr= &(tree->less);
353 tree = (*treePtr);
354 }
355 }
356 }
357 return root;
358}
359
360void calcInterestVal(HjTreePtr tree,size_t vectorSize)
361{
362 size_t hj,dj;
363
364 while(tree)
365 {
366 hj=tree->hj;
367 dj=tree->dj;
368 if(tree->more)
369 calcInterestVal(tree->more, vectorSize);
370 tree->interestVal=((double)hj/(double)dj)*hj*log((double)vectorSize/hj);
371 tree=tree->less;
372 }
373}
374
375HjTreePtr getLeaf(HjTreePtr *tree)
376{
377 HjTreePtr result;
378 if((*tree)->more) return getLeaf(&((*tree)->more));
379 if((*tree)->less) return getLeaf(&((*tree)->less));
380 else result=*tree;
381 *tree=NULL;
382 return result;
383}
384
385HjTreePtr insertByInterestVal(HjTreePtr tree,HjTreePtr node)
386{
387 if(tree==NULL)
388 tree=node;
389 else if(tree->interestVal < node->interestVal)
390 tree->more=insertByInterestVal(tree->more,node);
391 else
392 tree->less=insertByInterestVal(tree->less,node);
393 return tree;
394}
395
396HjTreePtr treeByInterestVal(HjTreePtr *tree, HjTreePtr newTree)
397{
398 HjTreePtr node;
399 while(*tree)
400 {
401 node=getLeaf(tree);
402 if(node)newTree=insertByInterestVal(newTree, node);
403 }
404 return newTree;
405}
406
407size_t fillInterestArray(HjTreePtr tree, size_t max, size_t count,
408 InterestItem *itemArray)
409{
410 while(tree)
411 {
412 if(count==max) return count;
413 if(tree->more)
414 count=fillInterestArray(tree->more,max, count,itemArray );
415 if(count==max) return count;
416 itemArray[count].interestVal = tree->interestVal;
417 itemArray[count].itemPos = tree->pos;
418 itemArray[count].dj = tree->dj;
419 count++;
420 tree=tree->less;
421 }
422 return count;
423}
424
425void vapeTree(HjTreePtr tree)
426{
427 if(!tree)return;
428 vapeTree(tree->more);
429 vapeTree(tree->less);
430 free(tree);
431}
Note: See TracBrowser for help on using the repository browser.