1 | #include <jni.h>
|
---|
2 | #include <stdio.h>
|
---|
3 | #include <sys/types.h>
|
---|
4 | #include <unistd.h>
|
---|
5 | #include <sys/wait.h>
|
---|
6 | #include <stdlib.h>
|
---|
7 | #include <string.h>
|
---|
8 | #include <assert.h>
|
---|
9 | #include <math.h>
|
---|
10 | #include <ctype.h>
|
---|
11 | #include <sys/times.h>
|
---|
12 | #ifndef CLK_TCK
|
---|
13 | #include <time.h>
|
---|
14 | #endif
|
---|
15 | //#include "Cluster.h"
|
---|
16 | #include "vishnu_server_Cluster.h"
|
---|
17 | #include "procresults.h"
|
---|
18 | #include "clustering.h"
|
---|
19 |
|
---|
20 | char slash='/';
|
---|
21 |
|
---|
22 |
|
---|
23 | JNIEXPORT jstring JNICALL Java_vishnu_server_Cluster_getCC(JNIEnv * env,jobject thisObj,
|
---|
24 | jintArray arr, jstring collection_home)
|
---|
25 | {
|
---|
26 | fprintf(stderr,"in getCC");
|
---|
27 | int i;
|
---|
28 | jstring jresult;
|
---|
29 | char * buffer = (char *) malloc(20000);
|
---|
30 | const jbyte * coll_byte;
|
---|
31 | char * coll_home;
|
---|
32 | int docs[500];
|
---|
33 |
|
---|
34 | jsize len = (*env)->GetArrayLength(env, arr);
|
---|
35 | jint *body = (*env)->GetIntArrayElements(env, arr, 0);
|
---|
36 |
|
---|
37 | coll_byte = (*env)->GetStringUTFChars(env, collection_home, NULL);
|
---|
38 | if( coll_byte == NULL )
|
---|
39 | {
|
---|
40 | fprintf(stderr,"getCC: colleciton_home is null, returning null");
|
---|
41 | return NULL;
|
---|
42 | }
|
---|
43 | coll_home = coll_byte;
|
---|
44 |
|
---|
45 | for( i = 0; i < len; i++ ) docs[i] = body[i];
|
---|
46 |
|
---|
47 | mediate(docs,len,buffer, coll_home);
|
---|
48 | jresult = (*env)->NewStringUTF(env, buffer);
|
---|
49 |
|
---|
50 | (*env)->ReleaseStringUTFChars(env, collection_home, coll_byte);
|
---|
51 | (*env)->ReleaseIntArrayElements(env,arr,body,0);
|
---|
52 | return jresult;
|
---|
53 | }
|
---|
54 |
|
---|
55 |
|
---|
56 | void mediate(int* docs, int length, char *buffer, char *coll_home)
|
---|
57 | {
|
---|
58 | work(docs,length,buffer,coll_home);
|
---|
59 | return;
|
---|
60 | }
|
---|
61 |
|
---|
62 |
|
---|
63 | DocData *getIntVector(size_t docToRead,FILE *docTableFp,
|
---|
64 | FILE *dtNdxFp, size_t maxDoc, size_t tableLen,
|
---|
65 | size_t *recCount)
|
---|
66 | {
|
---|
67 | size_t recSize, docInf,docNext,c;
|
---|
68 | DocData *data;
|
---|
69 | recSize=sizeof(size_t)+sizeof(short);
|
---|
70 |
|
---|
71 | fseek(dtNdxFp,(docToRead-1)*sizeof(size_t),SEEK_SET);
|
---|
72 | fread(&docInf,sizeof(size_t),1,dtNdxFp);
|
---|
73 |
|
---|
74 | if(docToRead==1)
|
---|
75 | docInf=0;
|
---|
76 |
|
---|
77 | fseek(docTableFp,docInf,SEEK_SET);
|
---|
78 |
|
---|
79 | if(docToRead<maxDoc)fread(&docNext,sizeof(size_t),1,dtNdxFp);
|
---|
80 | else docNext=tableLen;
|
---|
81 |
|
---|
82 |
|
---|
83 |
|
---|
84 | *recCount=(docNext-docInf)/recSize;
|
---|
85 | data=(DocData *)malloc(*recCount*sizeof(DocData));
|
---|
86 |
|
---|
87 | assert(data);
|
---|
88 | for(c=0;c < *recCount;c++)
|
---|
89 | {
|
---|
90 | fread(&data[c],recSize,1,docTableFp);
|
---|
91 | if(feof(docTableFp)) break;
|
---|
92 | }
|
---|
93 | return data;
|
---|
94 | }
|
---|
95 |
|
---|
96 | void buildMatrix(TranslationItemPtr transTable,
|
---|
97 | size_t arraySize,DocRec *allVectors,
|
---|
98 | size_t vectorCount, MatrixNodePtr *matrix)
|
---|
99 | {
|
---|
100 | MatrixNodePtr newNode;
|
---|
101 | size_t c,d,e;
|
---|
102 | for(c=0;c<vectorCount;c++)
|
---|
103 | {
|
---|
104 | e=0;
|
---|
105 | matrix[c]=NULL;
|
---|
106 | for(d=0;d<allVectors[c].count && e < arraySize;d++)
|
---|
107 | {
|
---|
108 |
|
---|
109 | while(allVectors[c].data[d].pos>transTable[e].itemPos && e < arraySize)
|
---|
110 | {
|
---|
111 | e++;
|
---|
112 | }
|
---|
113 | if(e < arraySize && allVectors[c].data[d].pos==transTable[e].itemPos)
|
---|
114 | {
|
---|
115 | /* this for straight text output
|
---|
116 | can't do this as some vectors are empty
|
---|
117 | printf("%s%d,%d",matrix[c]==NULL?"":",",
|
---|
118 | transTable[e].newColumn,allVectors[c].data[d].freq);
|
---|
119 | this is for a jni interface? */
|
---|
120 | newNode=malloc(sizeof(MatrixNode));
|
---|
121 | assert(newNode);
|
---|
122 | newNode->next=matrix[c];
|
---|
123 | newNode->column=transTable[e].newColumn;
|
---|
124 | newNode->freq=allVectors[c].data[d].freq;
|
---|
125 | matrix[c]=newNode;
|
---|
126 | }
|
---|
127 |
|
---|
128 |
|
---|
129 | }
|
---|
130 | //printf("\n");
|
---|
131 | }
|
---|
132 | }
|
---|
133 |
|
---|
134 |
|
---|
135 | int removeEmptyRows(MatrixNodePtr **matrixPtr, DocRec **allVectorsPtr,char ***descriptionsPtr,int vectorCount)
|
---|
136 | {
|
---|
137 | DocRec *newAllVectors;
|
---|
138 | MatrixNodePtr *newMatrix, current;
|
---|
139 | char **newDescriptions;
|
---|
140 | int c,d;
|
---|
141 |
|
---|
142 | newAllVectors = calloc(MAX_DOCUMENTS,sizeof(DocRec));
|
---|
143 | assert(newAllVectors);
|
---|
144 | newMatrix = calloc(MAX_DOCUMENTS,sizeof(MatrixNodePtr));
|
---|
145 | assert(newMatrix);
|
---|
146 | newDescriptions=calloc(MAX_DOCUMENTS,sizeof(char *));
|
---|
147 | assert(newDescriptions);
|
---|
148 | // for(d=0,c=0;c<vectorCount;c++)
|
---|
149 | // fprintf(stderr,"current %p\n",(*matrixPtr)[c]);
|
---|
150 | for(d=0,c=0;c<vectorCount;c++)
|
---|
151 | {
|
---|
152 | current=(*matrixPtr)[c];
|
---|
153 | // fprintf(stderr,"vector %d\n",c);
|
---|
154 | fflush(stderr);
|
---|
155 | if(current != NULL)
|
---|
156 | {
|
---|
157 | // fprintf(stderr,"current %p allvectors %p descriptions %p\n",current,(*allVectorsPtr)[c],(*descriptionsPtr)[c]);
|
---|
158 | newMatrix[d] = current;
|
---|
159 | newAllVectors[d] = (*allVectorsPtr)[c];
|
---|
160 | newDescriptions[d] = (*descriptionsPtr)[c];
|
---|
161 |
|
---|
162 | // fprintf(stderr,"copied to new vector %d\n",d);
|
---|
163 | // fflush(stderr);
|
---|
164 | d++;
|
---|
165 | }
|
---|
166 | else
|
---|
167 | {
|
---|
168 | free((*allVectorsPtr)[c].data);
|
---|
169 | free((*descriptionsPtr)[c]);
|
---|
170 | }
|
---|
171 | }
|
---|
172 | free(*matrixPtr);
|
---|
173 | free(*descriptionsPtr);
|
---|
174 | free(*allVectorsPtr);
|
---|
175 | *matrixPtr = newMatrix;
|
---|
176 | *descriptionsPtr=newDescriptions;
|
---|
177 | *allVectorsPtr=newAllVectors;
|
---|
178 | return d;
|
---|
179 | }
|
---|
180 |
|
---|
181 |
|
---|
182 | int main(int argc, char **argv)
|
---|
183 | {
|
---|
184 | return 0;
|
---|
185 | }
|
---|
186 |
|
---|
187 |
|
---|
188 | char * appendString(char * buffer, int * cursize, int * curptr, char *data, int len)
|
---|
189 | {
|
---|
190 | if (!len) return (buffer);
|
---|
191 | if (len + *curptr + 1 > *cursize) {
|
---|
192 | fflush(stderr);
|
---|
193 | while (len + *curptr + 1 > *cursize) *cursize += 256;
|
---|
194 | buffer = (char *) realloc ((void *) buffer, *cursize);
|
---|
195 | }
|
---|
196 | strncpy (buffer + *curptr, data, len);
|
---|
197 | *curptr += len;
|
---|
198 | buffer[*curptr] = '\0';
|
---|
199 | return (buffer);
|
---|
200 | }
|
---|
201 |
|
---|
202 | void work(int *docNums, int numDocs, char * buffer, char *coll_home)
|
---|
203 | {
|
---|
204 | time_t baseTime,wordListTime;
|
---|
205 | struct tms *timer;
|
---|
206 |
|
---|
207 | DocRec *allVectors;
|
---|
208 | InterestItem *itemArray;
|
---|
209 | TranslationItem *transTable;
|
---|
210 | MatrixNodePtr *matrix,current,last;
|
---|
211 | char **descriptions;
|
---|
212 |
|
---|
213 | char bbuf[40];
|
---|
214 | int cursize = 20000;
|
---|
215 | int curptr = 0;
|
---|
216 |
|
---|
217 | int clusterNum = 10;
|
---|
218 | int limit = 20;
|
---|
219 |
|
---|
220 | size_t arraySize;
|
---|
221 | size_t maxDoc;
|
---|
222 | FILE *index;
|
---|
223 | FILE *data;
|
---|
224 | FILE *words;
|
---|
225 | FILE *binndx;
|
---|
226 | FILE *freqs;
|
---|
227 | char *dir, indexName[STRING_BUFF_LEN],dataName[STRING_BUFF_LEN],wordsName[STRING_BUFF_LEN],
|
---|
228 | binndxName[STRING_BUFF_LEN],freqName[STRING_BUFF_LEN], buff[STRING_BUFF_LEN],
|
---|
229 | indexDir[STRING_BUFF_LEN];
|
---|
230 |
|
---|
231 | size_t *freqList,wordCount,c;
|
---|
232 | size_t n,vectorCount, offset;
|
---|
233 | HjTreePtr tree;
|
---|
234 |
|
---|
235 | timer = (struct tms *) malloc(sizeof (struct tms));
|
---|
236 |
|
---|
237 | fprintf(stderr,"About to get into ck data\n");
|
---|
238 |
|
---|
239 | allVectors = calloc(MAX_DOCUMENTS,sizeof(DocRec));
|
---|
240 | assert(allVectors);
|
---|
241 | itemArray = calloc(MAX_IWORDS,sizeof(InterestItem));
|
---|
242 | assert(itemArray);
|
---|
243 | transTable= calloc(MAX_IWORDS,sizeof(TranslationItem));
|
---|
244 | assert(transTable);
|
---|
245 | matrix = calloc(MAX_DOCUMENTS,sizeof(MatrixNodePtr));
|
---|
246 | assert(matrix);
|
---|
247 | descriptions=calloc(MAX_DOCUMENTS,sizeof(char *));
|
---|
248 | assert(descriptions);
|
---|
249 |
|
---|
250 | //dir = "\0";
|
---|
251 | //dir = getenv("GSDL3HOME");
|
---|
252 | //sprintf(indexDir,"%s%cweb/sites/localsite/collect/%s/index/ck_index",dir,slash,coll);
|
---|
253 | sprintf(indexDir, "%sindex%cck_index", coll_home, slash);
|
---|
254 | //dir = getenv("GSDL3HOME");
|
---|
255 | //fprintf(stderr,dir);
|
---|
256 |
|
---|
257 | sprintf(indexName,"%s%cdoctable.ndx",indexDir,slash);
|
---|
258 | sprintf(dataName,"%s%cdoctable.data",indexDir,slash);
|
---|
259 | sprintf(wordsName,"%s%cnewinterest.txt",indexDir,slash);
|
---|
260 | sprintf(binndxName,"%s%cnewbnndexFn.ndx",indexDir,slash);
|
---|
261 | sprintf(freqName,"%s%cnewdocFreq.dat",indexDir,slash);
|
---|
262 |
|
---|
263 | fprintf(stderr,indexName);
|
---|
264 |
|
---|
265 | index = fopen(indexName,"rb");
|
---|
266 | assert(index);
|
---|
267 | data = fopen(dataName,"rb");
|
---|
268 | assert(data);
|
---|
269 | words=fopen(wordsName,"r");
|
---|
270 | assert(words);
|
---|
271 | binndx=fopen(binndxName,"rb");
|
---|
272 | assert(binndx);
|
---|
273 | freqs=fopen(freqName,"rb");
|
---|
274 | assert(freqs);
|
---|
275 |
|
---|
276 | wordCount=fileSize(binndx)/sizeof(size_t);
|
---|
277 | fread(&maxDoc,sizeof(size_t),1,index);
|
---|
278 | freqList=malloc(wordCount*sizeof(size_t));
|
---|
279 | assert(freqList);
|
---|
280 |
|
---|
281 | // reads at most wordCount elements of size size_t into freqList
|
---|
282 | n=fread(freqList,sizeof(size_t),wordCount,freqs);
|
---|
283 | if(n != wordCount) fprintf(stderr,"n %d wordCount %d\n",n,wordCount);
|
---|
284 | assert(n==wordCount);
|
---|
285 | fclose(freqs);
|
---|
286 |
|
---|
287 | vectorCount=fillAllVectors(allVectors, docNums, numDocs, data,index,maxDoc, descriptions);
|
---|
288 |
|
---|
289 | // begin processing results
|
---|
290 | baseTime= times (timer);
|
---|
291 | tree=constructWordList(freqList, allVectors, vectorCount);
|
---|
292 | wordListTime= times(timer);
|
---|
293 |
|
---|
294 | arraySize = fillInterestArray(tree, MAX_IWORDS, 0,itemArray);
|
---|
295 | vapeTree(tree);
|
---|
296 |
|
---|
297 | for (c=0;c<arraySize;c++)
|
---|
298 | {
|
---|
299 | transTable[c].newColumn=c;
|
---|
300 | transTable[c].itemPos=itemArray[c].itemPos;
|
---|
301 | }
|
---|
302 |
|
---|
303 | qsort((void *)transTable,arraySize,sizeof(TranslationItem),cmpTransTable);
|
---|
304 | buildMatrix( transTable, arraySize,allVectors,vectorCount, matrix);
|
---|
305 |
|
---|
306 | //remove empty rows
|
---|
307 | fflush(stderr);
|
---|
308 | vectorCount= removeEmptyRows(&matrix,&allVectors,&descriptions,vectorCount);
|
---|
309 | fflush(stderr);
|
---|
310 |
|
---|
311 | *buffer = '\0';
|
---|
312 |
|
---|
313 | n = sprintf(bbuf,"%d\n",vectorCount);
|
---|
314 | buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
|
---|
315 |
|
---|
316 | for(c=0;c<vectorCount;c++)
|
---|
317 | {
|
---|
318 | n=sprintf(bbuf,"%d\n",allVectors[c].docNum);
|
---|
319 | appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
|
---|
320 | }
|
---|
321 |
|
---|
322 | fprintf(stderr,"Buffer: %s\n",buffer);
|
---|
323 | // print interesting words
|
---|
324 | n= sprintf(bbuf,"%d\n",arraySize);
|
---|
325 | buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
|
---|
326 |
|
---|
327 | for(c=0;c<arraySize;c++)
|
---|
328 | {
|
---|
329 | fseek(binndx,itemArray[c].itemPos*sizeof(size_t),SEEK_SET);
|
---|
330 | fread(&offset,sizeof(size_t),1,binndx);
|
---|
331 | fseek(words,offset,SEEK_SET);
|
---|
332 | fgets(buff,STRING_BUFF_LEN,words);
|
---|
333 | buffer = appendString(buffer,&cursize,&curptr,buff,strlen(buff));
|
---|
334 | //printf("%s",buff);
|
---|
335 | }
|
---|
336 |
|
---|
337 | // print matrix of words in documents
|
---|
338 | n= sprintf(bbuf,"%d\n",vectorCount);
|
---|
339 | buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
|
---|
340 |
|
---|
341 | for(c=0;c<vectorCount;c++)
|
---|
342 | {
|
---|
343 | current=matrix[c];
|
---|
344 | while(current)
|
---|
345 | {
|
---|
346 | n=sprintf(bbuf,"%d,%d%s",current->freq,current->column,current->next?",":"");
|
---|
347 | buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
|
---|
348 | current=current->next;
|
---|
349 | }
|
---|
350 | buffer = appendString(buffer,&cursize,&curptr,"\n",1);
|
---|
351 | }
|
---|
352 |
|
---|
353 | fflush(stdout);
|
---|
354 |
|
---|
355 | // calculate and print clusters and sammon mapping
|
---|
356 | clusterNum = vectorCount>20?10:vectorCount/2;
|
---|
357 |
|
---|
358 | fprintf(stderr,"Cluster number: %d\n",clusterNum);
|
---|
359 |
|
---|
360 | limit = vectorCount>20?20:(vectorCount/2)+(vectorCount/4);
|
---|
361 |
|
---|
362 | if (vectorCount > 6)
|
---|
363 | {
|
---|
364 | clustering(COMPLETE_LINKAGE,matrix,vectorCount,arraySize,1,clusterNum,limit,buffer,&cursize,&curptr);
|
---|
365 | }
|
---|
366 | else if (vectorCount > 0)
|
---|
367 | {
|
---|
368 | printf("1\n");
|
---|
369 | for (c=0;c<vectorCount;c++){
|
---|
370 | printf("%d%s",c,c>=(vectorCount-1)?"\n":",");
|
---|
371 | fprintf(stderr,"%d%s",c,c>=(vectorCount-1)?"\n":",");
|
---|
372 | }
|
---|
373 | printf("1\n0.5,0.5\n");
|
---|
374 | }
|
---|
375 | else printf("0\n0\n");
|
---|
376 |
|
---|
377 | for(c=0;c<vectorCount;c++)
|
---|
378 | {
|
---|
379 | current=matrix[c];
|
---|
380 | while(current)
|
---|
381 | {
|
---|
382 | last=current;
|
---|
383 | current=current->next;
|
---|
384 | free(last);
|
---|
385 | }
|
---|
386 | free(descriptions[c]);
|
---|
387 | free(allVectors[c].data);
|
---|
388 | }
|
---|
389 |
|
---|
390 | fclose(words);
|
---|
391 | fclose(index);
|
---|
392 | fclose(binndx);
|
---|
393 | fclose(data);
|
---|
394 |
|
---|
395 | free(allVectors);
|
---|
396 | free(descriptions);
|
---|
397 | free(itemArray);
|
---|
398 | free(transTable);
|
---|
399 | free(matrix);
|
---|
400 | free(freqList);
|
---|
401 | free(timer);
|
---|
402 | }
|
---|
403 |
|
---|
404 |
|
---|
405 | int cmpTransTable(const void *a,const void *b)
|
---|
406 | {
|
---|
407 | TranslationItemPtr aT, bT;
|
---|
408 | aT = (TranslationItemPtr) a;
|
---|
409 | bT = (TranslationItemPtr) b;
|
---|
410 | return aT->itemPos - bT->itemPos;
|
---|
411 | }
|
---|
412 |
|
---|
413 |
|
---|
414 | size_t getDocNum(FILE *fp, char *buff)
|
---|
415 | {
|
---|
416 | char *p;
|
---|
417 | int c;
|
---|
418 | size_t result;
|
---|
419 | do
|
---|
420 | {
|
---|
421 | c = fgetc(fp);
|
---|
422 |
|
---|
423 | } while (!isdigit(c)&& !feof(fp));
|
---|
424 |
|
---|
425 | p=buff;
|
---|
426 |
|
---|
427 | while (isdigit(c)&& !feof(fp))
|
---|
428 | {
|
---|
429 | *(p++) = c;
|
---|
430 | c = fgetc(fp);
|
---|
431 | }
|
---|
432 | *p=0;
|
---|
433 | result= atoi(buff);
|
---|
434 | if(result)
|
---|
435 | {
|
---|
436 | for (c=0;c<DESCRIPTION_SIZE && !feof(fp);c++)
|
---|
437 | {
|
---|
438 | buff[c]=fgetc(fp);
|
---|
439 | if(buff[c]==2)
|
---|
440 | {
|
---|
441 | buff[c]=0;
|
---|
442 | break;
|
---|
443 | }
|
---|
444 |
|
---|
445 | }
|
---|
446 | }
|
---|
447 | return result;
|
---|
448 | }
|
---|
449 |
|
---|
450 |
|
---|
451 | size_t fileSize(FILE *fp)
|
---|
452 | {
|
---|
453 | size_t len;
|
---|
454 | fseek(fp,0,SEEK_END);
|
---|
455 | len=ftell(fp);
|
---|
456 | rewind(fp);
|
---|
457 | return len;
|
---|
458 | }
|
---|
459 |
|
---|
460 |
|
---|
461 | size_t fillAllVectors(DocRec *allVectors, int* docs, int length /*FILE *inFp*/, FILE *docTableFp,
|
---|
462 | FILE *dtNdxFp, size_t maxDoc,char **descrip)
|
---|
463 | {
|
---|
464 | size_t tableLen, recCount, vectorCount=0,docNum=1;
|
---|
465 |
|
---|
466 | tableLen=fileSize(docTableFp);
|
---|
467 |
|
---|
468 | while (docNum > 0 && vectorCount < length /*&& !feof(inFp)*/ && vectorCount < MAX_DOCUMENTS && maxDoc!=0)
|
---|
469 | {
|
---|
470 | descrip[vectorCount]=malloc(DESCRIPTION_SIZE);
|
---|
471 | assert(descrip[vectorCount]);
|
---|
472 | //docNum=getDocNum(inFp, descrip[vectorCount]);
|
---|
473 | docNum = docs[vectorCount];
|
---|
474 | if(docNum>0)
|
---|
475 | {
|
---|
476 | allVectors[vectorCount].data=getIntVector(docNum,docTableFp,
|
---|
477 | dtNdxFp, maxDoc, tableLen, &recCount);
|
---|
478 |
|
---|
479 | allVectors[vectorCount].count=recCount;
|
---|
480 | allVectors[vectorCount].docNum=docNum;
|
---|
481 | vectorCount++;
|
---|
482 | }
|
---|
483 | else free(descrip[vectorCount]);
|
---|
484 | }
|
---|
485 | //while(!feof(inFp)) fgetc(inFp);
|
---|
486 | return vectorCount;
|
---|
487 | }
|
---|
488 |
|
---|
489 |
|
---|
490 | HjTreePtr constructWordList(size_t *freqList,DocRec *allVectors, size_t vectorSize)
|
---|
491 | {
|
---|
492 | HjTreePtr tree=NULL,newTree=NULL;
|
---|
493 | size_t dj, c,d, e, m,n,pos;
|
---|
494 |
|
---|
495 | for(c=0;c<vectorSize;c++)
|
---|
496 | {
|
---|
497 | n=allVectors[c].count;
|
---|
498 | m=n/2;
|
---|
499 | for(d=m,e=m-1;d<n;d++,e--)
|
---|
500 | {
|
---|
501 |
|
---|
502 | //allVectors[c].data[d].pos;
|
---|
503 | //this is an insane device to get a flatter tree from already ordered data by
|
---|
504 | //jumbling up the order that it is added to the tree
|
---|
505 |
|
---|
506 | if(e>0)
|
---|
507 | {
|
---|
508 | pos=allVectors[c].data[e].pos;
|
---|
509 | dj=freqList[pos];
|
---|
510 | tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
|
---|
511 |
|
---|
512 | pos=allVectors[c].data[d].pos;
|
---|
513 | dj=freqList[pos];
|
---|
514 | tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
|
---|
515 | }
|
---|
516 | if(e==0)
|
---|
517 | {
|
---|
518 | pos=allVectors[c].data[e].pos;
|
---|
519 | dj=freqList[pos];
|
---|
520 | tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
|
---|
521 |
|
---|
522 | pos=allVectors[c].data[d].pos;
|
---|
523 | dj=freqList[pos];
|
---|
524 | tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
|
---|
525 |
|
---|
526 | if( n % 2 == 1 )
|
---|
527 | {
|
---|
528 | pos=allVectors[c].data[++d].pos;
|
---|
529 | dj=freqList[pos];
|
---|
530 | tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
|
---|
531 | break;
|
---|
532 | }
|
---|
533 |
|
---|
534 |
|
---|
535 | }
|
---|
536 |
|
---|
537 | }
|
---|
538 | }
|
---|
539 |
|
---|
540 | // get candidate keyword weights
|
---|
541 | calcInterestVal(tree,vectorSize);
|
---|
542 | newTree=treeByInterestVal(&tree,newTree);
|
---|
543 | return newTree;
|
---|
544 |
|
---|
545 | }
|
---|
546 |
|
---|
547 |
|
---|
548 | HjTreePtr buildHjTree(HjTreePtr tree, DocData *item, size_t freq)
|
---|
549 | {
|
---|
550 | HjTreePtr root=tree, /*lastNode,*/ *treePtr;
|
---|
551 | treePtr = &tree;
|
---|
552 | while(1)
|
---|
553 | {
|
---|
554 | if(*treePtr==NULL)
|
---|
555 | {
|
---|
556 | *treePtr=malloc(sizeof(HjTree));
|
---|
557 | assert (*treePtr);
|
---|
558 | (*treePtr)->less=NULL;
|
---|
559 | (*treePtr)->more=NULL;
|
---|
560 | (*treePtr)->dj=freq;
|
---|
561 | (*treePtr)->hj=1;
|
---|
562 | (*treePtr)->pos=item->pos;
|
---|
563 | (*treePtr)->interestVal=0.0;
|
---|
564 | if (root==NULL) root=(*treePtr);
|
---|
565 | tree = (*treePtr);
|
---|
566 | break;
|
---|
567 | }
|
---|
568 | else
|
---|
569 | {
|
---|
570 | if(tree->pos==item->pos)
|
---|
571 | {
|
---|
572 | tree->hj++;
|
---|
573 | break;
|
---|
574 | }
|
---|
575 | else if((*treePtr)->dj < freq)
|
---|
576 | {
|
---|
577 | treePtr = &(tree->more);
|
---|
578 | tree = (*treePtr);
|
---|
579 | }
|
---|
580 | else
|
---|
581 | {
|
---|
582 | treePtr= &(tree->less);
|
---|
583 | tree = (*treePtr);
|
---|
584 | }
|
---|
585 | }
|
---|
586 | }
|
---|
587 | return root;
|
---|
588 | }
|
---|
589 |
|
---|
590 | void calcInterestVal(HjTreePtr tree,size_t vectorSize)
|
---|
591 | {
|
---|
592 | size_t hj,dj;
|
---|
593 |
|
---|
594 | //fprintf(stderr, "calcInterestVal\n");
|
---|
595 | while(tree)
|
---|
596 | {
|
---|
597 | hj=tree->hj;
|
---|
598 | dj=tree->dj;
|
---|
599 | if(tree->more)
|
---|
600 | calcInterestVal(tree->more, vectorSize);
|
---|
601 | tree->interestVal=((double)hj/(double)dj)*hj*log((double)vectorSize/hj);
|
---|
602 | tree=tree->less;
|
---|
603 | }
|
---|
604 | }
|
---|
605 |
|
---|
606 | HjTreePtr getLeaf(HjTreePtr *tree)
|
---|
607 | {
|
---|
608 | HjTreePtr result;
|
---|
609 | if((*tree)->more) return getLeaf(&((*tree)->more));
|
---|
610 | if((*tree)->less) return getLeaf(&((*tree)->less));
|
---|
611 | else result=*tree;
|
---|
612 | *tree=NULL;
|
---|
613 | return result;
|
---|
614 | }
|
---|
615 |
|
---|
616 | HjTreePtr insertByInterestVal(HjTreePtr tree,HjTreePtr node)
|
---|
617 | {
|
---|
618 | if(tree==NULL)
|
---|
619 | tree=node;
|
---|
620 | else if(tree->interestVal < node->interestVal)
|
---|
621 | tree->more=insertByInterestVal(tree->more,node);
|
---|
622 | else
|
---|
623 | tree->less=insertByInterestVal(tree->less,node);
|
---|
624 | return tree;
|
---|
625 | }
|
---|
626 |
|
---|
627 | HjTreePtr treeByInterestVal(HjTreePtr *tree, HjTreePtr newTree)
|
---|
628 | {
|
---|
629 | HjTreePtr node;
|
---|
630 | while(*tree)
|
---|
631 | {
|
---|
632 | node=getLeaf(tree);
|
---|
633 | if(node)newTree=insertByInterestVal(newTree, node);
|
---|
634 | }
|
---|
635 | return newTree;
|
---|
636 | }
|
---|
637 |
|
---|
638 |
|
---|
639 | size_t fillInterestArray(HjTreePtr tree, size_t max, size_t count,
|
---|
640 | InterestItem *itemArray)
|
---|
641 | {
|
---|
642 | while(tree)
|
---|
643 | {
|
---|
644 | if(count==max) return count;
|
---|
645 | if(tree->more)
|
---|
646 | count=fillInterestArray(tree->more,max, count,itemArray );
|
---|
647 | if(count==max) return count;
|
---|
648 | itemArray[count].interestVal = tree->interestVal;
|
---|
649 | itemArray[count].itemPos = tree->pos;
|
---|
650 | itemArray[count].dj = tree->dj;
|
---|
651 | count++;
|
---|
652 | tree=tree->less;
|
---|
653 | }
|
---|
654 | return count;
|
---|
655 | }
|
---|
656 |
|
---|
657 | /*
|
---|
658 | size_t printTree(HjTreePtr tree, size_t max, size_t count,
|
---|
659 | FILE *binFp,FILE *wordsFp)
|
---|
660 | {
|
---|
661 | while(tree)
|
---|
662 | {
|
---|
663 | if(count==max) return count;
|
---|
664 | if(tree->more)
|
---|
665 | count=printTree(tree->more,max, count,binFp,wordsFp);
|
---|
666 | if(count==max) return count;
|
---|
667 | printTreeItem(tree,binFp,wordsFp);
|
---|
668 | count++;
|
---|
669 | tree=tree->less;
|
---|
670 | }
|
---|
671 | return count;
|
---|
672 | }
|
---|
673 |
|
---|
674 | void printTreeItem(HjTreePtr tree,FILE *binFp,FILE *wordsFp)
|
---|
675 | {
|
---|
676 | char buff[STRING_BUFF_LEN];
|
---|
677 | size_t offset;
|
---|
678 | fseek(binFp,tree->pos*sizeof(size_t),SEEK_SET);
|
---|
679 | fread(&offset,sizeof(size_t),1,binFp);
|
---|
680 | fseek(wordsFp,offset,SEEK_SET);
|
---|
681 | fgets(buff,STRING_BUFF_LEN,wordsFp);
|
---|
682 | printf("%6d %6d %4.8f %s",tree->dj, tree->hj, tree->interestVal,buff);
|
---|
683 | }
|
---|
684 | */
|
---|
685 |
|
---|
686 | void vapeTree(HjTreePtr tree)
|
---|
687 | {
|
---|
688 | if(!tree)return;
|
---|
689 | vapeTree(tree->more);
|
---|
690 | vapeTree(tree->less);
|
---|
691 | free(tree);
|
---|
692 | }
|
---|
693 |
|
---|
694 |
|
---|