source: trunk/greenstone3-extensions/vishnu/src/cluster/procresults.c@ 8189

Last change on this file since 8189 was 8189, checked in by kjdon, 20 years ago

first version of Imperial College's Visualiser code

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1#include <jni.h>
2#include <stdio.h>
3#include <sys/types.h>
4#include <unistd.h>
5#include <sys/wait.h>
6#include <stdlib.h>
7#include <string.h>
8#include <assert.h>
9#include <math.h>
10#include <ctype.h>
11#include <sys/times.h>
12#ifndef CLK_TCK
13#include <time.h>
14#endif
15//#include "Cluster.h"
16#include "vishnu_server_Cluster.h"
17#include "procresults.h"
18#include "clustering.h"
19
20char slash='/';
21
22
23JNIEXPORT jstring JNICALL Java_vishnu_server_Cluster_getCC(JNIEnv * env,jobject thisObj,
24 jintArray arr, jstring collection_home)
25{
26 fprintf(stderr,"in getCC");
27 int i;
28 jstring jresult;
29 char * buffer = (char *) malloc(20000);
30 const jbyte * coll_byte;
31 char * coll_home;
32 int docs[500];
33
34 jsize len = (*env)->GetArrayLength(env, arr);
35 jint *body = (*env)->GetIntArrayElements(env, arr, 0);
36
37 coll_byte = (*env)->GetStringUTFChars(env, collection_home, NULL);
38 if( coll_byte == NULL )
39 {
40 fprintf(stderr,"getCC: colleciton_home is null, returning null");
41 return NULL;
42 }
43 coll_home = coll_byte;
44
45 for( i = 0; i < len; i++ ) docs[i] = body[i];
46
47 mediate(docs,len,buffer, coll_home);
48 jresult = (*env)->NewStringUTF(env, buffer);
49
50 (*env)->ReleaseStringUTFChars(env, collection_home, coll_byte);
51 (*env)->ReleaseIntArrayElements(env,arr,body,0);
52 return jresult;
53}
54
55
56void mediate(int* docs, int length, char *buffer, char *coll_home)
57{
58 work(docs,length,buffer,coll_home);
59 return;
60}
61
62
63DocData *getIntVector(size_t docToRead,FILE *docTableFp,
64 FILE *dtNdxFp, size_t maxDoc, size_t tableLen,
65 size_t *recCount)
66{
67 size_t recSize, docInf,docNext,c;
68 DocData *data;
69 recSize=sizeof(size_t)+sizeof(short);
70
71 fseek(dtNdxFp,(docToRead-1)*sizeof(size_t),SEEK_SET);
72 fread(&docInf,sizeof(size_t),1,dtNdxFp);
73
74 if(docToRead==1)
75 docInf=0;
76
77 fseek(docTableFp,docInf,SEEK_SET);
78
79 if(docToRead<maxDoc)fread(&docNext,sizeof(size_t),1,dtNdxFp);
80 else docNext=tableLen;
81
82
83
84 *recCount=(docNext-docInf)/recSize;
85 data=(DocData *)malloc(*recCount*sizeof(DocData));
86
87 assert(data);
88 for(c=0;c < *recCount;c++)
89 {
90 fread(&data[c],recSize,1,docTableFp);
91 if(feof(docTableFp)) break;
92 }
93 return data;
94}
95
96void buildMatrix(TranslationItemPtr transTable,
97 size_t arraySize,DocRec *allVectors,
98 size_t vectorCount, MatrixNodePtr *matrix)
99{
100 MatrixNodePtr newNode;
101 size_t c,d,e;
102 for(c=0;c<vectorCount;c++)
103 {
104 e=0;
105 matrix[c]=NULL;
106 for(d=0;d<allVectors[c].count && e < arraySize;d++)
107 {
108
109 while(allVectors[c].data[d].pos>transTable[e].itemPos && e < arraySize)
110 {
111 e++;
112 }
113 if(e < arraySize && allVectors[c].data[d].pos==transTable[e].itemPos)
114 {
115 /* this for straight text output
116 can't do this as some vectors are empty
117 printf("%s%d,%d",matrix[c]==NULL?"":",",
118 transTable[e].newColumn,allVectors[c].data[d].freq);
119 this is for a jni interface? */
120 newNode=malloc(sizeof(MatrixNode));
121 assert(newNode);
122 newNode->next=matrix[c];
123 newNode->column=transTable[e].newColumn;
124 newNode->freq=allVectors[c].data[d].freq;
125 matrix[c]=newNode;
126 }
127
128
129 }
130 //printf("\n");
131 }
132}
133
134
135int removeEmptyRows(MatrixNodePtr **matrixPtr, DocRec **allVectorsPtr,char ***descriptionsPtr,int vectorCount)
136{
137 DocRec *newAllVectors;
138 MatrixNodePtr *newMatrix, current;
139 char **newDescriptions;
140 int c,d;
141
142 newAllVectors = calloc(MAX_DOCUMENTS,sizeof(DocRec));
143 assert(newAllVectors);
144 newMatrix = calloc(MAX_DOCUMENTS,sizeof(MatrixNodePtr));
145 assert(newMatrix);
146 newDescriptions=calloc(MAX_DOCUMENTS,sizeof(char *));
147 assert(newDescriptions);
148 // for(d=0,c=0;c<vectorCount;c++)
149 // fprintf(stderr,"current %p\n",(*matrixPtr)[c]);
150 for(d=0,c=0;c<vectorCount;c++)
151 {
152 current=(*matrixPtr)[c];
153 // fprintf(stderr,"vector %d\n",c);
154 fflush(stderr);
155 if(current != NULL)
156 {
157 // fprintf(stderr,"current %p allvectors %p descriptions %p\n",current,(*allVectorsPtr)[c],(*descriptionsPtr)[c]);
158 newMatrix[d] = current;
159 newAllVectors[d] = (*allVectorsPtr)[c];
160 newDescriptions[d] = (*descriptionsPtr)[c];
161
162 // fprintf(stderr,"copied to new vector %d\n",d);
163 // fflush(stderr);
164 d++;
165 }
166 else
167 {
168 free((*allVectorsPtr)[c].data);
169 free((*descriptionsPtr)[c]);
170 }
171 }
172 free(*matrixPtr);
173 free(*descriptionsPtr);
174 free(*allVectorsPtr);
175 *matrixPtr = newMatrix;
176 *descriptionsPtr=newDescriptions;
177 *allVectorsPtr=newAllVectors;
178 return d;
179}
180
181
182int main(int argc, char **argv)
183{
184 return 0;
185}
186
187
188char * appendString(char * buffer, int * cursize, int * curptr, char *data, int len)
189{
190 if (!len) return (buffer);
191 if (len + *curptr + 1 > *cursize) {
192 fflush(stderr);
193 while (len + *curptr + 1 > *cursize) *cursize += 256;
194 buffer = (char *) realloc ((void *) buffer, *cursize);
195 }
196 strncpy (buffer + *curptr, data, len);
197 *curptr += len;
198 buffer[*curptr] = '\0';
199 return (buffer);
200}
201
202void work(int *docNums, int numDocs, char * buffer, char *coll_home)
203{
204 time_t baseTime,wordListTime;
205 struct tms *timer;
206
207 DocRec *allVectors;
208 InterestItem *itemArray;
209 TranslationItem *transTable;
210 MatrixNodePtr *matrix,current,last;
211 char **descriptions;
212
213 char bbuf[40];
214 int cursize = 20000;
215 int curptr = 0;
216
217 int clusterNum = 10;
218 int limit = 20;
219
220 size_t arraySize;
221 size_t maxDoc;
222 FILE *index;
223 FILE *data;
224 FILE *words;
225 FILE *binndx;
226 FILE *freqs;
227 char *dir, indexName[STRING_BUFF_LEN],dataName[STRING_BUFF_LEN],wordsName[STRING_BUFF_LEN],
228 binndxName[STRING_BUFF_LEN],freqName[STRING_BUFF_LEN], buff[STRING_BUFF_LEN],
229 indexDir[STRING_BUFF_LEN];
230
231 size_t *freqList,wordCount,c;
232 size_t n,vectorCount, offset;
233 HjTreePtr tree;
234
235 timer = (struct tms *) malloc(sizeof (struct tms));
236
237 fprintf(stderr,"About to get into ck data\n");
238
239 allVectors = calloc(MAX_DOCUMENTS,sizeof(DocRec));
240 assert(allVectors);
241 itemArray = calloc(MAX_IWORDS,sizeof(InterestItem));
242 assert(itemArray);
243 transTable= calloc(MAX_IWORDS,sizeof(TranslationItem));
244 assert(transTable);
245 matrix = calloc(MAX_DOCUMENTS,sizeof(MatrixNodePtr));
246 assert(matrix);
247 descriptions=calloc(MAX_DOCUMENTS,sizeof(char *));
248 assert(descriptions);
249
250 //dir = "\0";
251 //dir = getenv("GSDL3HOME");
252 //sprintf(indexDir,"%s%cweb/sites/localsite/collect/%s/index/ck_index",dir,slash,coll);
253 sprintf(indexDir, "%sindex%cck_index", coll_home, slash);
254 //dir = getenv("GSDL3HOME");
255 //fprintf(stderr,dir);
256
257 sprintf(indexName,"%s%cdoctable.ndx",indexDir,slash);
258 sprintf(dataName,"%s%cdoctable.data",indexDir,slash);
259 sprintf(wordsName,"%s%cnewinterest.txt",indexDir,slash);
260 sprintf(binndxName,"%s%cnewbnndexFn.ndx",indexDir,slash);
261 sprintf(freqName,"%s%cnewdocFreq.dat",indexDir,slash);
262
263 fprintf(stderr,indexName);
264
265 index = fopen(indexName,"rb");
266 assert(index);
267 data = fopen(dataName,"rb");
268 assert(data);
269 words=fopen(wordsName,"r");
270 assert(words);
271 binndx=fopen(binndxName,"rb");
272 assert(binndx);
273 freqs=fopen(freqName,"rb");
274 assert(freqs);
275
276 wordCount=fileSize(binndx)/sizeof(size_t);
277 fread(&maxDoc,sizeof(size_t),1,index);
278 freqList=malloc(wordCount*sizeof(size_t));
279 assert(freqList);
280
281 // reads at most wordCount elements of size size_t into freqList
282 n=fread(freqList,sizeof(size_t),wordCount,freqs);
283 if(n != wordCount) fprintf(stderr,"n %d wordCount %d\n",n,wordCount);
284 assert(n==wordCount);
285 fclose(freqs);
286
287 vectorCount=fillAllVectors(allVectors, docNums, numDocs, data,index,maxDoc, descriptions);
288
289 // begin processing results
290 baseTime= times (timer);
291 tree=constructWordList(freqList, allVectors, vectorCount);
292 wordListTime= times(timer);
293
294 arraySize = fillInterestArray(tree, MAX_IWORDS, 0,itemArray);
295 vapeTree(tree);
296
297 for (c=0;c<arraySize;c++)
298 {
299 transTable[c].newColumn=c;
300 transTable[c].itemPos=itemArray[c].itemPos;
301 }
302
303 qsort((void *)transTable,arraySize,sizeof(TranslationItem),cmpTransTable);
304 buildMatrix( transTable, arraySize,allVectors,vectorCount, matrix);
305
306 //remove empty rows
307 fflush(stderr);
308 vectorCount= removeEmptyRows(&matrix,&allVectors,&descriptions,vectorCount);
309 fflush(stderr);
310
311 *buffer = '\0';
312
313 n = sprintf(bbuf,"%d\n",vectorCount);
314 buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
315
316 for(c=0;c<vectorCount;c++)
317 {
318 n=sprintf(bbuf,"%d\n",allVectors[c].docNum);
319 appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
320 }
321
322 fprintf(stderr,"Buffer: %s\n",buffer);
323 // print interesting words
324 n= sprintf(bbuf,"%d\n",arraySize);
325 buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
326
327 for(c=0;c<arraySize;c++)
328 {
329 fseek(binndx,itemArray[c].itemPos*sizeof(size_t),SEEK_SET);
330 fread(&offset,sizeof(size_t),1,binndx);
331 fseek(words,offset,SEEK_SET);
332 fgets(buff,STRING_BUFF_LEN,words);
333 buffer = appendString(buffer,&cursize,&curptr,buff,strlen(buff));
334 //printf("%s",buff);
335 }
336
337 // print matrix of words in documents
338 n= sprintf(bbuf,"%d\n",vectorCount);
339 buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
340
341 for(c=0;c<vectorCount;c++)
342 {
343 current=matrix[c];
344 while(current)
345 {
346 n=sprintf(bbuf,"%d,%d%s",current->freq,current->column,current->next?",":"");
347 buffer = appendString(buffer,&cursize,&curptr,bbuf,strlen(bbuf));
348 current=current->next;
349 }
350 buffer = appendString(buffer,&cursize,&curptr,"\n",1);
351 }
352
353 fflush(stdout);
354
355 // calculate and print clusters and sammon mapping
356 clusterNum = vectorCount>20?10:vectorCount/2;
357
358 fprintf(stderr,"Cluster number: %d\n",clusterNum);
359
360 limit = vectorCount>20?20:(vectorCount/2)+(vectorCount/4);
361
362 if (vectorCount > 6)
363 {
364 clustering(COMPLETE_LINKAGE,matrix,vectorCount,arraySize,1,clusterNum,limit,buffer,&cursize,&curptr);
365 }
366 else if (vectorCount > 0)
367 {
368 printf("1\n");
369 for (c=0;c<vectorCount;c++){
370 printf("%d%s",c,c>=(vectorCount-1)?"\n":",");
371 fprintf(stderr,"%d%s",c,c>=(vectorCount-1)?"\n":",");
372 }
373 printf("1\n0.5,0.5\n");
374 }
375 else printf("0\n0\n");
376
377 for(c=0;c<vectorCount;c++)
378 {
379 current=matrix[c];
380 while(current)
381 {
382 last=current;
383 current=current->next;
384 free(last);
385 }
386 free(descriptions[c]);
387 free(allVectors[c].data);
388 }
389
390 fclose(words);
391 fclose(index);
392 fclose(binndx);
393 fclose(data);
394
395 free(allVectors);
396 free(descriptions);
397 free(itemArray);
398 free(transTable);
399 free(matrix);
400 free(freqList);
401 free(timer);
402}
403
404
405int cmpTransTable(const void *a,const void *b)
406{
407 TranslationItemPtr aT, bT;
408 aT = (TranslationItemPtr) a;
409 bT = (TranslationItemPtr) b;
410 return aT->itemPos - bT->itemPos;
411}
412
413
414size_t getDocNum(FILE *fp, char *buff)
415{
416 char *p;
417 int c;
418 size_t result;
419 do
420 {
421 c = fgetc(fp);
422
423 } while (!isdigit(c)&& !feof(fp));
424
425 p=buff;
426
427 while (isdigit(c)&& !feof(fp))
428 {
429 *(p++) = c;
430 c = fgetc(fp);
431 }
432 *p=0;
433 result= atoi(buff);
434 if(result)
435 {
436 for (c=0;c<DESCRIPTION_SIZE && !feof(fp);c++)
437 {
438 buff[c]=fgetc(fp);
439 if(buff[c]==2)
440 {
441 buff[c]=0;
442 break;
443 }
444
445 }
446 }
447 return result;
448}
449
450
451size_t fileSize(FILE *fp)
452{
453 size_t len;
454 fseek(fp,0,SEEK_END);
455 len=ftell(fp);
456 rewind(fp);
457 return len;
458}
459
460
461size_t fillAllVectors(DocRec *allVectors, int* docs, int length /*FILE *inFp*/, FILE *docTableFp,
462 FILE *dtNdxFp, size_t maxDoc,char **descrip)
463{
464 size_t tableLen, recCount, vectorCount=0,docNum=1;
465
466 tableLen=fileSize(docTableFp);
467
468 while (docNum > 0 && vectorCount < length /*&& !feof(inFp)*/ && vectorCount < MAX_DOCUMENTS && maxDoc!=0)
469 {
470 descrip[vectorCount]=malloc(DESCRIPTION_SIZE);
471 assert(descrip[vectorCount]);
472 //docNum=getDocNum(inFp, descrip[vectorCount]);
473 docNum = docs[vectorCount];
474 if(docNum>0)
475 {
476 allVectors[vectorCount].data=getIntVector(docNum,docTableFp,
477 dtNdxFp, maxDoc, tableLen, &recCount);
478
479 allVectors[vectorCount].count=recCount;
480 allVectors[vectorCount].docNum=docNum;
481 vectorCount++;
482 }
483 else free(descrip[vectorCount]);
484 }
485 //while(!feof(inFp)) fgetc(inFp);
486 return vectorCount;
487}
488
489
490HjTreePtr constructWordList(size_t *freqList,DocRec *allVectors, size_t vectorSize)
491{
492 HjTreePtr tree=NULL,newTree=NULL;
493 size_t dj, c,d, e, m,n,pos;
494
495 for(c=0;c<vectorSize;c++)
496 {
497 n=allVectors[c].count;
498 m=n/2;
499 for(d=m,e=m-1;d<n;d++,e--)
500 {
501
502 //allVectors[c].data[d].pos;
503 //this is an insane device to get a flatter tree from already ordered data by
504 //jumbling up the order that it is added to the tree
505
506 if(e>0)
507 {
508 pos=allVectors[c].data[e].pos;
509 dj=freqList[pos];
510 tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
511
512 pos=allVectors[c].data[d].pos;
513 dj=freqList[pos];
514 tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
515 }
516 if(e==0)
517 {
518 pos=allVectors[c].data[e].pos;
519 dj=freqList[pos];
520 tree=buildHjTree(tree,&(allVectors[c].data[e]), dj);
521
522 pos=allVectors[c].data[d].pos;
523 dj=freqList[pos];
524 tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
525
526 if( n % 2 == 1 )
527 {
528 pos=allVectors[c].data[++d].pos;
529 dj=freqList[pos];
530 tree=buildHjTree(tree,&(allVectors[c].data[d]), dj);
531 break;
532 }
533
534
535 }
536
537 }
538 }
539
540 // get candidate keyword weights
541 calcInterestVal(tree,vectorSize);
542 newTree=treeByInterestVal(&tree,newTree);
543 return newTree;
544
545}
546
547
548HjTreePtr buildHjTree(HjTreePtr tree, DocData *item, size_t freq)
549{
550 HjTreePtr root=tree, /*lastNode,*/ *treePtr;
551 treePtr = &tree;
552 while(1)
553 {
554 if(*treePtr==NULL)
555 {
556 *treePtr=malloc(sizeof(HjTree));
557 assert (*treePtr);
558 (*treePtr)->less=NULL;
559 (*treePtr)->more=NULL;
560 (*treePtr)->dj=freq;
561 (*treePtr)->hj=1;
562 (*treePtr)->pos=item->pos;
563 (*treePtr)->interestVal=0.0;
564 if (root==NULL) root=(*treePtr);
565 tree = (*treePtr);
566 break;
567 }
568 else
569 {
570 if(tree->pos==item->pos)
571 {
572 tree->hj++;
573 break;
574 }
575 else if((*treePtr)->dj < freq)
576 {
577 treePtr = &(tree->more);
578 tree = (*treePtr);
579 }
580 else
581 {
582 treePtr= &(tree->less);
583 tree = (*treePtr);
584 }
585 }
586 }
587 return root;
588}
589
590void calcInterestVal(HjTreePtr tree,size_t vectorSize)
591{
592 size_t hj,dj;
593
594 //fprintf(stderr, "calcInterestVal\n");
595 while(tree)
596 {
597 hj=tree->hj;
598 dj=tree->dj;
599 if(tree->more)
600 calcInterestVal(tree->more, vectorSize);
601 tree->interestVal=((double)hj/(double)dj)*hj*log((double)vectorSize/hj);
602 tree=tree->less;
603 }
604}
605
606HjTreePtr getLeaf(HjTreePtr *tree)
607{
608 HjTreePtr result;
609 if((*tree)->more) return getLeaf(&((*tree)->more));
610 if((*tree)->less) return getLeaf(&((*tree)->less));
611 else result=*tree;
612 *tree=NULL;
613 return result;
614}
615
616HjTreePtr insertByInterestVal(HjTreePtr tree,HjTreePtr node)
617{
618 if(tree==NULL)
619 tree=node;
620 else if(tree->interestVal < node->interestVal)
621 tree->more=insertByInterestVal(tree->more,node);
622 else
623 tree->less=insertByInterestVal(tree->less,node);
624 return tree;
625}
626
627HjTreePtr treeByInterestVal(HjTreePtr *tree, HjTreePtr newTree)
628{
629 HjTreePtr node;
630 while(*tree)
631 {
632 node=getLeaf(tree);
633 if(node)newTree=insertByInterestVal(newTree, node);
634 }
635 return newTree;
636}
637
638
639size_t fillInterestArray(HjTreePtr tree, size_t max, size_t count,
640 InterestItem *itemArray)
641{
642 while(tree)
643 {
644 if(count==max) return count;
645 if(tree->more)
646 count=fillInterestArray(tree->more,max, count,itemArray );
647 if(count==max) return count;
648 itemArray[count].interestVal = tree->interestVal;
649 itemArray[count].itemPos = tree->pos;
650 itemArray[count].dj = tree->dj;
651 count++;
652 tree=tree->less;
653 }
654 return count;
655}
656
657/*
658 size_t printTree(HjTreePtr tree, size_t max, size_t count,
659 FILE *binFp,FILE *wordsFp)
660 {
661 while(tree)
662 {
663 if(count==max) return count;
664 if(tree->more)
665 count=printTree(tree->more,max, count,binFp,wordsFp);
666 if(count==max) return count;
667 printTreeItem(tree,binFp,wordsFp);
668 count++;
669 tree=tree->less;
670 }
671 return count;
672 }
673
674 void printTreeItem(HjTreePtr tree,FILE *binFp,FILE *wordsFp)
675 {
676 char buff[STRING_BUFF_LEN];
677 size_t offset;
678 fseek(binFp,tree->pos*sizeof(size_t),SEEK_SET);
679 fread(&offset,sizeof(size_t),1,binFp);
680 fseek(wordsFp,offset,SEEK_SET);
681 fgets(buff,STRING_BUFF_LEN,wordsFp);
682 printf("%6d %6d %4.8f %s",tree->dj, tree->hj, tree->interestVal,buff);
683 }
684*/
685
686void vapeTree(HjTreePtr tree)
687{
688 if(!tree)return;
689 vapeTree(tree->more);
690 vapeTree(tree->less);
691 free(tree);
692}
693
694
Note: See TracBrowser for help on using the repository browser.