Changeset 8281 for trunk/gsdl3
- Timestamp:
- 2004-10-12T11:25:12+13:00 (20 years ago)
- Location:
- trunk/gsdl3
- Files:
-
- 2 deleted
- 24 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/AveLink.java
r8189 r8281 3 3 public class AveLink extends LinkFunc 4 4 { 5 double link(double a, doubleb)5 float link(float a, float b) 6 6 { 7 7 return (a+b)/2; -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Cluster.java
r8189 r8281 5 5 public class Cluster 6 6 { 7 public int _id;8 public Cluster _parent = null;9 public Cluster _child_1 = null;10 public Cluster _child_2 = null;11 12 public double _children_sim;13 14 // probably the size of thecluster15 public int _items;16 17 public int _split;18 7 public int id; 8 public Cluster parent = null; 9 public Cluster child_1 = null; 10 public Cluster child_2 = null; 11 12 public double children_sim; 13 14 // size of cluster 15 public int items; 16 17 public int split; 18 19 19 // [] -> centroid for each vector element ? 20 public double[] _centroid=null; 21 22 public Vector _iter_items = null; 23 public int _iter_items_num = 0; 24 25 public int _x = 0; 26 public int _y = 0; 27 public double[] getCentroid(){return _centroid;} 28 29 public Cluster(int id, Cluster parent, Cluster child_1, Cluster child_2, double children_sim, int items) 30 { 31 _id = id; 32 _children_sim = children_sim; 33 _items = items; 34 _split = 0; 35 _parent = parent; 36 _iter_items = new Vector(0); 20 public float[] centroid=null; 21 22 public Vector iter_items = null; 23 public int iter_items_num = 0; 24 25 public int x = 0; 26 public int y = 0; 27 28 public float[] getCentroid(){return centroid;} 29 30 public Cluster(int id, Cluster parent, Cluster c1, Cluster c2, double children_sim, int items) 31 { 32 this.id = id; 33 this.children_sim = children_sim; 34 this.items = items; 35 split = 0; 36 this.parent = parent; 37 iter_items = new Vector(0); 37 38 38 39 // probably to get a balanced tree 39 if (child_1 != null && child_2 != null) 40 { 41 if (child_2._items > child_1._items) 42 { 43 _child_1 = child_1; 44 _child_2 = child_2; 45 } 46 else 47 { 48 _child_1 = child_2; 49 _child_2 = child_1; 50 } 51 } 52 else // if either or both are null 53 { 54 _child_1 = child_1; 55 _child_2 = child_2; 40 if (c1 != null && c2 != null){ 41 if (c2.items > c1.items){ 42 child_1 = c1; 43 child_2 = c2; 44 }else{ 45 child_1 = c2; 46 child_2 = c1; 56 47 } 57 } 58 48 }else{ // if either or both are null 49 child_1 = c1; 50 child_2 = c2; 51 } 52 } 53 59 54 // a nice recursive function to determine the depth of the hierarchy 60 public int depth() 61 { 62 if (_id != -1) 63 return 0; 64 else 65 return Math.max(1 + _child_1.depth(), 1 + _child_2.depth()); 55 public int depth() 56 { 57 if (id != -1) 58 return 0; 59 else 60 return Math.max(1 + child_1.depth(), 1 + child_2.depth()); 61 } 62 63 64 void addChildrenToOutputVector(int []coding, Vector output) 65 { 66 if (id != -1){ 67 Integer ii = new Integer(coding[id]); 68 output.addElement(ii); 69 }else{ 70 child_1.addChildrenToOutputVector(coding,output); 71 child_2.addChildrenToOutputVector(coding,output); 66 72 } 67 68 void addChildrenToOutputVector(int []coding, Vector output)69 {70 if (_id != -1)71 {72 Integer ii = new Integer(coding[_id]);73 output.addElement(ii);74 75 }76 else77 {78 _child_1.addChildrenToOutputVector(coding,output);79 _child_2.addChildrenToOutputVector(coding,output);80 }81 73 } 82 83 74 } 84 75 -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/ClusterArray.java
r8189 r8281 6 6 public class ClusterArray 7 7 { 8 Vector contents; 9 10 ClusterArray(int num_of_clusters) 11 { 12 int i; 13 14 contents = new Vector(num_of_clusters); 15 16 for (i = 0; i < num_of_clusters; i++) 17 contents.addElement(new Cluster(i, null, null, null, 0, 1)); 18 // i.e. id = i, similarity between kids = 0, #items = 1 8 Vector contents; 9 10 ClusterArray(int num_of_clusters) 11 { 12 contents = new Vector(num_of_clusters); 13 14 for(int i = 0; i < num_of_clusters; i++) 15 contents.addElement(new Cluster(i, null, null, null, 0, 1)); 16 } 17 18 19 // counts the number of clusters that are less dissimilar than disim 20 private void sweep_tree (Cluster cluster, double disim, int min_c_items, int[] clusters) 21 { 22 cluster.split = 0; 23 24 if (cluster.id == -1){ 25 if (cluster.children_sim <= disim && cluster.items >= min_c_items){ 26 cluster.split = 1; 27 contents.addElement(cluster); 28 clusters[0]++; 29 }else{ 30 sweep_tree (cluster.child_1, disim, min_c_items, clusters); 31 sweep_tree (cluster.child_2, disim, min_c_items, clusters); 32 } 33 }else{ 34 if (cluster.items >= min_c_items){ 35 cluster.split = 1; 36 contents.addElement(cluster); 37 clusters[0]++; 38 } 19 39 } 20 21 // counts the number of clusters that are less dissimilar than disim 22 private void sweep_tree (Cluster cluster, double disim, int min_c_items, TheInt clusters) 23 { 24 cluster._split = 0; 25 26 if (cluster._id == -1) 27 { 28 if (cluster._children_sim <= disim && cluster._items >= min_c_items) 29 { 30 cluster._split = 1; 31 contents.addElement(cluster); 32 clusters.val++; 33 } 34 else 35 { 36 sweep_tree (cluster._child_1, disim, min_c_items, clusters); 37 sweep_tree (cluster._child_2, disim, min_c_items, clusters); 38 } 39 } 40 else 41 { 42 if (cluster._items >= min_c_items) 43 { 44 cluster._split = 1; 45 contents.addElement(cluster); 46 clusters.val++; 47 } 48 } 49 } 40 } 50 41 51 42 52 double split (Cluster cluster, double[] heights, int n, int target, 53 int min_c_items, TheInt clusters) 54 { 55 56 /* 57 Try to split 'cluster', made up of 'n' vectors, into 'target' 58 clusters by incrementing iteratively the mimimum required 59 similarity between the 'cluster\'s' children. The result is an 60 array of clusters, 'cluster_ary' which contains 'clusters' 61 clusters. Obviously the final number of clusters may not be the 62 same as the desired number of clusters. The function return the 63 threshold similarity which resulted in the clusters being split 64 in 'clusters' clusters. 65 */ 66 67 int i; 68 69 if (target < 2) 70 target = 2; 71 72 if (target > n) 73 target = n; 74 75 // height store minimum similarity values 76 // this is then the iterative incrementing loop 77 for (i = target - 2; i <= n - 1; i++) 78 { 79 clusters.val = 0; // simple class for an integer to 80 // be able to pass it by reference (below) 81 contents.removeAllElements(); 82 sweep_tree (cluster, heights[i], min_c_items, clusters); 83 84 if (clusters.val >= target) 85 break; 86 } 87 88 // if target - 2 > n - 1, i.e. if the previous loop has not been 89 // entered at all 90 if (i > n - 1) 91 { 92 /* ignore the min_c_size requirement */ 93 i = target - 1; 94 clusters.val = 0; 95 contents.removeAllElements(); 96 sweep_tree (cluster, heights[i], 1, clusters); 97 } 98 99 return heights[i]; 100 43 double split (Cluster cluster, double[] heights, int n, int target, int min_c_items, int[] clusters) 44 { 45 46 /* 47 Try to split 'cluster', made up of 'n' vectors, into 'target' 48 clusters by incrementing iteratively the mimimum required 49 similarity between the 'cluster\'s' children. The result is an 50 array of clusters, 'cluster_ary' which contains 'clusters' 51 clusters. Obviously the final number of clusters may not be the 52 same as the desired number of clusters. The function return the 53 threshold similarity which resulted in the clusters being split 54 in 'clusters' clusters. 55 */ 56 57 int i; 58 59 if (target < 2) target = 2; 60 61 if (target > n) target = n; 62 63 // height store minimum similarity values 64 // this is then the iterative incrementing loop 65 for (i = target - 2; i <= n - 1; i++){ 66 clusters[0] = 0; 67 contents.removeAllElements(); 68 sweep_tree (cluster, heights[i], min_c_items, clusters); 69 70 if (clusters[0] >= target) break; 71 } 72 73 // if target - 2 > n - 1, i.e. if the previous loop has not been 74 // entered at all 75 if (i > n - 1){ 76 /* ignore the min_c_size requirement */ 77 i = target - 1; 78 clusters[0] = 0; 79 contents.removeAllElements(); 80 sweep_tree (cluster, heights[i], 1, clusters); 81 } 82 83 return heights[i]; 101 84 } 102 103 85 } -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Clustering.java
r8189 r8281 1 1 package vishnu.cluster; 2 2 3 /* Related word hierarchical clustering using a given triangle similarity matrix */ 3 import java.util.*; 4 4 5 import java.util.*;6 5 7 6 public class Clustering … … 12 11 public static final int BUCKSHOT = 0; 13 12 public static final int COMPLETE = 1; 14 15 private Vector [] theResult;16 private double [][] theCentroids;17 18 public Vector [] get TheResults()13 14 private Vector [] clusters; 15 private double [][] centroids; 16 17 public Vector [] getClusters() 19 18 { 20 return theResult;19 return clusters; 21 20 } 22 21 23 public double[][] get TheCentroids()22 public double[][] getCentroids() 24 23 { 25 return theCentroids;24 return centroids; 26 25 } 27 28 29 public Clustering (int linkage, int [][]m, 30 int num_of_vectors, 31 int arraySize, int min_c_items, 32 int target_clusters, int limit) 26 27 28 public Clustering (int linkage, float[][] KDMatrix, int min_c_items, int target_clusters, int limit) 33 29 { 34 35 30 SimMatrix sim_matrix; 36 31 LinkFunc linkage_func=null; 37 32 38 // number of keywords 39 int VECTOR_SIZE = arraySize;33 int wordCount = KDMatrix[0].length; 34 int hitDocs = KDMatrix.length; 40 35 41 36 Cluster toplevel_cluster; 42 37 ClusterArray splitclusters = null; 43 TheInt clusters = new TheInt(0);38 int[] n_clusters = new int[]{0}; 44 39 45 40 double threshold; … … 48 43 49 44 double[] heights; 50 double [][] vectors = intToDoubleMatrix(m); 51 int [] coding = new int[m.length];45 46 int[] coding = new int[hitDocs]; 52 47 53 // populate array with numbers from 0 to #docs-1 54 for (int c =0;c<m.length;c++) coding[c]=c;48 49 for (int c = 0; c < hitDocs; c++) coding[c] = c; 55 50 56 int i, j; 51 switch (linkage){ 52 case COMPLETE_LINKAGE: 53 linkage_func = new MaxLink(); 54 break; 55 case AVERAGE_LINKAGE: 56 linkage_func = new AveLink(); 57 break; 58 case SINGLE_LINKAGE: 59 linkage_func = new MinLink(); 60 break; 61 default: 62 System.exit(0); 63 } 57 64 58 System.out.println("target_clusters " + target_clusters + " limit " + limit); 59 switch (linkage) 60 { 61 case COMPLETE_LINKAGE: 62 linkage_func = new MaxLink(); 63 break; 64 case AVERAGE_LINKAGE: 65 linkage_func = new AveLink(); 66 break; 67 case SINGLE_LINKAGE: 68 linkage_func = new MinLink(); 69 break; 70 default: 71 System.out.println("clustering.c: main(): linkage type "+ 72 linkage +" not defined."); 73 System.exit(0); 74 } 65 66 /**** make sure upper bound does not exceed #docs ****/ 67 68 if(limit > hitDocs) limit = hitDocs; 69 limit = (int)hitDocs/2; 75 70 76 long startTime = System.currentTimeMillis();77 long readTime = System.currentTimeMillis();78 79 80 // if fewer documents than upper cluster limit81 // I think this is an error, should be vectors.length, viz. #documents82 // which was never spotted as limit is always smaller83 // than #keywords84 85 if(limit > vectors[0].length) limit = vectors[0].length;86 71 87 // contains distance matrix between documents, built using 88 // the double vector[i][] list of keyword frequencies 89 90 sim_matrix = new SimMatrix(vectors, limit, sim_func, VECTOR_SIZE); 91 92 long simMatixTime = System.currentTimeMillis(); 93 System.out.println("hierarchical clustering " + limit + "...." ); 94 //System.out.flush(); 95 96 /* an array of heights sorted by descending disim */ 97 // as many heights as there are docs 98 heights = new double[m.length];//[limit - 1]; 99 100 72 /**** get similarity matrix of the first limit docs with each other ****/ 73 74 sim_matrix = new SimMatrix(KDMatrix, limit, sim_func); 75 76 heights = new double[hitDocs]; 77 101 78 Hierarchical hierarchical = new Hierarchical(limit); 102 79 103 toplevel_cluster = hierarchical.clustering (sim_matrix, limit, heights, 104 linkage_func); 105 106 double d[] = toplevel_cluster.getCentroid(); 107 if( d != null ) 108 System.out.println("First element of centroid: " + d[0]); 109 else 110 System.out.println("Centroid null"); 80 toplevel_cluster = hierarchical.clustering(sim_matrix, limit, heights, linkage_func); 81 if( toplevel_cluster == null ) 82 return; 111 83 112 long hierarchicalTime = System.currentTimeMillis(); 84 System.out.println("Clustered items: " + toplevel_cluster.items); 85 Cluster c1 = toplevel_cluster.child_1; 86 Cluster c2 = toplevel_cluster.child_2; 87 System.out.println("Child1: " + c1.items); 88 System.out.println("Child2: " + c2.items); 89 90 splitclusters = new ClusterArray(0); 113 91 114 System.out.println("split into"); 115 //System.out.flush(); 116 splitclusters=new ClusterArray(0); 117 118 threshold = splitclusters.split(toplevel_cluster, heights, limit, 119 target_clusters, 120 min_c_items, clusters); 92 threshold = splitclusters.split(toplevel_cluster, heights, limit, target_clusters, min_c_items, n_clusters); 121 93 122 long splitTime = System.currentTimeMillis(); 123 System.out.println(clusters.val + " clusters threshold (" + 124 threshold + ").... "); 125 //System.out.flush(); 94 System.out.println(n_clusters[0] + " clusters"); 95 System.out.println("Threshold: " + threshold); 96 97 System.out.println("Target_clusters " + target_clusters); 98 System.out.println("Limit " + limit); 99 System.out.println("#docs: " + hitDocs); 100 101 Iterative iter = new Iterative(wordCount); 102 103 iter.clustering(splitclusters, n_clusters, KDMatrix, limit, hitDocs, sim_func); 104 105 System.out.println("Finished iterative clustering"); 126 106 127 System.out.println("iterative clustering............."); 128 System.out.println("target_clusters " + target_clusters + " limit " + 129 limit + " vectors length " + vectors.length); 130 131 System.out.flush(); 132 System.out.println("Cluster Length: " + clusters.val); 133 Iterative iter = new Iterative(VECTOR_SIZE); 134 135 iter.clustering (splitclusters, clusters, vectors, limit, 136 vectors.length, sim_func); 137 long iterativeTime = System.currentTimeMillis(); 107 clusters = new Vector[n_clusters[0]]; 108 centroids = new double[n_clusters[0]][wordCount]; 138 109 139 theResult = new Vector [clusters.val]; 140 theCentroids = new double[clusters.val][arraySize]; 141 142 for (i = 0; i < clusters.val; i++) 143 { 144 theResult[i]=new Vector(); 145 Cluster clust = (Cluster)splitclusters.contents.elementAt(i); 146 clust.addChildrenToOutputVector(coding,theResult[i]); 147 148 for (j = 0; j < clust._iter_items_num; j++) 149 { 150 TheInt ii = (TheInt)clust._iter_items.elementAt(j); 151 Integer jj = new Integer (coding[ii.val]); 152 theResult[i].addElement(jj); 153 } 154 double [] centroid = clust.getCentroid(); 155 for (j = 0; j < arraySize; j++) 156 { 157 theCentroids[i][j]=centroid[j]; 158 } 159 } 160 161 long printTime = System.currentTimeMillis(); 162 System.out.println(" Vector size " + vectors.length + " limit " + limit + " Clusters # " + clusters.val); 163 double printTimeSecs = (printTime-iterativeTime)/1000.0; 164 double iterativeTimeSecs = (iterativeTime - splitTime)/1000.0; 165 double splitTimeSecs = (splitTime - hierarchicalTime)/1000.0; 166 double hierarchicalTimeSecs = (hierarchicalTime - simMatixTime)/1000.0; 167 double simMatixTimeSecs = (simMatixTime - readTime)/1000.0; 168 double readTimeSecs = (readTime - startTime)/1000.0; 169 System.out.println("Timings:"); 170 //System.out.println("Read from file: "+ readTimeSecs+" seconds"); 171 System.out.println("Create Similarity matrix: "+ simMatixTimeSecs+" seconds"); 172 System.out.println("Hierarchical Clustering: "+ hierarchicalTimeSecs+" seconds"); 173 System.out.println("Split Clusters: "+ splitTimeSecs+" seconds"); 174 System.out.println("Iterative Clustering: "+ iterativeTimeSecs+" seconds"); 175 System.out.println("Build results: "+ printTimeSecs+" seconds"); 110 for (int i = 0; i < n_clusters[0]; i++){ 111 clusters[i]=new Vector(); 112 Cluster clust = (Cluster)splitclusters.contents.elementAt(i); 113 clust.addChildrenToOutputVector(coding,clusters[i]); 114 115 for (int j = 0; j < clust.iter_items_num; j++){ 116 Integer ii = (Integer)clust.iter_items.elementAt(j); 117 Integer jj = new Integer (coding[ii.intValue()]); 118 clusters[i].addElement(jj); 119 } 120 121 float [] centroid = clust.getCentroid(); 122 123 for (int j = 0; j < wordCount; j++){ 124 centroids[i][j]=centroid[j]; 125 } 126 } 176 127 } 177 178 static double[][] intToDoubleMatrix(int [][] m)179 {180 int h=m.length;181 double [][] res = null;182 if (h > 0)183 {184 int w = m[0].length;185 res = new double[h][w];186 for(int c=0;c<h;c++)187 for(int d = 0; d<w; d++)188 res[c][d]=(double)m[c][d];189 }190 return res;191 }192 193 128 } -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Distance.java
r8189 r8281 5 5 public abstract class Distance 6 6 { 7 public abstract double get(double[] v1, double[] v2, int size);7 public abstract float get(float[] v1, float[] v2); 8 8 } 9 9 -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/EuclideanDistance.java
r8189 r8281 3 3 public class EuclideanDistance extends Distance 4 4 { 5 public double get(double[] v1, double[] v2, int size)5 public float get(float[] v1, float[] v2) 6 6 { 7 double sum = 0, x; 8 //System.out.println("Array length "+ v1.length); 9 for(int c=0;c<v1.length;c++) 10 { 11 x = v1[c] - v2[c]; 7 float sum = 0; 8 9 for(int c=0;c<v1.length;c++){ 10 float x = v1[c] - v2[c]; 12 11 sum += x*x; 13 12 } 14 return Math.sqrt(sum);13 return (float)Math.sqrt(sum); 15 14 } 16 15 } -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Hierarchical.java
r8189 r8281 3 3 public class Hierarchical 4 4 { 5 public double[] minrow; 6 public int[] colind; 7 //public SimMatrix smatrix; 8 public TriangleIndex triangle; 9 10 public Hierarchical(int matrix_rows) 11 { 12 // create Triangle with "limit" arg = maximum #clusters 13 // an object to help with indexing the cells 14 // in a triangular matrix 15 triangle = new TriangleIndex(matrix_rows); 16 } 17 5 public float[] minrow; 6 public int[] colind; 7 public TriangleIndex triangle; 8 9 public Hierarchical(int matrix_rows){ 10 // create Triangle with "limit" arg = maximum #clusters 11 // an object to help with indexing the cells 12 // in a triangular matrix 13 triangle = new TriangleIndex(matrix_rows); 14 } 15 16 18 17 // set each minrow[i] to the value of the minimum element of that row 19 20 21 18 void set_minrow (SimMatrix m, int row) 19 { 20 // triangular matrix without diagonal, hence zeroth row is all zero 22 21 if(row == 0) return; 23 24 25 22 23 int i; 24 int index_start, index_end; 26 25 27 26 // first column 28 index_start = triangle.index (row, 0); 29 // last column (row(!)-1 because as many rows as columns) 30 index_end = triangle.index (row, row - 1); 31 32 minrow[row] = m.matrix[index_start]; 33 colind[row] = index_start; 34 35 for (i = index_start + 1; i <= index_end; i++) 36 { 37 if (m.matrix[i] < minrow[row]) 38 { 39 minrow[row] = m.matrix[i]; 40 colind[row] = i; 41 } 42 } 27 index_start = triangle.index (row, 0); 28 29 // last column (row(!)-1 because as many rows as columns) 30 index_end = triangle.index (row, row - 1); 31 32 minrow[row] = m.matrix[index_start]; 33 colind[row] = index_start; 34 35 for (i = index_start + 1; i <= index_end; i++){ 36 if (m.matrix[i] < minrow[row]){ 37 minrow[row] = m.matrix[i]; 38 colind[row] = i; 39 } 43 40 } 44 41 } 42 43 45 44 // return the ROW in which the minimum disimilarity resides 46 int get_minrow (int last_row) 47 { 48 int i; 49 int row; 50 double value; 51 52 row = 1; 53 value = minrow[1]; 54 55 for (i = 2; i <= last_row; i++) 56 { 57 if (minrow[i] < value) 58 { 59 row = i; 60 value = minrow[i]; 61 } 45 int get_minrow (int last_row){ 46 int i; 47 int row; 48 float value; 49 50 row = 1; 51 value = minrow[1]; 52 53 for (i = 2; i <= last_row; i++){ 54 if (minrow[i] < value){ 55 row = i; 56 value = minrow[i]; 57 } 62 58 } 63 64 return row; 59 60 return row; 61 } 62 63 64 // num_of_rows is "limit" = max docs to be clustered in hierarchical clustering 65 Cluster clustering (SimMatrix sim_matrix, int num_of_rows, double[] heights, LinkFunc linkage_func) 66 { 67 Cluster child_1; 68 Cluster child_2; 69 Cluster parent=null; 70 ClusterArray cluster_ary; 71 72 int min_row, index, row, col, i; 73 double sim; 74 75 int last_row_index, sim_matrix_size; 76 77 sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2; 78 79 // Place each of the documents in a cluster of each own 80 // there are vector.length docs but only num_of_rows = limit places in the array 81 // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest 82 cluster_ary = new ClusterArray (num_of_rows); 83 last_row_index = num_of_rows - 1; 84 85 // minimum distance for each row of the sim_matrix 86 minrow = new float[num_of_rows]; 87 88 // column index for minimum distance entry 89 colind = new int[num_of_rows]; 90 91 for (i = 1; i < num_of_rows; i++) set_minrow(sim_matrix, i); 92 93 // do all the hierarchical clustering by running over all rows, updating the matrix 94 // as a new cluster is formed each time 95 for (i = 0; i < num_of_rows - 1; i++){ 96 97 // find the row with the lowest disimilarity 98 min_row = get_minrow(last_row_index); 99 sim = minrow[min_row]; 100 101 System.out.println("Min sim: " + sim); 102 103 // the column with the lowest distance in that row 104 index = colind[min_row]; 105 106 // array of heights, DESCENDING disim, i.e. sim goes last 107 heights[num_of_rows - 2 - i] = sim; 108 109 row = triangle.i_ind[index]; 110 col = triangle.j_ind[index]; 111 112 // merges rows and cols to create a smaller sim_matrix 113 merge (row, col, sim_matrix, last_row_index, linkage_func); 114 System.out.println("Merging: " + row + " and " + col); 115 116 sim_matrix_size -= last_row_index; 117 118 child_1 = (Cluster)cluster_ary.contents.elementAt(row); 119 child_2 = (Cluster)cluster_ary.contents.elementAt(col); 120 121 // parents get id = -1 to indicate that they are not representing single docs 122 // all leaves have an id (1 - num_rows) that points to the doc they contain 123 int sum = child_1.items + child_2.items; 124 System.out.println("Create new parent with " + child_1.items + " + " + child_2.items + " leaves"); 125 parent = new Cluster (-1, null, child_1, child_2, sim, child_1.items + child_2.items); 126 127 child_1.parent = parent; 128 child_2.parent = parent; 129 130 cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row); 131 cluster_ary.contents.setElementAt(parent,col); 132 133 last_row_index--; 65 134 } 66 67 // num_of_rows is "limit" = max #clusters 68 Cluster clustering (SimMatrix sim_matrix, int num_of_rows, 69 double[] heights, LinkFunc linkage_func) 70 { 71 Cluster child_1; 72 Cluster child_2; 73 Cluster parent=null; 74 ClusterArray cluster_ary; 75 76 77 int min_row, index, row, col, i; 78 double sim; 79 80 int last_row_index, sim_matrix_size; 81 82 sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2; 83 84 // Place each of the documents in a cluster of each own 85 // there are vector.length docs but only num_of_rows = limit places in the array 86 // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest 87 cluster_ary = new ClusterArray (num_of_rows); 88 last_row_index = num_of_rows - 1; 89 90 // a variable that is a double and exists for each row - what can this be? 91 // the lowest dissimilarity score? 92 minrow = new double[num_of_rows]; 93 94 // a variable that is an int and exists for each row - what can that be? 95 // the document index of the document with that lowest score? 96 colind = new int[num_of_rows]; 97 98 for (i = 1; i < num_of_rows; i++) 99 { 100 set_minrow(sim_matrix, i); 101 } 102 103 // do all the hierarchical clustering by running over all rows, updating the matrix 104 // as a new cluster is formed each time 105 for (i = 0; i < num_of_rows - 1; i++) 106 { 107 // find the row with the lowest disimilarity 108 min_row = get_minrow(last_row_index); 109 sim = minrow[min_row]; 110 111 // the column with the lowest dis... (sim) in that row 112 index = colind[min_row]; 113 114 // array of heights, DESCENDING disim, i.e. sim goes last 115 heights[num_of_rows - 2 - i] = sim; 116 117 row = triangle.i_ind[index]; 118 col = triangle.j_ind[index]; 119 120 merge (row, col, sim_matrix, last_row_index, linkage_func); 121 122 sim_matrix_size -= last_row_index; 123 124 child_1 = (Cluster)cluster_ary.contents.elementAt(row); 125 child_2 = (Cluster)cluster_ary.contents.elementAt(col); 126 127 // parents get id = -1 to indicate that they are not representing single docs 128 // all leaves have an id (1 - num_rows) that points to the doc they contain 129 parent = new Cluster (-1, null, child_1, child_2, sim, child_1._items + child_2._items); 130 131 child_1._parent = parent; 132 child_2._parent = parent; 133 134 cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row); 135 cluster_ary.contents.setElementAt(parent,col); 136 137 last_row_index--; 138 } 139 140 return parent; 141 142 } 143 135 136 return parent; 137 } 138 144 139 // form a new cluster from the two most similar documents 145 140 void merge (int i, int j, SimMatrix m, int n, LinkFunc linkage_func) 146 { 147 /* i and j are the matrix elements with maximum similarity */ 148 /* matrix is the triangle similarity matrix with last row index n */ 149 /* linkage_func is a pointer to the linkage function */ 150 151 /* i > j */ 152 153 int c, k, index1, index2; 154 141 { 142 143 /* i and j are the matrix elements with maximum similarity */ 144 /* matrix is the triangle similarity matrix with last row index n */ 145 /* linkage_func is a pointer to the linkage function */ 146 147 /* i > j */ 148 149 int c, k, index1, index2; 150 155 151 // go over all rows 156 for (k = 0; k <= n; k++) 157 { 152 for (k = 0; k <= n; k++){ 153 158 154 // if k < j the affected elements are on row j 159 if (k < j) 160 { 161 index1 = triangle.index (k, j); 162 index2 = triangle.index (k, i); 163 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 164 165 //printf ("1 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 166 } 167 else if (k > j && k < i) 168 { 169 /* if j < k < i then only one element is updated per row */ 170 index1 = triangle.index (k, j); 171 index2 = triangle.index (k, i); 172 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 173 //printf ("2 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, m.matrix[index2]); 174 175 if (index1 == colind[k]) 176 { 177 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 178 set_minrow(m, k); 179 } 180 else 181 { 182 /* if the changed element is smaller than the current minimum update minrow and colind */ 183 if (m.matrix[index1] < minrow[k]) 184 { 185 minrow[k] = m.matrix[index1]; 186 colind[k] = index1; 187 } 188 } 189 } 190 else if (k > i && k < n) 191 { 192 index1 = triangle.index (k, j); 193 index2 = triangle.index (k, i); 194 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 195 //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 196 197 if (index1 == colind[k]) 198 { 199 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 200 set_minrow(m, k); 201 } 202 else 203 { 204 /* if the changed element is smaller than the current minimum update minrow and colind */ 205 if (m.matrix[index1] < minrow[k]) 206 { 207 minrow[k] = m.matrix[index1]; 208 colind[k] = index1; 209 } 210 } 211 212 index1 = triangle.index (k, i); 213 index2 = triangle.index (k, n); 214 m.matrix[index1] = m.matrix[index2]; 215 //printf (" copy [%d, %d] to [%d, %d] = %f\n", k, n, k, i, matrix[index2]); 216 217 if (index1 == colind[k]) 218 { 219 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 220 set_minrow(m, k); 221 } 222 else 223 { 224 /* if the changed element is smaller than the current minimum update minrow and colind */ 225 if (m.matrix[index1] < minrow[k]) 226 { 227 minrow[k] = m.matrix[index1]; 228 colind[k] = index1; 229 } 230 } 231 } 232 else if (k == n && i != n) 233 { 234 index1 = triangle.index (k, j); 235 index2 = triangle.index (k, i); 236 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 237 //printf ("5 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 238 239 if (index1 == colind[k]) 240 { 241 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 242 set_minrow(m, k); 243 } 244 else 245 { 246 /* if the changed element is smaller than the current minimum update minrow and colind */ 247 if (m.matrix[index1] < minrow[k]) 248 { 249 minrow[k] = m.matrix[index1]; 250 colind[k] = index1; 251 } 252 } 253 254 for (c = 0; c < i; c++) 255 { 256 index1 = triangle.index (c, i); 257 index2 = triangle.index (c, n); 258 m.matrix[index1] = m.matrix[index2]; 259 //printf ("copy [%d, %d] to [%d, %d] = %f\n", c, n, c, i, matrix[index2]); 260 261 } 262 263 } 155 if (k < j){ 156 index1 = triangle.index (k, j); 157 index2 = triangle.index (k, i); 158 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 159 }else if (k > j && k < i){ 160 /* if j < k < i then only one element is updated per row */ 161 index1 = triangle.index (k, j); 162 index2 = triangle.index (k, i); 163 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 164 165 if (index1 == colind[k]){ 166 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 167 set_minrow(m, k); 168 }else{ 169 /* if the changed element is smaller than the current minimum update minrow and colind */ 170 if (m.matrix[index1] < minrow[k]){ 171 minrow[k] = m.matrix[index1]; 172 colind[k] = index1; 173 } 174 } 175 }else if (k > i && k < n){ 176 index1 = triangle.index (k, j); 177 index2 = triangle.index (k, i); 178 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 179 //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 180 181 if (index1 == colind[k]){ 182 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 183 set_minrow(m, k); 184 }else{ 185 /* if the changed element is smaller than the current minimum update minrow and colind */ 186 if (m.matrix[index1] < minrow[k]){ 187 minrow[k] = m.matrix[index1]; 188 colind[k] = index1; 189 } 190 } 191 192 index1 = triangle.index (k, i); 193 index2 = triangle.index (k, n); 194 m.matrix[index1] = m.matrix[index2]; 195 196 if (index1 == colind[k]){ 197 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 198 set_minrow(m, k); 199 }else{ 200 /* if the changed element is smaller than the current minimum update minrow and colind */ 201 if (m.matrix[index1] < minrow[k]){ 202 minrow[k] = m.matrix[index1]; 203 colind[k] = index1; 204 } 205 } 206 }else if(k == n && i != n){ 207 index1 = triangle.index (k, j); 208 index2 = triangle.index (k, i); 209 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 210 211 if (index1 == colind[k]){ 212 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 213 set_minrow(m, k); 214 }else{ 215 /* if the changed element is smaller than the current minimum update minrow and colind */ 216 if (m.matrix[index1] < minrow[k]){ 217 minrow[k] = m.matrix[index1]; 218 colind[k] = index1; 219 } 220 } 221 222 for (c = 0; c < i; c++){ 223 index1 = triangle.index (c, i); 224 index2 = triangle.index (c, n); 225 m.matrix[index1] = m.matrix[index2]; 226 } 227 228 } 264 229 } 265 set_minrow(m, i); 266 set_minrow(m, j); 267 } 268 230 set_minrow(m, i); 231 set_minrow(m, j); 232 } 269 233 } 270 234 -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Iterative.java
r8189 r8281 6 6 { 7 7 Vector vector; 8 int VECTOR_SIZE;9 8 int wordCount; 9 10 10 Iterative(int vsize) 11 11 { 12 VECTOR_SIZE= vsize;12 wordCount = vsize; 13 13 } 14 14 15 15 public void printVector(double[] a) 16 16 { 17 17 int i; 18 19 for (i = 0; i < VECTOR_SIZE; i++)18 19 for (i = 0; i < wordCount; i++) 20 20 System.out.println("v[" + i + "] = " + a[i]); 21 21 } 22 22 23 23 24 void cluster_center (Cluster cluster, double [][] vectors, double[] centroid) 24 void cluster_center (Cluster cluster, float [][] vectors, float[] centroid) 25 { 26 int i; 27 28 if (cluster.id == -1){ 29 cluster_center (cluster.child_1, vectors, centroid); 30 cluster_center (cluster.child_2, vectors, centroid); 31 }else{ 32 float[] doubleArray = vectors[cluster.id]; 33 for (i = 0; i < doubleArray.length; i++) 34 centroid[i] += doubleArray[i]; 35 } 36 } 37 38 void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, float [][] vectors) 39 { 40 int i, j; double ssum; 41 Cluster clust; 42 43 44 /**** for each hierarchical cluster, compute centroid ****/ 45 for (i = 0; i < num_of_clusters; i++){ 46 clust = (Cluster)clusters.contents.elementAt(i); 47 clust.centroid = new float[wordCount]; 48 cluster_center (clust, vectors, clust.centroid); 49 50 for (ssum = j = 0; j < wordCount; j++) 51 ssum += clust.centroid[j] * clust.centroid[j]; 52 53 if(ssum>0) // normalise 54 for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < wordCount; j++) 55 clust.centroid[j] *= ssum; 56 } 57 } 58 59 60 void place_item (ClusterArray clusters, int num_of_clusters, float [][] vectors, int item_id, Distance sim_func) 61 { 62 Cluster clust = (Cluster)clusters.contents.elementAt(0); 63 double minDist = sim_func.get(clust.centroid,vectors[item_id]); 64 int x_minDist = 0; 65 66 for (int i = 1; i < num_of_clusters; i++){ 67 clust = (Cluster)clusters.contents.elementAt(i); 68 double dist = sim_func.get(clust.centroid,vectors[item_id]); 69 70 if (dist < minDist){ 71 x_minDist = i; 72 minDist = dist; 73 } 74 } 75 76 //System.out.println("in " + x_minDist); 77 clust = (Cluster)clusters.contents.elementAt(x_minDist); 78 clust.iter_items.addElement(new Integer(item_id)); 79 clust.iter_items_num++; 80 } 81 82 83 // add all documents not yet added to the existing cluster hierarchy (which is based on the 84 // limit first documents. Hence there are vector.length - limit docs to add 85 // consequently index runs from limit (start_index) to vector.length (end_index) 86 void clustering (ClusterArray clusters, int[] num_of_clusters, float[][] vectors, int start_index, int end_index,Distance sim_func) 25 87 { 26 88 int i; 27 89 28 if (cluster._id == -1) 29 { 30 cluster_center (cluster._child_1, vectors, centroid); 31 cluster_center (cluster._child_2, vectors, centroid);32 } 33 else 34 { 35 double[] doubleArray = vectors[cluster._id]; 36 for (i = 0; i < doubleArray.length; i++) 37 centroid[i] += doubleArray[i];38 90 System.out.println("Start: " + start_index + " Finish: " + end_index); 91 92 if (start_index >= end_index) 93 return; 94 95 compute_cluster_centers(clusters, num_of_clusters[0], vectors); 96 97 for (i = start_index; i < end_index; i++){ 98 System.out.println("Placing document " + i); 99 place_item(clusters, num_of_clusters[0], vectors, i, sim_func); 100 } 39 101 } 40 41 void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, double [][] vectors)42 {43 int i, j; double ssum;44 Cluster clust;45 46 for (i = 0; i < num_of_clusters; i++)47 {48 clust = (Cluster)clusters.contents.elementAt(i);49 clust._centroid = new double[VECTOR_SIZE];50 cluster_center (clust, vectors, clust._centroid);51 for (ssum = j = 0; j < VECTOR_SIZE; j++)52 ssum += clust._centroid[j] * clust._centroid[j];53 if(ssum>0) // normalise54 for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < VECTOR_SIZE; j++)55 clust._centroid[j] *= ssum;56 }57 }58 59 void place_item (ClusterArray clusters,60 int num_of_clusters,61 double [][] vectors,62 int item_id, Distance sim_func)63 {64 double max_sim, sim;65 int max_sim_index;66 67 int i;68 69 max_sim_index = 0;70 Cluster clust = (Cluster)clusters.contents.elementAt(0);71 max_sim = sim_func.get(clust._centroid,72 vectors[item_id], VECTOR_SIZE);73 74 for (i = 1; i < num_of_clusters; i++)75 {76 clust = (Cluster)clusters.contents.elementAt(i);77 sim = sim_func.get(clust._centroid,78 vectors[item_id], VECTOR_SIZE);79 80 if (sim < max_sim)81 {82 max_sim_index = i;83 max_sim = sim;84 }85 }86 87 clust = (Cluster)clusters.contents.elementAt(max_sim_index);88 clust._iter_items.addElement(new TheInt(item_id));89 clust._iter_items_num++;90 }91 92 // add all documents not yet added to the existing cluster hierarchy (which is based on the93 // limit first documents. Hence there are vector.length - limit docs to add94 // consequently index runs from limit (start_index) to vector.length (end_index)95 void clustering (ClusterArray clusters, TheInt num_of_clusters,96 double[][] vectors, int start_index, int end_index,97 Distance sim_func)98 {99 int i;100 101 if (start_index >= end_index)102 return;103 104 compute_cluster_centers(clusters, num_of_clusters.val, vectors);105 106 for (i = start_index; i < end_index; i++)107 place_item(clusters, num_of_clusters.val, vectors, i, sim_func);108 }109 110 102 } -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/LinkFunc.java
r8189 r8281 5 5 public abstract class LinkFunc 6 6 { 7 abstract double link(double a, doubleb);7 abstract float link(float a, float b); 8 8 } -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/MaxLink.java
r8189 r8281 3 3 public class MaxLink extends LinkFunc 4 4 { 5 public double link(double a, doubleb)5 public float link(float a, float b) 6 6 { 7 7 return (a > b? a*99+b: b*99+a)/100; -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/MinLink.java
r8189 r8281 3 3 public class MinLink extends LinkFunc 4 4 { 5 public double link(double a, doubleb)5 public float link(float a, float b) 6 6 { 7 7 return (a < b? a*999+b: b*999+a)/1000; -
trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/SimMatrix.java
r8189 r8281 3 3 import java.util.*; 4 4 5 // kainof DistanceMatrix6 5 public class SimMatrix 7 6 { 8 public double[] matrix; 7 8 public float[] matrix; 9 9 10 10 // "vectors" is the expanded SparseMatrix, i.e. doc x keywords 11 // VECTOR_SIZE = #keywords 12 public SimMatrix(double[][] vectors, 13 int limit, Distance simfunc, int VECTOR_SIZE) 14 { 15 int i,j; 16 double iarray[]; 17 18 // size = #docs 19 int size=vectors.length, index=0; 20 21 // to exclude diagonal and conjugates 22 // - - - - - 23 // | 24 // * | 25 // * * | 26 // * * * | 27 int realSize = (size * (size - 1)/2); 28 matrix=new double[realSize]; 29 System.out.println("Similarity matrix size: " + size); 30 31 for(i = 1; i < size; i++) 11 public SimMatrix(float[][] KDMatrix, int limit, Distance simfunc){ 12 13 float iarray[]; 14 15 int hitDocs = KDMatrix.length; 16 int wordCount = KDMatrix[0].length; 17 int index = 0; 18 19 // to exclude diagonal and conjugates 20 // - - - - - 21 // | 22 // * | 23 // * * | 24 // * * * | 25 26 int realSize = (hitDocs * (hitDocs - 1)/2); 27 matrix = new float[realSize]; 28 29 // this gets the sim matrix for all docs rather 30 // than only the first limit - mistake? 31 32 for(int i = 1; i < hitDocs; i++){ 33 iarray = KDMatrix[i]; 34 for(int j = 0; j < i; j++){ 35 matrix[index] = simfunc.get(iarray,KDMatrix[j]); 36 index++; 37 } 38 } 39 } 40 41 public void printMatrix() 32 42 { 33 iarray=vectors[i]; 34 for(j = 0; j < i; j++) 35 { 36 matrix[index] = simfunc.get(iarray, 37 vectors[j], 38 VECTOR_SIZE); 39 index++; 40 } 43 int countA = 0; 44 int countB = 0; 45 46 for( int i = 0; i < matrix.length; i++ ){ 47 System.out.print(matrix[i] + " "); 48 if( countA >= countB ){ 49 System.out.println(); 50 countB++; 51 countA = 0; 52 } 53 else countA++; 54 } 41 55 } 42 }43 44 56 } 45 57 46 58 59 60 61 62 63 64 65 66 67 68 69 70 -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/AveLink.java
r8189 r8281 3 3 public class AveLink extends LinkFunc 4 4 { 5 double link(double a, doubleb)5 float link(float a, float b) 6 6 { 7 7 return (a+b)/2; -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Cluster.java
r8189 r8281 5 5 public class Cluster 6 6 { 7 public int _id;8 public Cluster _parent = null;9 public Cluster _child_1 = null;10 public Cluster _child_2 = null;11 12 public double _children_sim;13 14 // probably the size of thecluster15 public int _items;16 17 public int _split;18 7 public int id; 8 public Cluster parent = null; 9 public Cluster child_1 = null; 10 public Cluster child_2 = null; 11 12 public double children_sim; 13 14 // size of cluster 15 public int items; 16 17 public int split; 18 19 19 // [] -> centroid for each vector element ? 20 public double[] _centroid=null; 21 22 public Vector _iter_items = null; 23 public int _iter_items_num = 0; 24 25 public int _x = 0; 26 public int _y = 0; 27 public double[] getCentroid(){return _centroid;} 28 29 public Cluster(int id, Cluster parent, Cluster child_1, Cluster child_2, double children_sim, int items) 30 { 31 _id = id; 32 _children_sim = children_sim; 33 _items = items; 34 _split = 0; 35 _parent = parent; 36 _iter_items = new Vector(0); 20 public float[] centroid=null; 21 22 public Vector iter_items = null; 23 public int iter_items_num = 0; 24 25 public int x = 0; 26 public int y = 0; 27 28 public float[] getCentroid(){return centroid;} 29 30 public Cluster(int id, Cluster parent, Cluster c1, Cluster c2, double children_sim, int items) 31 { 32 this.id = id; 33 this.children_sim = children_sim; 34 this.items = items; 35 split = 0; 36 this.parent = parent; 37 iter_items = new Vector(0); 37 38 38 39 // probably to get a balanced tree 39 if (child_1 != null && child_2 != null) 40 { 41 if (child_2._items > child_1._items) 42 { 43 _child_1 = child_1; 44 _child_2 = child_2; 45 } 46 else 47 { 48 _child_1 = child_2; 49 _child_2 = child_1; 50 } 51 } 52 else // if either or both are null 53 { 54 _child_1 = child_1; 55 _child_2 = child_2; 40 if (c1 != null && c2 != null){ 41 if (c2.items > c1.items){ 42 child_1 = c1; 43 child_2 = c2; 44 }else{ 45 child_1 = c2; 46 child_2 = c1; 56 47 } 57 } 58 48 }else{ // if either or both are null 49 child_1 = c1; 50 child_2 = c2; 51 } 52 } 53 59 54 // a nice recursive function to determine the depth of the hierarchy 60 public int depth() 61 { 62 if (_id != -1) 63 return 0; 64 else 65 return Math.max(1 + _child_1.depth(), 1 + _child_2.depth()); 55 public int depth() 56 { 57 if (id != -1) 58 return 0; 59 else 60 return Math.max(1 + child_1.depth(), 1 + child_2.depth()); 61 } 62 63 64 void addChildrenToOutputVector(int []coding, Vector output) 65 { 66 if (id != -1){ 67 Integer ii = new Integer(coding[id]); 68 output.addElement(ii); 69 }else{ 70 child_1.addChildrenToOutputVector(coding,output); 71 child_2.addChildrenToOutputVector(coding,output); 66 72 } 67 68 void addChildrenToOutputVector(int []coding, Vector output)69 {70 if (_id != -1)71 {72 Integer ii = new Integer(coding[_id]);73 output.addElement(ii);74 75 }76 else77 {78 _child_1.addChildrenToOutputVector(coding,output);79 _child_2.addChildrenToOutputVector(coding,output);80 }81 73 } 82 83 74 } 84 75 -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/ClusterArray.java
r8189 r8281 6 6 public class ClusterArray 7 7 { 8 Vector contents; 9 10 ClusterArray(int num_of_clusters) 11 { 12 int i; 13 14 contents = new Vector(num_of_clusters); 15 16 for (i = 0; i < num_of_clusters; i++) 17 contents.addElement(new Cluster(i, null, null, null, 0, 1)); 18 // i.e. id = i, similarity between kids = 0, #items = 1 8 Vector contents; 9 10 ClusterArray(int num_of_clusters) 11 { 12 contents = new Vector(num_of_clusters); 13 14 for(int i = 0; i < num_of_clusters; i++) 15 contents.addElement(new Cluster(i, null, null, null, 0, 1)); 16 } 17 18 19 // counts the number of clusters that are less dissimilar than disim 20 private void sweep_tree (Cluster cluster, double disim, int min_c_items, int[] clusters) 21 { 22 cluster.split = 0; 23 24 if (cluster.id == -1){ 25 if (cluster.children_sim <= disim && cluster.items >= min_c_items){ 26 cluster.split = 1; 27 contents.addElement(cluster); 28 clusters[0]++; 29 }else{ 30 sweep_tree (cluster.child_1, disim, min_c_items, clusters); 31 sweep_tree (cluster.child_2, disim, min_c_items, clusters); 32 } 33 }else{ 34 if (cluster.items >= min_c_items){ 35 cluster.split = 1; 36 contents.addElement(cluster); 37 clusters[0]++; 38 } 19 39 } 20 21 // counts the number of clusters that are less dissimilar than disim 22 private void sweep_tree (Cluster cluster, double disim, int min_c_items, TheInt clusters) 23 { 24 cluster._split = 0; 25 26 if (cluster._id == -1) 27 { 28 if (cluster._children_sim <= disim && cluster._items >= min_c_items) 29 { 30 cluster._split = 1; 31 contents.addElement(cluster); 32 clusters.val++; 33 } 34 else 35 { 36 sweep_tree (cluster._child_1, disim, min_c_items, clusters); 37 sweep_tree (cluster._child_2, disim, min_c_items, clusters); 38 } 39 } 40 else 41 { 42 if (cluster._items >= min_c_items) 43 { 44 cluster._split = 1; 45 contents.addElement(cluster); 46 clusters.val++; 47 } 48 } 49 } 40 } 50 41 51 42 52 double split (Cluster cluster, double[] heights, int n, int target, 53 int min_c_items, TheInt clusters) 54 { 55 56 /* 57 Try to split 'cluster', made up of 'n' vectors, into 'target' 58 clusters by incrementing iteratively the mimimum required 59 similarity between the 'cluster\'s' children. The result is an 60 array of clusters, 'cluster_ary' which contains 'clusters' 61 clusters. Obviously the final number of clusters may not be the 62 same as the desired number of clusters. The function return the 63 threshold similarity which resulted in the clusters being split 64 in 'clusters' clusters. 65 */ 66 67 int i; 68 69 if (target < 2) 70 target = 2; 71 72 if (target > n) 73 target = n; 74 75 // height store minimum similarity values 76 // this is then the iterative incrementing loop 77 for (i = target - 2; i <= n - 1; i++) 78 { 79 clusters.val = 0; // simple class for an integer to 80 // be able to pass it by reference (below) 81 contents.removeAllElements(); 82 sweep_tree (cluster, heights[i], min_c_items, clusters); 83 84 if (clusters.val >= target) 85 break; 86 } 87 88 // if target - 2 > n - 1, i.e. if the previous loop has not been 89 // entered at all 90 if (i > n - 1) 91 { 92 /* ignore the min_c_size requirement */ 93 i = target - 1; 94 clusters.val = 0; 95 contents.removeAllElements(); 96 sweep_tree (cluster, heights[i], 1, clusters); 97 } 98 99 return heights[i]; 100 43 double split (Cluster cluster, double[] heights, int n, int target, int min_c_items, int[] clusters) 44 { 45 46 /* 47 Try to split 'cluster', made up of 'n' vectors, into 'target' 48 clusters by incrementing iteratively the mimimum required 49 similarity between the 'cluster\'s' children. The result is an 50 array of clusters, 'cluster_ary' which contains 'clusters' 51 clusters. Obviously the final number of clusters may not be the 52 same as the desired number of clusters. The function return the 53 threshold similarity which resulted in the clusters being split 54 in 'clusters' clusters. 55 */ 56 57 int i; 58 59 if (target < 2) target = 2; 60 61 if (target > n) target = n; 62 63 // height store minimum similarity values 64 // this is then the iterative incrementing loop 65 for (i = target - 2; i <= n - 1; i++){ 66 clusters[0] = 0; 67 contents.removeAllElements(); 68 sweep_tree (cluster, heights[i], min_c_items, clusters); 69 70 if (clusters[0] >= target) break; 71 } 72 73 // if target - 2 > n - 1, i.e. if the previous loop has not been 74 // entered at all 75 if (i > n - 1){ 76 /* ignore the min_c_size requirement */ 77 i = target - 1; 78 clusters[0] = 0; 79 contents.removeAllElements(); 80 sweep_tree (cluster, heights[i], 1, clusters); 81 } 82 83 return heights[i]; 101 84 } 102 103 85 } -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Clustering.java
r8189 r8281 1 1 package vishnu.cluster; 2 2 3 /* Related word hierarchical clustering using a given triangle similarity matrix */ 3 import java.util.*; 4 4 5 import java.util.*;6 5 7 6 public class Clustering … … 12 11 public static final int BUCKSHOT = 0; 13 12 public static final int COMPLETE = 1; 14 15 private Vector [] theResult;16 private double [][] theCentroids;17 18 public Vector [] get TheResults()13 14 private Vector [] clusters; 15 private double [][] centroids; 16 17 public Vector [] getClusters() 19 18 { 20 return theResult;19 return clusters; 21 20 } 22 21 23 public double[][] get TheCentroids()22 public double[][] getCentroids() 24 23 { 25 return theCentroids;24 return centroids; 26 25 } 27 28 29 public Clustering (int linkage, int [][]m, 30 int num_of_vectors, 31 int arraySize, int min_c_items, 32 int target_clusters, int limit) 26 27 28 public Clustering (int linkage, float[][] KDMatrix, int min_c_items, int target_clusters, int limit) 33 29 { 34 35 30 SimMatrix sim_matrix; 36 31 LinkFunc linkage_func=null; 37 32 38 // number of keywords 39 int VECTOR_SIZE = arraySize;33 int wordCount = KDMatrix[0].length; 34 int hitDocs = KDMatrix.length; 40 35 41 36 Cluster toplevel_cluster; 42 37 ClusterArray splitclusters = null; 43 TheInt clusters = new TheInt(0);38 int[] n_clusters = new int[]{0}; 44 39 45 40 double threshold; … … 48 43 49 44 double[] heights; 50 double [][] vectors = intToDoubleMatrix(m); 51 int [] coding = new int[m.length];45 46 int[] coding = new int[hitDocs]; 52 47 53 // populate array with numbers from 0 to #docs-1 54 for (int c =0;c<m.length;c++) coding[c]=c;48 49 for (int c = 0; c < hitDocs; c++) coding[c] = c; 55 50 56 int i, j; 51 switch (linkage){ 52 case COMPLETE_LINKAGE: 53 linkage_func = new MaxLink(); 54 break; 55 case AVERAGE_LINKAGE: 56 linkage_func = new AveLink(); 57 break; 58 case SINGLE_LINKAGE: 59 linkage_func = new MinLink(); 60 break; 61 default: 62 System.exit(0); 63 } 57 64 58 System.out.println("target_clusters " + target_clusters + " limit " + limit); 59 switch (linkage) 60 { 61 case COMPLETE_LINKAGE: 62 linkage_func = new MaxLink(); 63 break; 64 case AVERAGE_LINKAGE: 65 linkage_func = new AveLink(); 66 break; 67 case SINGLE_LINKAGE: 68 linkage_func = new MinLink(); 69 break; 70 default: 71 System.out.println("clustering.c: main(): linkage type "+ 72 linkage +" not defined."); 73 System.exit(0); 74 } 65 66 /**** make sure upper bound does not exceed #docs ****/ 67 68 if(limit > hitDocs) limit = hitDocs; 69 limit = (int)hitDocs/2; 75 70 76 long startTime = System.currentTimeMillis();77 long readTime = System.currentTimeMillis();78 79 80 // if fewer documents than upper cluster limit81 // I think this is an error, should be vectors.length, viz. #documents82 // which was never spotted as limit is always smaller83 // than #keywords84 85 if(limit > vectors[0].length) limit = vectors[0].length;86 71 87 // contains distance matrix between documents, built using 88 // the double vector[i][] list of keyword frequencies 89 90 sim_matrix = new SimMatrix(vectors, limit, sim_func, VECTOR_SIZE); 91 92 long simMatixTime = System.currentTimeMillis(); 93 System.out.println("hierarchical clustering " + limit + "...." ); 94 //System.out.flush(); 95 96 /* an array of heights sorted by descending disim */ 97 // as many heights as there are docs 98 heights = new double[m.length];//[limit - 1]; 99 100 72 /**** get similarity matrix of the first limit docs with each other ****/ 73 74 sim_matrix = new SimMatrix(KDMatrix, limit, sim_func); 75 76 heights = new double[hitDocs]; 77 101 78 Hierarchical hierarchical = new Hierarchical(limit); 102 79 103 toplevel_cluster = hierarchical.clustering (sim_matrix, limit, heights, 104 linkage_func); 105 106 double d[] = toplevel_cluster.getCentroid(); 107 if( d != null ) 108 System.out.println("First element of centroid: " + d[0]); 109 else 110 System.out.println("Centroid null"); 80 toplevel_cluster = hierarchical.clustering(sim_matrix, limit, heights, linkage_func); 81 if( toplevel_cluster == null ) 82 return; 111 83 112 long hierarchicalTime = System.currentTimeMillis(); 84 System.out.println("Clustered items: " + toplevel_cluster.items); 85 Cluster c1 = toplevel_cluster.child_1; 86 Cluster c2 = toplevel_cluster.child_2; 87 System.out.println("Child1: " + c1.items); 88 System.out.println("Child2: " + c2.items); 89 90 splitclusters = new ClusterArray(0); 113 91 114 System.out.println("split into"); 115 //System.out.flush(); 116 splitclusters=new ClusterArray(0); 117 118 threshold = splitclusters.split(toplevel_cluster, heights, limit, 119 target_clusters, 120 min_c_items, clusters); 92 threshold = splitclusters.split(toplevel_cluster, heights, limit, target_clusters, min_c_items, n_clusters); 121 93 122 long splitTime = System.currentTimeMillis(); 123 System.out.println(clusters.val + " clusters threshold (" + 124 threshold + ").... "); 125 //System.out.flush(); 94 System.out.println(n_clusters[0] + " clusters"); 95 System.out.println("Threshold: " + threshold); 96 97 System.out.println("Target_clusters " + target_clusters); 98 System.out.println("Limit " + limit); 99 System.out.println("#docs: " + hitDocs); 100 101 Iterative iter = new Iterative(wordCount); 102 103 iter.clustering(splitclusters, n_clusters, KDMatrix, limit, hitDocs, sim_func); 104 105 System.out.println("Finished iterative clustering"); 126 106 127 System.out.println("iterative clustering............."); 128 System.out.println("target_clusters " + target_clusters + " limit " + 129 limit + " vectors length " + vectors.length); 130 131 System.out.flush(); 132 System.out.println("Cluster Length: " + clusters.val); 133 Iterative iter = new Iterative(VECTOR_SIZE); 134 135 iter.clustering (splitclusters, clusters, vectors, limit, 136 vectors.length, sim_func); 137 long iterativeTime = System.currentTimeMillis(); 107 clusters = new Vector[n_clusters[0]]; 108 centroids = new double[n_clusters[0]][wordCount]; 138 109 139 theResult = new Vector [clusters.val]; 140 theCentroids = new double[clusters.val][arraySize]; 141 142 for (i = 0; i < clusters.val; i++) 143 { 144 theResult[i]=new Vector(); 145 Cluster clust = (Cluster)splitclusters.contents.elementAt(i); 146 clust.addChildrenToOutputVector(coding,theResult[i]); 147 148 for (j = 0; j < clust._iter_items_num; j++) 149 { 150 TheInt ii = (TheInt)clust._iter_items.elementAt(j); 151 Integer jj = new Integer (coding[ii.val]); 152 theResult[i].addElement(jj); 153 } 154 double [] centroid = clust.getCentroid(); 155 for (j = 0; j < arraySize; j++) 156 { 157 theCentroids[i][j]=centroid[j]; 158 } 159 } 160 161 long printTime = System.currentTimeMillis(); 162 System.out.println(" Vector size " + vectors.length + " limit " + limit + " Clusters # " + clusters.val); 163 double printTimeSecs = (printTime-iterativeTime)/1000.0; 164 double iterativeTimeSecs = (iterativeTime - splitTime)/1000.0; 165 double splitTimeSecs = (splitTime - hierarchicalTime)/1000.0; 166 double hierarchicalTimeSecs = (hierarchicalTime - simMatixTime)/1000.0; 167 double simMatixTimeSecs = (simMatixTime - readTime)/1000.0; 168 double readTimeSecs = (readTime - startTime)/1000.0; 169 System.out.println("Timings:"); 170 //System.out.println("Read from file: "+ readTimeSecs+" seconds"); 171 System.out.println("Create Similarity matrix: "+ simMatixTimeSecs+" seconds"); 172 System.out.println("Hierarchical Clustering: "+ hierarchicalTimeSecs+" seconds"); 173 System.out.println("Split Clusters: "+ splitTimeSecs+" seconds"); 174 System.out.println("Iterative Clustering: "+ iterativeTimeSecs+" seconds"); 175 System.out.println("Build results: "+ printTimeSecs+" seconds"); 110 for (int i = 0; i < n_clusters[0]; i++){ 111 clusters[i]=new Vector(); 112 Cluster clust = (Cluster)splitclusters.contents.elementAt(i); 113 clust.addChildrenToOutputVector(coding,clusters[i]); 114 115 for (int j = 0; j < clust.iter_items_num; j++){ 116 Integer ii = (Integer)clust.iter_items.elementAt(j); 117 Integer jj = new Integer (coding[ii.intValue()]); 118 clusters[i].addElement(jj); 119 } 120 121 float [] centroid = clust.getCentroid(); 122 123 for (int j = 0; j < wordCount; j++){ 124 centroids[i][j]=centroid[j]; 125 } 126 } 176 127 } 177 178 static double[][] intToDoubleMatrix(int [][] m)179 {180 int h=m.length;181 double [][] res = null;182 if (h > 0)183 {184 int w = m[0].length;185 res = new double[h][w];186 for(int c=0;c<h;c++)187 for(int d = 0; d<w; d++)188 res[c][d]=(double)m[c][d];189 }190 return res;191 }192 193 128 } -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Distance.java
r8189 r8281 5 5 public abstract class Distance 6 6 { 7 public abstract double get(double[] v1, double[] v2, int size);7 public abstract float get(float[] v1, float[] v2); 8 8 } 9 9 -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/EuclideanDistance.java
r8189 r8281 3 3 public class EuclideanDistance extends Distance 4 4 { 5 public double get(double[] v1, double[] v2, int size)5 public float get(float[] v1, float[] v2) 6 6 { 7 double sum = 0, x; 8 //System.out.println("Array length "+ v1.length); 9 for(int c=0;c<v1.length;c++) 10 { 11 x = v1[c] - v2[c]; 7 float sum = 0; 8 9 for(int c=0;c<v1.length;c++){ 10 float x = v1[c] - v2[c]; 12 11 sum += x*x; 13 12 } 14 return Math.sqrt(sum);13 return (float)Math.sqrt(sum); 15 14 } 16 15 } -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Hierarchical.java
r8189 r8281 3 3 public class Hierarchical 4 4 { 5 public double[] minrow; 6 public int[] colind; 7 //public SimMatrix smatrix; 8 public TriangleIndex triangle; 9 10 public Hierarchical(int matrix_rows) 11 { 12 // create Triangle with "limit" arg = maximum #clusters 13 // an object to help with indexing the cells 14 // in a triangular matrix 15 triangle = new TriangleIndex(matrix_rows); 16 } 17 5 public float[] minrow; 6 public int[] colind; 7 public TriangleIndex triangle; 8 9 public Hierarchical(int matrix_rows){ 10 // create Triangle with "limit" arg = maximum #clusters 11 // an object to help with indexing the cells 12 // in a triangular matrix 13 triangle = new TriangleIndex(matrix_rows); 14 } 15 16 18 17 // set each minrow[i] to the value of the minimum element of that row 19 20 21 18 void set_minrow (SimMatrix m, int row) 19 { 20 // triangular matrix without diagonal, hence zeroth row is all zero 22 21 if(row == 0) return; 23 24 25 22 23 int i; 24 int index_start, index_end; 26 25 27 26 // first column 28 index_start = triangle.index (row, 0); 29 // last column (row(!)-1 because as many rows as columns) 30 index_end = triangle.index (row, row - 1); 31 32 minrow[row] = m.matrix[index_start]; 33 colind[row] = index_start; 34 35 for (i = index_start + 1; i <= index_end; i++) 36 { 37 if (m.matrix[i] < minrow[row]) 38 { 39 minrow[row] = m.matrix[i]; 40 colind[row] = i; 41 } 42 } 27 index_start = triangle.index (row, 0); 28 29 // last column (row(!)-1 because as many rows as columns) 30 index_end = triangle.index (row, row - 1); 31 32 minrow[row] = m.matrix[index_start]; 33 colind[row] = index_start; 34 35 for (i = index_start + 1; i <= index_end; i++){ 36 if (m.matrix[i] < minrow[row]){ 37 minrow[row] = m.matrix[i]; 38 colind[row] = i; 39 } 43 40 } 44 41 } 42 43 45 44 // return the ROW in which the minimum disimilarity resides 46 int get_minrow (int last_row) 47 { 48 int i; 49 int row; 50 double value; 51 52 row = 1; 53 value = minrow[1]; 54 55 for (i = 2; i <= last_row; i++) 56 { 57 if (minrow[i] < value) 58 { 59 row = i; 60 value = minrow[i]; 61 } 45 int get_minrow (int last_row){ 46 int i; 47 int row; 48 float value; 49 50 row = 1; 51 value = minrow[1]; 52 53 for (i = 2; i <= last_row; i++){ 54 if (minrow[i] < value){ 55 row = i; 56 value = minrow[i]; 57 } 62 58 } 63 64 return row; 59 60 return row; 61 } 62 63 64 // num_of_rows is "limit" = max docs to be clustered in hierarchical clustering 65 Cluster clustering (SimMatrix sim_matrix, int num_of_rows, double[] heights, LinkFunc linkage_func) 66 { 67 Cluster child_1; 68 Cluster child_2; 69 Cluster parent=null; 70 ClusterArray cluster_ary; 71 72 int min_row, index, row, col, i; 73 double sim; 74 75 int last_row_index, sim_matrix_size; 76 77 sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2; 78 79 // Place each of the documents in a cluster of each own 80 // there are vector.length docs but only num_of_rows = limit places in the array 81 // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest 82 cluster_ary = new ClusterArray (num_of_rows); 83 last_row_index = num_of_rows - 1; 84 85 // minimum distance for each row of the sim_matrix 86 minrow = new float[num_of_rows]; 87 88 // column index for minimum distance entry 89 colind = new int[num_of_rows]; 90 91 for (i = 1; i < num_of_rows; i++) set_minrow(sim_matrix, i); 92 93 // do all the hierarchical clustering by running over all rows, updating the matrix 94 // as a new cluster is formed each time 95 for (i = 0; i < num_of_rows - 1; i++){ 96 97 // find the row with the lowest disimilarity 98 min_row = get_minrow(last_row_index); 99 sim = minrow[min_row]; 100 101 System.out.println("Min sim: " + sim); 102 103 // the column with the lowest distance in that row 104 index = colind[min_row]; 105 106 // array of heights, DESCENDING disim, i.e. sim goes last 107 heights[num_of_rows - 2 - i] = sim; 108 109 row = triangle.i_ind[index]; 110 col = triangle.j_ind[index]; 111 112 // merges rows and cols to create a smaller sim_matrix 113 merge (row, col, sim_matrix, last_row_index, linkage_func); 114 System.out.println("Merging: " + row + " and " + col); 115 116 sim_matrix_size -= last_row_index; 117 118 child_1 = (Cluster)cluster_ary.contents.elementAt(row); 119 child_2 = (Cluster)cluster_ary.contents.elementAt(col); 120 121 // parents get id = -1 to indicate that they are not representing single docs 122 // all leaves have an id (1 - num_rows) that points to the doc they contain 123 int sum = child_1.items + child_2.items; 124 System.out.println("Create new parent with " + child_1.items + " + " + child_2.items + " leaves"); 125 parent = new Cluster (-1, null, child_1, child_2, sim, child_1.items + child_2.items); 126 127 child_1.parent = parent; 128 child_2.parent = parent; 129 130 cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row); 131 cluster_ary.contents.setElementAt(parent,col); 132 133 last_row_index--; 65 134 } 66 67 // num_of_rows is "limit" = max #clusters 68 Cluster clustering (SimMatrix sim_matrix, int num_of_rows, 69 double[] heights, LinkFunc linkage_func) 70 { 71 Cluster child_1; 72 Cluster child_2; 73 Cluster parent=null; 74 ClusterArray cluster_ary; 75 76 77 int min_row, index, row, col, i; 78 double sim; 79 80 int last_row_index, sim_matrix_size; 81 82 sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2; 83 84 // Place each of the documents in a cluster of each own 85 // there are vector.length docs but only num_of_rows = limit places in the array 86 // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest 87 cluster_ary = new ClusterArray (num_of_rows); 88 last_row_index = num_of_rows - 1; 89 90 // a variable that is a double and exists for each row - what can this be? 91 // the lowest dissimilarity score? 92 minrow = new double[num_of_rows]; 93 94 // a variable that is an int and exists for each row - what can that be? 95 // the document index of the document with that lowest score? 96 colind = new int[num_of_rows]; 97 98 for (i = 1; i < num_of_rows; i++) 99 { 100 set_minrow(sim_matrix, i); 101 } 102 103 // do all the hierarchical clustering by running over all rows, updating the matrix 104 // as a new cluster is formed each time 105 for (i = 0; i < num_of_rows - 1; i++) 106 { 107 // find the row with the lowest disimilarity 108 min_row = get_minrow(last_row_index); 109 sim = minrow[min_row]; 110 111 // the column with the lowest dis... (sim) in that row 112 index = colind[min_row]; 113 114 // array of heights, DESCENDING disim, i.e. sim goes last 115 heights[num_of_rows - 2 - i] = sim; 116 117 row = triangle.i_ind[index]; 118 col = triangle.j_ind[index]; 119 120 merge (row, col, sim_matrix, last_row_index, linkage_func); 121 122 sim_matrix_size -= last_row_index; 123 124 child_1 = (Cluster)cluster_ary.contents.elementAt(row); 125 child_2 = (Cluster)cluster_ary.contents.elementAt(col); 126 127 // parents get id = -1 to indicate that they are not representing single docs 128 // all leaves have an id (1 - num_rows) that points to the doc they contain 129 parent = new Cluster (-1, null, child_1, child_2, sim, child_1._items + child_2._items); 130 131 child_1._parent = parent; 132 child_2._parent = parent; 133 134 cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row); 135 cluster_ary.contents.setElementAt(parent,col); 136 137 last_row_index--; 138 } 139 140 return parent; 141 142 } 143 135 136 return parent; 137 } 138 144 139 // form a new cluster from the two most similar documents 145 140 void merge (int i, int j, SimMatrix m, int n, LinkFunc linkage_func) 146 { 147 /* i and j are the matrix elements with maximum similarity */ 148 /* matrix is the triangle similarity matrix with last row index n */ 149 /* linkage_func is a pointer to the linkage function */ 150 151 /* i > j */ 152 153 int c, k, index1, index2; 154 141 { 142 143 /* i and j are the matrix elements with maximum similarity */ 144 /* matrix is the triangle similarity matrix with last row index n */ 145 /* linkage_func is a pointer to the linkage function */ 146 147 /* i > j */ 148 149 int c, k, index1, index2; 150 155 151 // go over all rows 156 for (k = 0; k <= n; k++) 157 { 152 for (k = 0; k <= n; k++){ 153 158 154 // if k < j the affected elements are on row j 159 if (k < j) 160 { 161 index1 = triangle.index (k, j); 162 index2 = triangle.index (k, i); 163 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 164 165 //printf ("1 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 166 } 167 else if (k > j && k < i) 168 { 169 /* if j < k < i then only one element is updated per row */ 170 index1 = triangle.index (k, j); 171 index2 = triangle.index (k, i); 172 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 173 //printf ("2 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, m.matrix[index2]); 174 175 if (index1 == colind[k]) 176 { 177 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 178 set_minrow(m, k); 179 } 180 else 181 { 182 /* if the changed element is smaller than the current minimum update minrow and colind */ 183 if (m.matrix[index1] < minrow[k]) 184 { 185 minrow[k] = m.matrix[index1]; 186 colind[k] = index1; 187 } 188 } 189 } 190 else if (k > i && k < n) 191 { 192 index1 = triangle.index (k, j); 193 index2 = triangle.index (k, i); 194 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 195 //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 196 197 if (index1 == colind[k]) 198 { 199 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 200 set_minrow(m, k); 201 } 202 else 203 { 204 /* if the changed element is smaller than the current minimum update minrow and colind */ 205 if (m.matrix[index1] < minrow[k]) 206 { 207 minrow[k] = m.matrix[index1]; 208 colind[k] = index1; 209 } 210 } 211 212 index1 = triangle.index (k, i); 213 index2 = triangle.index (k, n); 214 m.matrix[index1] = m.matrix[index2]; 215 //printf (" copy [%d, %d] to [%d, %d] = %f\n", k, n, k, i, matrix[index2]); 216 217 if (index1 == colind[k]) 218 { 219 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 220 set_minrow(m, k); 221 } 222 else 223 { 224 /* if the changed element is smaller than the current minimum update minrow and colind */ 225 if (m.matrix[index1] < minrow[k]) 226 { 227 minrow[k] = m.matrix[index1]; 228 colind[k] = index1; 229 } 230 } 231 } 232 else if (k == n && i != n) 233 { 234 index1 = triangle.index (k, j); 235 index2 = triangle.index (k, i); 236 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 237 //printf ("5 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 238 239 if (index1 == colind[k]) 240 { 241 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 242 set_minrow(m, k); 243 } 244 else 245 { 246 /* if the changed element is smaller than the current minimum update minrow and colind */ 247 if (m.matrix[index1] < minrow[k]) 248 { 249 minrow[k] = m.matrix[index1]; 250 colind[k] = index1; 251 } 252 } 253 254 for (c = 0; c < i; c++) 255 { 256 index1 = triangle.index (c, i); 257 index2 = triangle.index (c, n); 258 m.matrix[index1] = m.matrix[index2]; 259 //printf ("copy [%d, %d] to [%d, %d] = %f\n", c, n, c, i, matrix[index2]); 260 261 } 262 263 } 155 if (k < j){ 156 index1 = triangle.index (k, j); 157 index2 = triangle.index (k, i); 158 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 159 }else if (k > j && k < i){ 160 /* if j < k < i then only one element is updated per row */ 161 index1 = triangle.index (k, j); 162 index2 = triangle.index (k, i); 163 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 164 165 if (index1 == colind[k]){ 166 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 167 set_minrow(m, k); 168 }else{ 169 /* if the changed element is smaller than the current minimum update minrow and colind */ 170 if (m.matrix[index1] < minrow[k]){ 171 minrow[k] = m.matrix[index1]; 172 colind[k] = index1; 173 } 174 } 175 }else if (k > i && k < n){ 176 index1 = triangle.index (k, j); 177 index2 = triangle.index (k, i); 178 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 179 //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]); 180 181 if (index1 == colind[k]){ 182 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 183 set_minrow(m, k); 184 }else{ 185 /* if the changed element is smaller than the current minimum update minrow and colind */ 186 if (m.matrix[index1] < minrow[k]){ 187 minrow[k] = m.matrix[index1]; 188 colind[k] = index1; 189 } 190 } 191 192 index1 = triangle.index (k, i); 193 index2 = triangle.index (k, n); 194 m.matrix[index1] = m.matrix[index2]; 195 196 if (index1 == colind[k]){ 197 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 198 set_minrow(m, k); 199 }else{ 200 /* if the changed element is smaller than the current minimum update minrow and colind */ 201 if (m.matrix[index1] < minrow[k]){ 202 minrow[k] = m.matrix[index1]; 203 colind[k] = index1; 204 } 205 } 206 }else if(k == n && i != n){ 207 index1 = triangle.index (k, j); 208 index2 = triangle.index (k, i); 209 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]); 210 211 if (index1 == colind[k]){ 212 /* the previous minimum of the row has been changed so recaculate minimum of whole row */ 213 set_minrow(m, k); 214 }else{ 215 /* if the changed element is smaller than the current minimum update minrow and colind */ 216 if (m.matrix[index1] < minrow[k]){ 217 minrow[k] = m.matrix[index1]; 218 colind[k] = index1; 219 } 220 } 221 222 for (c = 0; c < i; c++){ 223 index1 = triangle.index (c, i); 224 index2 = triangle.index (c, n); 225 m.matrix[index1] = m.matrix[index2]; 226 } 227 228 } 264 229 } 265 set_minrow(m, i); 266 set_minrow(m, j); 267 } 268 230 set_minrow(m, i); 231 set_minrow(m, j); 232 } 269 233 } 270 234 -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Iterative.java
r8189 r8281 6 6 { 7 7 Vector vector; 8 int VECTOR_SIZE;9 8 int wordCount; 9 10 10 Iterative(int vsize) 11 11 { 12 VECTOR_SIZE= vsize;12 wordCount = vsize; 13 13 } 14 14 15 15 public void printVector(double[] a) 16 16 { 17 17 int i; 18 19 for (i = 0; i < VECTOR_SIZE; i++)18 19 for (i = 0; i < wordCount; i++) 20 20 System.out.println("v[" + i + "] = " + a[i]); 21 21 } 22 22 23 23 24 void cluster_center (Cluster cluster, double [][] vectors, double[] centroid) 24 void cluster_center (Cluster cluster, float [][] vectors, float[] centroid) 25 { 26 int i; 27 28 if (cluster.id == -1){ 29 cluster_center (cluster.child_1, vectors, centroid); 30 cluster_center (cluster.child_2, vectors, centroid); 31 }else{ 32 float[] doubleArray = vectors[cluster.id]; 33 for (i = 0; i < doubleArray.length; i++) 34 centroid[i] += doubleArray[i]; 35 } 36 } 37 38 void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, float [][] vectors) 39 { 40 int i, j; double ssum; 41 Cluster clust; 42 43 44 /**** for each hierarchical cluster, compute centroid ****/ 45 for (i = 0; i < num_of_clusters; i++){ 46 clust = (Cluster)clusters.contents.elementAt(i); 47 clust.centroid = new float[wordCount]; 48 cluster_center (clust, vectors, clust.centroid); 49 50 for (ssum = j = 0; j < wordCount; j++) 51 ssum += clust.centroid[j] * clust.centroid[j]; 52 53 if(ssum>0) // normalise 54 for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < wordCount; j++) 55 clust.centroid[j] *= ssum; 56 } 57 } 58 59 60 void place_item (ClusterArray clusters, int num_of_clusters, float [][] vectors, int item_id, Distance sim_func) 61 { 62 Cluster clust = (Cluster)clusters.contents.elementAt(0); 63 double minDist = sim_func.get(clust.centroid,vectors[item_id]); 64 int x_minDist = 0; 65 66 for (int i = 1; i < num_of_clusters; i++){ 67 clust = (Cluster)clusters.contents.elementAt(i); 68 double dist = sim_func.get(clust.centroid,vectors[item_id]); 69 70 if (dist < minDist){ 71 x_minDist = i; 72 minDist = dist; 73 } 74 } 75 76 //System.out.println("in " + x_minDist); 77 clust = (Cluster)clusters.contents.elementAt(x_minDist); 78 clust.iter_items.addElement(new Integer(item_id)); 79 clust.iter_items_num++; 80 } 81 82 83 // add all documents not yet added to the existing cluster hierarchy (which is based on the 84 // limit first documents. Hence there are vector.length - limit docs to add 85 // consequently index runs from limit (start_index) to vector.length (end_index) 86 void clustering (ClusterArray clusters, int[] num_of_clusters, float[][] vectors, int start_index, int end_index,Distance sim_func) 25 87 { 26 88 int i; 27 89 28 if (cluster._id == -1) 29 { 30 cluster_center (cluster._child_1, vectors, centroid); 31 cluster_center (cluster._child_2, vectors, centroid);32 } 33 else 34 { 35 double[] doubleArray = vectors[cluster._id]; 36 for (i = 0; i < doubleArray.length; i++) 37 centroid[i] += doubleArray[i];38 90 System.out.println("Start: " + start_index + " Finish: " + end_index); 91 92 if (start_index >= end_index) 93 return; 94 95 compute_cluster_centers(clusters, num_of_clusters[0], vectors); 96 97 for (i = start_index; i < end_index; i++){ 98 System.out.println("Placing document " + i); 99 place_item(clusters, num_of_clusters[0], vectors, i, sim_func); 100 } 39 101 } 40 41 void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, double [][] vectors)42 {43 int i, j; double ssum;44 Cluster clust;45 46 for (i = 0; i < num_of_clusters; i++)47 {48 clust = (Cluster)clusters.contents.elementAt(i);49 clust._centroid = new double[VECTOR_SIZE];50 cluster_center (clust, vectors, clust._centroid);51 for (ssum = j = 0; j < VECTOR_SIZE; j++)52 ssum += clust._centroid[j] * clust._centroid[j];53 if(ssum>0) // normalise54 for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < VECTOR_SIZE; j++)55 clust._centroid[j] *= ssum;56 }57 }58 59 void place_item (ClusterArray clusters,60 int num_of_clusters,61 double [][] vectors,62 int item_id, Distance sim_func)63 {64 double max_sim, sim;65 int max_sim_index;66 67 int i;68 69 max_sim_index = 0;70 Cluster clust = (Cluster)clusters.contents.elementAt(0);71 max_sim = sim_func.get(clust._centroid,72 vectors[item_id], VECTOR_SIZE);73 74 for (i = 1; i < num_of_clusters; i++)75 {76 clust = (Cluster)clusters.contents.elementAt(i);77 sim = sim_func.get(clust._centroid,78 vectors[item_id], VECTOR_SIZE);79 80 if (sim < max_sim)81 {82 max_sim_index = i;83 max_sim = sim;84 }85 }86 87 clust = (Cluster)clusters.contents.elementAt(max_sim_index);88 clust._iter_items.addElement(new TheInt(item_id));89 clust._iter_items_num++;90 }91 92 // add all documents not yet added to the existing cluster hierarchy (which is based on the93 // limit first documents. Hence there are vector.length - limit docs to add94 // consequently index runs from limit (start_index) to vector.length (end_index)95 void clustering (ClusterArray clusters, TheInt num_of_clusters,96 double[][] vectors, int start_index, int end_index,97 Distance sim_func)98 {99 int i;100 101 if (start_index >= end_index)102 return;103 104 compute_cluster_centers(clusters, num_of_clusters.val, vectors);105 106 for (i = start_index; i < end_index; i++)107 place_item(clusters, num_of_clusters.val, vectors, i, sim_func);108 }109 110 102 } -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/LinkFunc.java
r8189 r8281 5 5 public abstract class LinkFunc 6 6 { 7 abstract double link(double a, doubleb);7 abstract float link(float a, float b); 8 8 } -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/MaxLink.java
r8189 r8281 3 3 public class MaxLink extends LinkFunc 4 4 { 5 public double link(double a, doubleb)5 public float link(float a, float b) 6 6 { 7 7 return (a > b? a*99+b: b*99+a)/100; -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/MinLink.java
r8189 r8281 3 3 public class MinLink extends LinkFunc 4 4 { 5 public double link(double a, doubleb)5 public float link(float a, float b) 6 6 { 7 7 return (a < b? a*999+b: b*999+a)/1000; -
trunk/gsdl3/packages/vishnu/src/vishnu/cluster/SimMatrix.java
r8189 r8281 3 3 import java.util.*; 4 4 5 // kainof DistanceMatrix6 5 public class SimMatrix 7 6 { 8 public double[] matrix; 7 8 public float[] matrix; 9 9 10 10 // "vectors" is the expanded SparseMatrix, i.e. doc x keywords 11 // VECTOR_SIZE = #keywords 12 public SimMatrix(double[][] vectors, 13 int limit, Distance simfunc, int VECTOR_SIZE) 14 { 15 int i,j; 16 double iarray[]; 17 18 // size = #docs 19 int size=vectors.length, index=0; 20 21 // to exclude diagonal and conjugates 22 // - - - - - 23 // | 24 // * | 25 // * * | 26 // * * * | 27 int realSize = (size * (size - 1)/2); 28 matrix=new double[realSize]; 29 System.out.println("Similarity matrix size: " + size); 30 31 for(i = 1; i < size; i++) 11 public SimMatrix(float[][] KDMatrix, int limit, Distance simfunc){ 12 13 float iarray[]; 14 15 int hitDocs = KDMatrix.length; 16 int wordCount = KDMatrix[0].length; 17 int index = 0; 18 19 // to exclude diagonal and conjugates 20 // - - - - - 21 // | 22 // * | 23 // * * | 24 // * * * | 25 26 int realSize = (hitDocs * (hitDocs - 1)/2); 27 matrix = new float[realSize]; 28 29 // this gets the sim matrix for all docs rather 30 // than only the first limit - mistake? 31 32 for(int i = 1; i < hitDocs; i++){ 33 iarray = KDMatrix[i]; 34 for(int j = 0; j < i; j++){ 35 matrix[index] = simfunc.get(iarray,KDMatrix[j]); 36 index++; 37 } 38 } 39 } 40 41 public void printMatrix() 32 42 { 33 iarray=vectors[i]; 34 for(j = 0; j < i; j++) 35 { 36 matrix[index] = simfunc.get(iarray, 37 vectors[j], 38 VECTOR_SIZE); 39 index++; 40 } 43 int countA = 0; 44 int countB = 0; 45 46 for( int i = 0; i < matrix.length; i++ ){ 47 System.out.print(matrix[i] + " "); 48 if( countA >= countB ){ 49 System.out.println(); 50 countB++; 51 countA = 0; 52 } 53 else countA++; 54 } 41 55 } 42 }43 44 56 } 45 57 46 58 59 60 61 62 63 64 65 66 67 68 69 70
Note:
See TracChangeset
for help on using the changeset viewer.