Changeset 8281 for trunk/gsdl3

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/AveLink.java

r8189	r8281
3	3	public class AveLink extends LinkFunc
4	4	{
5		~~double link(double a, double~~ b)
	5	float link(float a, float b)
6	6	{
7	7	return (a+b)/2;

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Cluster.java

-              r8189
+              r8281
 public class Cluster
+{
     public int _id;
     public Cluster  _parent = null;
     public Cluster  _child_1 = null;
     public Cluster  _child_2 = null;
     public double _children_sim;
     // probably the size of the cluster
     public int _items;
     public int _split;
+    public int id;
+    public Cluster  parent = null;
+    public Cluster  child_1 = null;
+    public Cluster  child_2 = null;
+    public double children_sim;
+    // size of cluster
+    public int items;
+    public int split;
     // [] -> centroid for each vector element ?
+    public double[] _centroid=null;
+    public Vector _iter_items = null;
+    public int _iter_items_num = 0;
+    public int _x = 0;
+    public int _y = 0;
+    public double[] getCentroid(){return _centroid;}
+    public Cluster(int id, Cluster parent, Cluster child_1, Cluster child_2, double children_sim, int items)
+    {
+        _id                    = id;
+        _children_sim          = children_sim;
+        _items                 = items;
+        _split                 = 0;
+        _parent                = parent;
+        _iter_items            = new Vector(0);
+    public float[] centroid=null;
+    public Vector iter_items = null;
+    public int iter_items_num = 0;
+    public int x = 0;
+    public int y = 0;
+    public float[] getCentroid(){return centroid;}
+    public Cluster(int id, Cluster parent, Cluster c1, Cluster c2, double children_sim, int items)
+    {
+    this.id               = id;
+    this.children_sim     = children_sim;
+    this.items            = items;
+    split                 = 0;
+    this.parent           = parent;
+        iter_items            = new Vector(0);
         // probably to get a balanced tree
+        if (child_1 != null && child_2 != null)
+        {
+            if (child_2._items > child_1._items)
+            {
+                _child_1 = child_1;
+                _child_2 = child_2;
+            }
+            else
+            {
+                _child_1 = child_2;
+                _child_2 = child_1;
+            }
+        }
+        else // if either or both are null
+        {
+            _child_1 = child_1;
+            _child_2 = child_2;
+    if (c1 != null && c2 != null){
+        if (c2.items > c1.items){
+        child_1 = c1;
+        child_2 = c2;
+        }else{
+            child_1 = c2;
+        child_2 = c1;
+        }
+    }
+    }else{ // if either or both are null
+        child_1 = c1;
+        child_2 = c2;
+    }
+    }
     // a nice recursive function to determine the depth of the hierarchy
+    public int depth()
+    {
+        if (_id != -1)
+            return 0;
+        else
+            return Math.max(1 + _child_1.depth(), 1 + _child_2.depth());
+    public int depth()
+    {
     // Recursively measures how many merge levels lie below this cluster.
     // Any id other than -1 ends the recursion with 0; Hierarchical.clustering
     // creates merged parents with id -1, ClusterArray creates leaves with id >= 0.
+    if (id != -1)
+        return 0;
+    else
         // One level for this node plus the deeper of the two subtrees.
         // Both children are dereferenced without a null check.
+        return Math.max(1 + child_1.depth(), 1 + child_2.depth());
+    }
+    void addChildrenToOutputVector(int []coding, Vector output)
+    {
+    if (id != -1){
+            Integer ii = new Integer(coding[id]);
+            output.addElement(ii);
+    }else{
+            child_1.addChildrenToOutputVector(coding,output);
+            child_2.addChildrenToOutputVector(coding,output);
+    }
-    void addChildrenToOutputVector(int []coding, Vector output)
+    {
-      if (_id != -1)
+          {
-            Integer ii = new Integer(coding[_id]);
-            output.addElement(ii);
+          }
-      else
+          {
-            _child_1.addChildrenToOutputVector(coding,output);
-            _child_2.addChildrenToOutputVector(coding,output);
+          }
+    }
+}

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/ClusterArray.java

-              r8189
+              r8281
 public class ClusterArray
+{
+    Vector contents;
+    ClusterArray(int num_of_clusters)
+    {
+        int i;
+        contents = new Vector(num_of_clusters);
+        for (i = 0; i < num_of_clusters; i++)
+             contents.addElement(new Cluster(i, null, null, null, 0, 1));
+             // i.e. id = i, similarity between kids = 0, #items = 1
+    Vector contents;
+    ClusterArray(int num_of_clusters)
+    {
     // Builds the initial array: one single-item cluster per index 0 .. num_of_clusters - 1.
     // With num_of_clusters == 0 the Vector is simply left empty (used that way before split()).
+    contents = new Vector(num_of_clusters);
+    for(int i = 0; i < num_of_clusters; i++)
         // Cluster(id = i, parent = null, both children = null, children_sim = 0, items = 1)
+        contents.addElement(new Cluster(i, null, null, null, 0, 1));
+    }
+    // counts the number of clusters that are less dissimilar than disim
+    private void sweep_tree (Cluster cluster, double disim, int min_c_items, int[] clusters)
+    {
+    cluster.split = 0;
+    if (cluster.id == -1){
+        if (cluster.children_sim <= disim && cluster.items >= min_c_items){
+        cluster.split = 1;
+        contents.addElement(cluster);
+        clusters[0]++;
+        }else{
+        sweep_tree (cluster.child_1, disim, min_c_items, clusters);
+        sweep_tree (cluster.child_2, disim, min_c_items, clusters);
+        }
+        }else{
+        if (cluster.items >= min_c_items){
+        cluster.split = 1;
+        contents.addElement(cluster);
+        clusters[0]++;
+        }
+    }
+    // counts the number of clusters that are less dissimilar than disim
+    private void sweep_tree (Cluster cluster, double disim, int min_c_items, TheInt clusters)
+    {
         // Walks the tree from 'cluster' downwards and appends to 'contents' every node
         // accepted by the tests below; clusters.val is incremented once per appended node.
         // Neither 'contents' nor clusters.val is reset here - split() does that before calling.
         // Clear the mark first; it is set back to 1 only if this node is collected.
+        cluster._split = 0;
         // id -1 marks a merged (parent) node, see Hierarchical.clustering.
+        if (cluster._id == -1)
+        {
             // Take the whole subtree as one cluster when its two children were joined at a
             // dissimilarity no greater than 'disim' and it holds at least min_c_items items.
+            if (cluster._children_sim <= disim && cluster._items >= min_c_items)
+            {
+                cluster._split = 1;
+                contents.addElement(cluster);
+                clusters.val++;
+            }
+            else
+            {
                 // Otherwise look for acceptable clusters further down, in both children.
+                sweep_tree (cluster._child_1, disim, min_c_items, clusters);
+                sweep_tree (cluster._child_2, disim, min_c_items, clusters);
+            }
+        }
+        else
+        {
             // Node with a real id: collected only if it meets the size requirement;
             // otherwise nothing is added for it.
+            if (cluster._items >= min_c_items)
+            {
+                cluster._split = 1;
+                contents.addElement(cluster);
+                clusters.val++;
+            }
+        }
+    }
+    }
+    double split (Cluster cluster, double[] heights, int n, int target,
+                   int min_c_items, TheInt clusters)
+    {
+  /*
+     Try to split 'cluster', made up of 'n' vectors, into 'target'
+     clusters by incrementing iteratively the mimimum required
+     similarity between the 'cluster\'s' children. The result is an
+     array of clusters, 'cluster_ary' which contains 'clusters'
+     clusters.  Obviously the final number of clusters may not be the
+     same as the desired number of clusters. The function return the
+     threshold similarity which resulted in the clusters being split
+     in 'clusters' clusters.
+  */
+      int i;
+      if (target < 2)
+              target = 2;
+      if (target > n)
+              target = n;
+      // height store minimum similarity values
+      // this is then the iterative incrementing loop
+      for (i = target - 2; i <= n - 1; i++)
+      {
+          clusters.val = 0; // simple class for an integer to
+                            // be able to pass it by reference (below)
+          contents.removeAllElements();
+          sweep_tree (cluster, heights[i], min_c_items, clusters);
+          if (clusters.val >= target)
+                  break;
+      }
+      // if target - 2 > n - 1, i.e. if the previous loop has not been
+      // entered at all
+      if (i > n - 1)
+      {
+            /* ignore the min_c_size requirement */
+            i = target - 1;
+            clusters.val = 0;
+            contents.removeAllElements();
+            sweep_tree (cluster, heights[i], 1, clusters);
+      }
+      return heights[i];
+    double split (Cluster cluster, double[] heights, int n, int target, int min_c_items, int[] clusters)
+    {
+    /*
+      Try to split 'cluster', made up of 'n' vectors, into 'target'
+      clusters by stepping iteratively through the thresholds stored in
+      'heights' and sweeping the tree with each one. The result is left
+      in this ClusterArray's 'contents', and clusters[0] holds the number
+      of clusters collected. Obviously the final number of clusters may
+      not be the same as the desired number of clusters. The function
+      returns the threshold (heights[i]) that was used for the final
+      sweep, i.e. the one that produced the clusters now in 'contents'.
+
+      'target' is clamped to the range [2, n] before use.
+      'min_c_items' is the minimum size a cluster must have to be
+      collected; it is dropped to 1 in the fallback sweep below.
+    */
+    int i;
+    if (target < 2) target = 2;
+    if (target > n) target = n;
     // NOTE(review): with n < 2 the clamping leaves target < 2, so the start index
     // target - 2 is negative and heights[i] is read out of bounds - confirm callers
     // never pass n < 2.
+    // heights stores minimum similarity values
+    // this is then the iterative incrementing loop
+    for (i = target - 2; i <= n - 1; i++){
         // Each attempt starts from an empty result.
+        clusters[0] = 0;
+        contents.removeAllElements();
+        sweep_tree (cluster, heights[i], min_c_items, clusters);
+        if (clusters[0] >= target) break;
+    }
+    // i > n - 1 means the loop above ran out of heights without ever
+    // collecting 'target' clusters (or was never entered at all)
+    if (i > n - 1){
+        /* ignore the min_c_items requirement */
+        i = target - 1;
+        clusters[0] = 0;
+        contents.removeAllElements();
+        sweep_tree (cluster, heights[i], 1, clusters);
+    }
+    return heights[i];
+    }
+}

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Clustering.java

-              r8189
+              r8281
 package vishnu.cluster;
+/* Related word hierarchical clustering using a given triangle similarity matrix */
+import java.util.*;
-import java.util.*;
 public class Clustering
 …
     public static final int BUCKSHOT = 0;
     public static final int COMPLETE = 1;
     private Vector [] theResult;
     private double [][] theCentroids;
     public Vector [] getTheResults()
+    private Vector [] clusters;
+    private double [][] centroids;
+    public Vector [] getClusters()
+    {
         return theResult;
+        return clusters;
+    }
     public double[][] getTheCentroids()
+    public double[][] getCentroids()
+    {
         return theCentroids;
+        return centroids;
+    }
+    public Clustering (int linkage, int [][]m,
+               int num_of_vectors,
+               int arraySize, int min_c_items,
+               int target_clusters, int limit)
+    public Clustering (int linkage, float[][] KDMatrix, int min_c_items, int target_clusters, int limit)
+    {
         SimMatrix sim_matrix;
     LinkFunc linkage_func=null;
+    // number of keywords
         int VECTOR_SIZE = arraySize;
+        int wordCount = KDMatrix[0].length;
+    int hitDocs = KDMatrix.length;
     Cluster toplevel_cluster;
         ClusterArray splitclusters = null;
     TheInt clusters = new TheInt(0);
+    int[] n_clusters = new int[]{0};
         double threshold;
 …
     double[] heights;
+        double [][] vectors = intToDoubleMatrix(m);
         int [] coding = new int[m.length];
+        int[] coding = new int[hitDocs];
+        // populate array with numbers from 0 to #docs-1
         for (int c=0;c<m.length;c++) coding[c]=c;
+        for (int c = 0; c < hitDocs; c++) coding[c] = c;
+    int i, j;
+    switch (linkage){
+    case COMPLETE_LINKAGE:
+        linkage_func = new MaxLink();
+        break;
+    case AVERAGE_LINKAGE:
+        linkage_func = new AveLink();
+        break;
+    case SINGLE_LINKAGE:
+        linkage_func = new MinLink();
+        break;
+    default:
+        System.exit(0);
+    }
+        System.out.println("target_clusters " + target_clusters + " limit " + limit);
+    switch (linkage)
+        {
+            case COMPLETE_LINKAGE:
+            linkage_func = new MaxLink();
+            break;
+            case AVERAGE_LINKAGE:
+            linkage_func = new AveLink();
+            break;
+            case SINGLE_LINKAGE:
+            linkage_func = new MinLink();
+            break;
+            default:
+            System.out.println("clustering.c: main(): linkage type "+
+                   linkage +" not defined.");
+                System.exit(0);
+        }
+        /**** make sure upper bound does not exceed #docs ****/
+    if(limit > hitDocs) limit = hitDocs;
+    limit = (int)hitDocs/2;
-        long startTime = System.currentTimeMillis();
-        long readTime = System.currentTimeMillis();
-        // if fewer documents than upper cluster limit
-        // I think this is an error, should be vectors.length, viz. #documents
-        // which was never spotted as limit is always smaller
-        // than #keywords
-    if(limit > vectors[0].length) limit = vectors[0].length;
+        // contains distance matrix between documents, built using
+        // the double vector[i][] list of keyword frequencies
+        sim_matrix = new SimMatrix(vectors, limit, sim_func, VECTOR_SIZE);
+        long simMatixTime = System.currentTimeMillis();
+        System.out.println("hierarchical clustering " + limit + "...." );
+        //System.out.flush();
+        /* an array of heights sorted by descending disim */
+        // as many heights as there are docs
+        heights = new double[m.length];//[limit - 1];
+    /**** get similarity matrix of the first limit docs with each other ****/
+        sim_matrix = new SimMatrix(KDMatrix, limit, sim_func);
+    heights = new double[hitDocs];
         Hierarchical hierarchical = new Hierarchical(limit);
+        toplevel_cluster = hierarchical.clustering (sim_matrix, limit, heights,
+                                                    linkage_func);
+    double d[] = toplevel_cluster.getCentroid();
+    if( d != null )
+        System.out.println("First element of centroid: " + d[0]);
+    else
+        System.out.println("Centroid null");
+        toplevel_cluster = hierarchical.clustering(sim_matrix, limit, heights, linkage_func);
+    if( toplevel_cluster == null )
+        return;
+    long hierarchicalTime = System.currentTimeMillis();
+        System.out.println("Clustered items: " + toplevel_cluster.items);
+    Cluster c1 = toplevel_cluster.child_1;
+    Cluster c2 = toplevel_cluster.child_2;
+    System.out.println("Child1: " + c1.items);
+    System.out.println("Child2: " + c2.items);
+    splitclusters = new ClusterArray(0);
+    System.out.println("split into");
+    //System.out.flush();
+        splitclusters=new ClusterArray(0);
+    threshold = splitclusters.split(toplevel_cluster, heights, limit,
+                    target_clusters,
+                    min_c_items, clusters);
+    threshold = splitclusters.split(toplevel_cluster, heights, limit, target_clusters, min_c_items, n_clusters);
+        long splitTime = System.currentTimeMillis();
+        System.out.println(clusters.val + " clusters threshold (" +
+                                              threshold  + ").... ");
+        //System.out.flush();
+        System.out.println(n_clusters[0] + " clusters");
+    System.out.println("Threshold: " + threshold);
+        System.out.println("Target_clusters " + target_clusters);
+    System.out.println("Limit " + limit);
+    System.out.println("#docs: " + hitDocs);
+        Iterative iter = new Iterative(wordCount);
+    iter.clustering(splitclusters, n_clusters, KDMatrix, limit, hitDocs, sim_func);
+    System.out.println("Finished iterative clustering");
+    System.out.println("iterative clustering.............");
+        System.out.println("target_clusters " + target_clusters + " limit " +
+                           limit + " vectors length " + vectors.length);
+    System.out.flush();
+    System.out.println("Cluster Length: " + clusters.val);
+        Iterative iter = new Iterative(VECTOR_SIZE);
+    iter.clustering (splitclusters, clusters, vectors, limit,
+                          vectors.length, sim_func);
+        long iterativeTime = System.currentTimeMillis();
+        clusters = new Vector[n_clusters[0]];
+        centroids = new double[n_clusters[0]][wordCount];
+        theResult = new Vector [clusters.val];
+        theCentroids = new double[clusters.val][arraySize];
+    for (i = 0; i < clusters.val; i++)
+        {
+        theResult[i]=new Vector();
+        Cluster clust = (Cluster)splitclusters.contents.elementAt(i);
+        clust.addChildrenToOutputVector(coding,theResult[i]);
+        for (j = 0; j < clust._iter_items_num; j++)
+            {
+                TheInt ii = (TheInt)clust._iter_items.elementAt(j);
+                Integer jj = new Integer (coding[ii.val]);
+                theResult[i].addElement(jj);
+            }
+            double [] centroid = clust.getCentroid();
+            for (j = 0; j < arraySize; j++)
+            {
+                  theCentroids[i][j]=centroid[j];
+            }
+        }
+        long printTime = System.currentTimeMillis();
+        System.out.println(" Vector size  " + vectors.length + " limit " + limit + " Clusters # " + clusters.val);
+        double printTimeSecs = (printTime-iterativeTime)/1000.0;
+        double iterativeTimeSecs = (iterativeTime - splitTime)/1000.0;
+        double splitTimeSecs = (splitTime - hierarchicalTime)/1000.0;
+        double hierarchicalTimeSecs = (hierarchicalTime - simMatixTime)/1000.0;
+        double simMatixTimeSecs = (simMatixTime - readTime)/1000.0;
+        double readTimeSecs = (readTime - startTime)/1000.0;
+        System.out.println("Timings:");
+        //System.out.println("Read from file: "+ readTimeSecs+" seconds");
+        System.out.println("Create Similarity matrix: "+ simMatixTimeSecs+" seconds");
+        System.out.println("Hierarchical Clustering: "+ hierarchicalTimeSecs+" seconds");
+        System.out.println("Split Clusters: "+ splitTimeSecs+" seconds");
+        System.out.println("Iterative Clustering: "+ iterativeTimeSecs+" seconds");
+        System.out.println("Build results: "+ printTimeSecs+" seconds");
+    for (int i = 0; i < n_clusters[0]; i++){
+        clusters[i]=new Vector();
+        Cluster clust = (Cluster)splitclusters.contents.elementAt(i);
+        clust.addChildrenToOutputVector(coding,clusters[i]);
+        for (int j = 0; j < clust.iter_items_num; j++){
+        Integer ii = (Integer)clust.iter_items.elementAt(j);
+        Integer jj = new Integer (coding[ii.intValue()]);
+        clusters[i].addElement(jj);
+        }
+        float [] centroid = clust.getCentroid();
+        for (int j = 0; j < wordCount; j++){
+        centroids[i][j]=centroid[j];
+        }
+    }
+    }
-    static double[][] intToDoubleMatrix(int [][] m)
+    {
-        int h=m.length;
-        double [][] res = null;
-        if (h > 0)
+        {
-          int w = m[0].length;
-          res = new double[h][w];
-          for(int c=0;c<h;c++)
-            for(int d = 0; d<w; d++)
-              res[c][d]=(double)m[c][d];
+        }
-        return res;
+   }
+}

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Distance.java

r8189	r8281
5	5	public abstract class Distance
6	6	{
7		public abstract ~~double get(double[] v1, double[] v2, int size~~);
	7	public abstract float get(float[] v1, float[] v2);
8	8	}
9	9

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/EuclideanDistance.java

r8189	r8281
3	3	public class EuclideanDistance extends Distance
4	4	{
5		public ~~double get(double[] v1, double[] v2, int size~~)
	5	public float get(float[] v1, float[] v2)
6	6	{
7		double sum = 0, x;
8		//System.out.println("Array length "+ v1.length);
9		for(int c=0;c<v1.length;c++)
10		{
11		x = v1[c] - v2[c];
	7	float sum = 0;
	8
	9	for(int c=0;c<v1.length;c++){
	10	float x = v1[c] - v2[c];
12	11	sum += x*x;
13	12	}
14		return Math.sqrt(sum);
	13	return (float)Math.sqrt(sum);
15	14	}
16	15	}

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Hierarchical.java

-              r8189
+              r8281
 public class Hierarchical
+{
+    public double[] minrow;
+    public int[]    colind;
+    //public SimMatrix smatrix;
+    public TriangleIndex triangle;
+    public Hierarchical(int matrix_rows)
+    {
+        // create Triangle with "limit" arg = maximum #clusters
+        // an object to help with indexing the cells
+        // in a triangular matrix
+        triangle = new TriangleIndex(matrix_rows);
+    }
+    public float[] minrow;
+    public int[] colind;
+    public TriangleIndex triangle;
+    public Hierarchical(int matrix_rows){
+    // create Triangle with "limit" arg = maximum #clusters
+    // an object to help with indexing the cells
+    // in a triangular matrix
+    triangle = new TriangleIndex(matrix_rows);
+    }
     // set each minrow[i] to the value of the minimum element of that row
     void set_minrow (SimMatrix m, int row)
+    {
         // triangular matrix without diagonal, hence zeroth row is all zero
+    void set_minrow (SimMatrix m, int row)
+    {
+    // triangular matrix without diagonal, hence zeroth row is all zero
         if(row == 0) return;
         int i;
         int index_start, index_end;
+    int i;
+    int index_start, index_end;
         // first column
+        index_start = triangle.index (row, 0);
+        // last column (row(!)-1 because as many rows as columns)
+        index_end   = triangle.index (row, row - 1);
+        minrow[row] = m.matrix[index_start];
+        colind[row] = index_start;
+        for (i = index_start + 1; i <= index_end; i++)
+        {
+            if (m.matrix[i] < minrow[row])
+            {
+                minrow[row] = m.matrix[i];
+                colind[row] = i;
+            }
+        }
+    index_start = triangle.index (row, 0);
+    // last column (row(!)-1 because as many rows as columns)
+    index_end   = triangle.index (row, row - 1);
+    minrow[row] = m.matrix[index_start];
+    colind[row] = index_start;
+    for (i = index_start + 1; i <= index_end; i++){
+        if (m.matrix[i] < minrow[row]){
+        minrow[row] = m.matrix[i];
+        colind[row] = i;
+        }
+    }
+    }
     // return the ROW in which the minimum dissimilarity resides
+    int get_minrow (int last_row)
+    {
+        int i;
+        int row;
+        double value;
+        row   = 1;
+        value = minrow[1];
+        for (i = 2; i <= last_row; i++)
+        {
+            if (minrow[i] < value)
+            {
+                row = i;
+                value = minrow[i];
+            }
+    int get_minrow (int last_row){
+    int i;
+    int row;
+    float value;
+    row   = 1;
+    value = minrow[1];
+    for (i = 2; i <= last_row; i++){
+        if (minrow[i] < value){
+        row = i;
+        value = minrow[i];
+        }
+        }
+        return row;
+    return row;
+    }
+    // num_of_rows is "limit" = max docs to be clustered in hierarchical clustering
+    Cluster clustering (SimMatrix sim_matrix, int num_of_rows, double[] heights, LinkFunc linkage_func)
+    {
+    Cluster child_1;
+    Cluster child_2;
+    Cluster parent=null;
+    ClusterArray cluster_ary;
+        int min_row, index, row, col, i;
+    double sim;
+        int last_row_index, sim_matrix_size;
+        sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2;
+    // Place each of the documents in a cluster of each own
+    // there are vector.length docs but only num_of_rows = limit places in the array
+    // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest
+    cluster_ary = new ClusterArray (num_of_rows);
+    last_row_index = num_of_rows - 1;
+    // minimum distance for each row of the sim_matrix
+    minrow = new float[num_of_rows];
+    // column index for minimum distance entry
+    colind = new int[num_of_rows];
+    for (i = 1; i < num_of_rows; i++) set_minrow(sim_matrix, i);
+    // do all the hierarchical clustering by running over all rows, updating the matrix
+    // as a new cluster is formed each time
+    for (i = 0; i < num_of_rows - 1; i++){
+        // find the row with the lowest disimilarity
+        min_row = get_minrow(last_row_index);
+        sim = minrow[min_row];
+        System.out.println("Min sim: " + sim);
+        // the column with the lowest distance in that row
+        index  = colind[min_row];
+        // array of heights, DESCENDING disim, i.e. sim goes last
+        heights[num_of_rows - 2 - i] = sim;
+        row = triangle.i_ind[index];
+        col = triangle.j_ind[index];
+        // merges rows and cols to create a smaller sim_matrix
+        merge (row, col, sim_matrix, last_row_index, linkage_func);
+        System.out.println("Merging: " + row + " and " + col);
+        sim_matrix_size -= last_row_index;
+        child_1 = (Cluster)cluster_ary.contents.elementAt(row);
+        child_2 = (Cluster)cluster_ary.contents.elementAt(col);
+        // parents get id = -1 to indicate that they are not representing single docs
+        // all leaves have an id (1 - num_rows) that points to the doc they contain
+        int sum = child_1.items + child_2.items;
+        System.out.println("Create new parent with " + child_1.items + " + " + child_2.items + " leaves");
+        parent = new Cluster (-1, null, child_1, child_2, sim, child_1.items + child_2.items);
+        child_1.parent = parent;
+        child_2.parent = parent;
+        cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row);
+        cluster_ary.contents.setElementAt(parent,col);
+        last_row_index--;
+    }
+    // num_of_rows is "limit" = max #clusters
+    Cluster clustering (SimMatrix sim_matrix, int num_of_rows,
+                            double[] heights, LinkFunc linkage_func)
+    {
         // Repeatedly merges the pair of clusters with the smallest entry in the triangular
         // matrix until one root remains, and returns that root. The value of each merge is
         // written into 'heights'. If num_of_rows < 2 the merge loop never runs and null
         // is returned.
+        Cluster child_1;
+        Cluster child_2;
+        Cluster parent=null;
+        ClusterArray cluster_ary;
+        int min_row, index, row, col, i;
+        double sim;
+        int last_row_index, sim_matrix_size;
         // sim_matrix_size is kept up to date below but is not read anywhere else in this method.
+        sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2;
+        // Place each of the documents in a cluster of its own
+        // there are vector.length docs but only num_of_rows = limit places in the array
+        // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest
+        cluster_ary = new ClusterArray (num_of_rows);
+        last_row_index = num_of_rows - 1;
+        // minrow[r]: the smallest matrix value found in row r,
+        // as filled in by set_minrow
+        minrow = new double[num_of_rows];
+        // colind[r]: the position of that minimum inside sim_matrix.matrix (a flat index,
+        // not a column number); turned into (row, col) via triangle.i_ind / j_ind below
+        colind = new int[num_of_rows];
         // Row 0 is skipped: set_minrow returns immediately for it.
+        for (i = 1; i < num_of_rows; i++)
+        {
+            set_minrow(sim_matrix, i);
+        }
+        // do all the hierarchical clustering by running over all rows, updating the matrix
+        // as a new cluster is formed each time
+        for (i = 0; i < num_of_rows - 1; i++)
+        {
+            // find the row with the lowest dissimilarity
+            min_row = get_minrow(last_row_index);
+            sim = minrow[min_row];
+            // flat matrix index of the lowest dissimilarity in that row
+            index  = colind[min_row];
+            // array of heights, DESCENDING disim, i.e. filled from the back: first merge goes last
+                heights[num_of_rows - 2 - i] = sim;
+                row = triangle.i_ind[index];
+                col = triangle.j_ind[index];
+                merge (row, col, sim_matrix, last_row_index, linkage_func);
+                sim_matrix_size -= last_row_index;
+                child_1 = (Cluster)cluster_ary.contents.elementAt(row);
+                child_2 = (Cluster)cluster_ary.contents.elementAt(col);
+                // parents get id = -1 to indicate that they are not representing single docs
+                // all leaves have an id (0 .. num_of_rows - 1, assigned by ClusterArray) that points to the doc they contain
+                parent = new Cluster (-1, null, child_1, child_2, sim, child_1._items + child_2._items);
+                child_1._parent = parent;
+                child_2._parent = parent;
                 // Slot 'row' is reused for the cluster of the current last row, slot 'col' now
                 // holds the merged parent, and the active range shrinks by one.
+                cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row);
+                cluster_ary.contents.setElementAt(parent,col);
+                last_row_index--;
+            }
         // The parent created by the final merge is the root of the whole hierarchy.
+        return parent;
+        }
+    return parent;
+    }
     // form a new cluster from the two most similar documents
     void merge (int i, int j, SimMatrix m, int n, LinkFunc linkage_func)
+    {
+  /* i and j are the matrix elements with maximum similarity         */
+  /* matrix is the triangle similarity matrix with last row index n  */
+  /* linkage_func is a pointer to the linkage function               */
+  /* i > j */
+        int c, k, index1, index2;
+    {
+    /* i and j are the matrix elements with maximum similarity         */
+    /* matrix is the triangle similarity matrix with last row index n  */
+    /* linkage_func is a pointer to the linkage function               */
+    /* i > j */
+    int c, k, index1, index2;
         // go over all rows
+        for (k = 0; k <= n; k++)
+        {
+    for (k = 0; k <= n; k++){
             // if k < j the affected elements are on row j
+            if (k < j)
+            {
+                index1 = triangle.index (k, j);
+                index2 = triangle.index (k, i);
+                m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+      //printf ("1 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+        }
+            else if (k > j && k < i)
+        {
+      /* if j < k < i then only one element is updated per row */
+            index1 = triangle.index (k, j);
+            index2 = triangle.index (k, i);
+            m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+      //printf ("2 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, m.matrix[index2]);
+            if (index1 == colind[k])
+                {
+          /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+            {
+                minrow[k] = m.matrix[index1];
+                colind[k] = index1;
+            }
+                }
+        }
+            else if (k > i && k < n)
+        {
+            index1 = triangle.index (k, j);
+            index2 = triangle.index (k, i);
+            m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+      //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+            if (index1 == colind[k])
+                {
+          /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+            {
+                minrow[k] = m.matrix[index1];
+                            colind[k] = index1;
+            }
+                }
+            index1 = triangle.index (k, i);
+            index2 = triangle.index (k, n);
+            m.matrix[index1] = m.matrix[index2];
+      //printf (" copy [%d, %d] to [%d, %d] = %f\n", k, n, k, i, matrix[index2]);
+            if (index1 == colind[k])
+                {
+          /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+            {
+                minrow[k] = m.matrix[index1];
+                colind[k] = index1;
+            }
+                }
+        }
+                else if (k == n  && i != n)
+        {
+            index1 = triangle.index (k, j);
+            index2 = triangle.index (k, i);
+            m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+            //printf ("5 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+            if (index1 == colind[k])
+                {
+                /* the previous minimum of the row has been changed so recalculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+                {
+                    minrow[k] = m.matrix[index1];
+                    colind[k] = index1;
+                }
+                }
+            for (c = 0; c < i; c++)
+                {
+                index1 = triangle.index (c, i);
+                index2 = triangle.index (c, n);
+                m.matrix[index1] = m.matrix[index2];
+                //printf ("copy [%d, %d] to [%d, %d] = %f\n", c, n, c, i, matrix[index2]);
+                }
+        }
+        if (k < j){
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        }else if (k > j && k < i){
+        /* if j < k < i then only one element is updated per row */
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recalculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        }else if (k > i && k < n){
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recalculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        index1 = triangle.index (k, i);
+        index2 = triangle.index (k, n);
+        m.matrix[index1] = m.matrix[index2];
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recalculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        }else if(k == n  && i != n){
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recalculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        for (c = 0; c < i; c++){
+            index1 = triangle.index (c, i);
+            index2 = triangle.index (c, n);
+            m.matrix[index1] = m.matrix[index2];
+        }
+        }
+        }
+        set_minrow(m, i);
+        set_minrow(m, j);
+    }
+    set_minrow(m, i);
+    set_minrow(m, j);
+    }
+}

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Iterative.java

-              r8189
+              r8281
+{
     Vector vector;
     int VECTOR_SIZE;
+    int wordCount;
     Iterative(int vsize)
+    {
         VECTOR_SIZE = vsize;
+    wordCount = vsize;
+    }
     public void printVector(double[] a)
+    {
         int i;
         for (i = 0; i < VECTOR_SIZE; i++)
+    for (i = 0; i < wordCount; i++)
             System.out.println("v[" + i + "] = " + a[i]);
+    }
+    void cluster_center (Cluster cluster, double [][] vectors, double[] centroid)
+    void cluster_center (Cluster cluster, float [][] vectors, float[] centroid)
+    {
     /*
      * Adds into 'centroid', element by element, the row of 'vectors'
      * belonging to every cluster below (or equal to) 'cluster' whose id
      * is not -1.  'centroid' is only ever added to, never reset here, so
      * whatever it holds on entry is part of the result.
      */
+        int i;
+        if (cluster.id == -1){
             // id == -1: no vector of its own, so descend into both children
+            cluster_center (cluster.child_1, vectors, centroid);
+            cluster_center (cluster.child_2, vectors, centroid);
+        }else{
             // the id doubles as the row index into 'vectors'
             // ('doubleArray' holds floats in this revision; the name is a leftover)
+            float[] doubleArray = vectors[cluster.id];
+            for (i = 0; i < doubleArray.length; i++)
+        centroid[i] += doubleArray[i];
+        }
+    }
+    void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, float [][] vectors)
+    {
     /*
      * For each of the first 'num_of_clusters' entries of clusters.contents:
      * allocate a fresh centroid of length wordCount, fill it through
      * cluster_center, then scale it to unit Euclidean length.
      */
+    int i, j; double ssum;
+    Cluster clust;
+    /**** for each hierarchical cluster, compute centroid ****/
+    for (i = 0; i < num_of_clusters; i++){
+        clust = (Cluster)clusters.contents.elementAt(i);
         // a new array starts at all zeros, which cluster_center then accumulates into
+        clust.centroid = new float[wordCount];
+        cluster_center (clust, vectors, clust.centroid);
         // ssum = sum of squares of the centroid elements
+        for (ssum = j = 0; j < wordCount; j++)
+        ssum += clust.centroid[j] * clust.centroid[j];
+        if(ssum>0)                // normalise; an all-zero centroid is left untouched
         // ssum is reused as the scale factor 1/sqrt(sum of squares)
+        for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < wordCount; j++)
+            clust.centroid[j] *= ssum;
+    }
+    }
+    void place_item (ClusterArray clusters, int num_of_clusters, float [][] vectors, int item_id,  Distance sim_func)
+    {
     /*
      * Attaches item 'item_id' to the cluster, among the first
      * 'num_of_clusters' entries of clusters.contents, whose centroid gives
      * the smallest sim_func.get(centroid, vectors[item_id]) value.
      * Entry 0 is read unconditionally.
      * NOTE(review): presumably the centroids have already been filled in
      * by compute_cluster_centers - confirm against the caller.
      */
+        Cluster clust = (Cluster)clusters.contents.elementAt(0);
+    double minDist = sim_func.get(clust.centroid,vectors[item_id]);
+    int x_minDist = 0;
+    for (int i = 1; i < num_of_clusters; i++){
+            clust = (Cluster)clusters.contents.elementAt(i);
+            double dist = sim_func.get(clust.centroid,vectors[item_id]);
             // strict '<': on a tie the earlier cluster index is kept
+            if (dist < minDist){
+        x_minDist = i;
+        minDist = dist;
+        }
+        }
+    //System.out.println("in " + x_minDist);
+    clust = (Cluster)clusters.contents.elementAt(x_minDist);
     // record the item on the winning cluster and keep its counter in step
+        clust.iter_items.addElement(new Integer(item_id));
+    clust.iter_items_num++;
+    }
+    // add all documents not yet added to the existing cluster hierarchy (which is based on the
+    // limit first documents. Hence there are vector.length - limit docs to add
+    // consequently index runs from limit (start_index) to vector.length (end_index)
+    void clustering (ClusterArray clusters, int[] num_of_clusters, float[][] vectors, int start_index, int end_index,Distance sim_func)
+    {
         int i;
+        if (cluster._id == -1)
+        {
+            cluster_center (cluster._child_1, vectors, centroid);
             cluster_center (cluster._child_2, vectors, centroid);
+        }
+        else
+        {
+            double[] doubleArray = vectors[cluster._id];
+            for (i = 0; i < doubleArray.length; i++)
                 centroid[i] += doubleArray[i];
+        }
+    System.out.println("Start: " + start_index + " Finish: " + end_index);
+    if (start_index >= end_index)
+        return;
+    compute_cluster_centers(clusters, num_of_clusters[0], vectors);
+    for (i = start_index; i < end_index; i++){
+        System.out.println("Placing document " + i);
+        place_item(clusters, num_of_clusters[0], vectors, i, sim_func);
+    }
+    }
-    void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, double [][] vectors)
+    {
-        int i, j; double ssum;
-        Cluster clust;
-        for (i = 0; i < num_of_clusters; i++)
+        {
-            clust = (Cluster)clusters.contents.elementAt(i);
-            clust._centroid = new double[VECTOR_SIZE];
-            cluster_center (clust, vectors, clust._centroid);
-            for (ssum = j = 0; j < VECTOR_SIZE; j++)
-            ssum += clust._centroid[j] * clust._centroid[j];
-            if(ssum>0)                // normalise
-            for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < VECTOR_SIZE; j++)
-            clust._centroid[j] *= ssum;
+        }
+    }
-    void place_item (ClusterArray clusters,
-                int num_of_clusters,
-                double [][] vectors,
-                int item_id,  Distance sim_func)
+    {
-        double max_sim, sim;
-        int max_sim_index;
-        int i;
-        max_sim_index = 0;
-        Cluster clust = (Cluster)clusters.contents.elementAt(0);
-        max_sim = sim_func.get(clust._centroid,
-              vectors[item_id], VECTOR_SIZE);
-        for (i = 1; i < num_of_clusters; i++)
+        {
-            clust = (Cluster)clusters.contents.elementAt(i);
-            sim = sim_func.get(clust._centroid,
-            vectors[item_id], VECTOR_SIZE);
-            if (sim < max_sim)
+            {
-                max_sim_index = i;
-                max_sim = sim;
+            }
+        }
-        clust = (Cluster)clusters.contents.elementAt(max_sim_index);
-        clust._iter_items.addElement(new TheInt(item_id));
-        clust._iter_items_num++;
+    }
-    // add all documents not yet added to the existing cluster hierarchy (which is based on the
-    // limit first documents. Hence there are vector.length - limit docs to add
-    // consequently index runs from limit (start_index) to vector.length (end_index)
-    void clustering (ClusterArray clusters, TheInt num_of_clusters,
-                      double[][] vectors, int start_index, int end_index,
-                      Distance sim_func)
+    {
-        int i;
-        if (start_index >= end_index)
-                return;
-        compute_cluster_centers(clusters, num_of_clusters.val, vectors);
-        for (i = start_index; i < end_index; i++)
-                place_item(clusters, num_of_clusters.val, vectors, i, sim_func);
+    }
+}

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/LinkFunc.java

r8189	r8281
5	5	public abstract class LinkFunc
6	6	{
7		abstract ~~double link(double a, double~~ b);
	7	abstract float link(float a, float b);
8	8	}

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/MaxLink.java

r8189	r8281
3	3	public class MaxLink extends LinkFunc
4	4	{
5		public ~~double link(double a, double~~ b)
	5	public float link(float a, float b)
6	6	{
7	7	return (a > b? a*99+b: b*99+a)/100;

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/MinLink.java

r8189	r8281
3	3	public class MinLink extends LinkFunc
4	4	{
5		public ~~double link(double a, double~~ b)
	5	public float link(float a, float b)
6	6	{
7	7	return (a < b? a*999+b: b*999+a)/1000;

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/SimMatrix.java

-              r8189
+              r8281
 import java.util.*;
-// kainof DistanceMatrix
 public class SimMatrix
+{
+  public double[] matrix;
+    public float[] matrix;
+  // "vectors" is the expanded SparseMatrix, i.e. doc x keywords
+  // VECTOR_SIZE = #keywords
+  public SimMatrix(double[][] vectors,
+      int limit, Distance simfunc, int VECTOR_SIZE)
+  {
+    int i,j;
+    double iarray[];
+    // size = #docs
+    int size=vectors.length, index=0;
+    // to exclude diagonal and conjugates
+    // - - - - -
+    //         |
+    // *       |
+    // * *     |
+    // * * *   |
+    int realSize = (size * (size - 1)/2);
+    matrix=new double[realSize];
+    System.out.println("Similarity matrix size: " + size);
+    for(i = 1; i < size; i++)
+    public SimMatrix(float[][] KDMatrix, int limit, Distance simfunc){
     /*
      * Fills 'matrix' with simfunc.get(KDMatrix[i], KDMatrix[j]) for every
      * pair j < i, row after row, so pair (i, j) lands at index
      * i*(i-1)/2 + j.  KDMatrix[0] is read, so at least one row is required.
      */
+    float iarray[];
+    int hitDocs = KDMatrix.length;
     // computed but not used again in this constructor
+    int wordCount = KDMatrix[0].length;
+    int index = 0;
+    // only the strict lower triangle (j < i) is stored: no diagonal, no mirrored entries
+    // - - - - -
+    //         |
+    // *       |
+    // * *     |
+    // * * *   |
+    int realSize = (hitDocs * (hitDocs - 1)/2);
+    matrix = new float[realSize];
+    // NOTE: every row of KDMatrix is compared here, not just the first 'limit';
+    // 'limit' is never read in this constructor - mistake?
+    for(int i = 1; i < hitDocs; i++){
+        iarray = KDMatrix[i];
+        for(int j = 0; j < i; j++){
+        matrix[index] = simfunc.get(iarray,KDMatrix[j]);
+            index++;
+        }
+    }
+    }
+    public void printMatrix()
+    {
+        iarray=vectors[i];
+        for(j = 0; j < i; j++)
+        {
+            matrix[index] = simfunc.get(iarray,
+                            vectors[j],
+                            VECTOR_SIZE);
+            index++;
+        }
+    int countA = 0;
+    int countB = 0;
+    for( int i = 0; i < matrix.length; i++ ){
+        System.out.print(matrix[i] + " ");
+        if( countA >= countB ){
+        System.out.println();
+        countB++;
+        countA = 0;
+        }
+        else countA++;
+    }
+    }
+  }
+}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/AveLink.java

r8189	r8281
3	3	public class AveLink extends LinkFunc
4	4	{
5		~~double link(double a, double~~ b)
	5	float link(float a, float b)
6	6	{
7	7	return (a+b)/2;

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Cluster.java

-              r8189
+              r8281
 public class Cluster
+{
     public int _id;
     public Cluster  _parent = null;
     public Cluster  _child_1 = null;
     public Cluster  _child_2 = null;
     public double _children_sim;
     // probably the size of the cluster
     public int _items;
     public int _split;
+    public int id;
+    public Cluster  parent = null;
+    public Cluster  child_1 = null;
+    public Cluster  child_2 = null;
+    public double children_sim;
+    // size of cluster
+    public int items;
+    public int split;
     // [] -> centroid for each vector element ?
+    public double[] _centroid=null;
+    public Vector _iter_items = null;
+    public int _iter_items_num = 0;
+    public int _x = 0;
+    public int _y = 0;
+    public double[] getCentroid(){return _centroid;}
+    public Cluster(int id, Cluster parent, Cluster child_1, Cluster child_2, double children_sim, int items)
+    {
+        _id                    = id;
+        _children_sim          = children_sim;
+        _items                 = items;
+        _split                 = 0;
+        _parent                = parent;
+        _iter_items            = new Vector(0);
+    public float[] centroid=null;
+    public Vector iter_items = null;
+    public int iter_items_num = 0;
+    public int x = 0;
+    public int y = 0;
+    public float[] getCentroid(){return centroid;}
+    public Cluster(int id, Cluster parent, Cluster c1, Cluster c2, double children_sim, int items)
+    {
+    this.id               = id;
+    this.children_sim     = children_sim;
+    this.items            = items;
+    split                 = 0;
+    this.parent           = parent;
+        iter_items            = new Vector(0);
         // probably to get a balanced tree
+        if (child_1 != null && child_2 != null)
+        {
+            if (child_2._items > child_1._items)
+            {
+                _child_1 = child_1;
+                _child_2 = child_2;
+            }
+            else
+            {
+                _child_1 = child_2;
+                _child_2 = child_1;
+            }
+        }
+        else // if either or both are null
+        {
+            _child_1 = child_1;
+            _child_2 = child_2;
+    if (c1 != null && c2 != null){
+        if (c2.items > c1.items){
+        child_1 = c1;
+        child_2 = c2;
+        }else{
+            child_1 = c2;
+        child_2 = c1;
+        }
+    }
+    }else{ // if either or both are null
+        child_1 = c1;
+        child_2 = c2;
+    }
+    }
     // a nice recursive function to determine the depth of the hierarchy
+    public int depth()
+    {
+        if (_id != -1)
+            return 0;
+        else
+            return Math.max(1 + _child_1.depth(), 1 + _child_2.depth());
+    public int depth()
+    {
     // A cluster with a real id (anything but -1) has depth 0; otherwise the
     // depth is one more than that of the deeper child.
     // NOTE(review): assumes child_1 and child_2 are both non-null whenever id == -1.
+    if (id != -1)
+        return 0;
+    else
+        return Math.max(1 + child_1.depth(), 1 + child_2.depth());
+    }
+    void addChildrenToOutputVector(int []coding, Vector output)
+    {
+    if (id != -1){
+            Integer ii = new Integer(coding[id]);
+            output.addElement(ii);
+    }else{
+            child_1.addChildrenToOutputVector(coding,output);
+            child_2.addChildrenToOutputVector(coding,output);
+    }
-    void addChildrenToOutputVector(int []coding, Vector output)
+    {
-      if (_id != -1)
+          {
-            Integer ii = new Integer(coding[_id]);
-            output.addElement(ii);
+          }
-      else
+          {
-            _child_1.addChildrenToOutputVector(coding,output);
-            _child_2.addChildrenToOutputVector(coding,output);
+          }
+    }
+}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/ClusterArray.java

-              r8189
+              r8281
 public class ClusterArray
+{
+    Vector contents;
+    ClusterArray(int num_of_clusters)
+    {
+        int i;
+        contents = new Vector(num_of_clusters);
+        for (i = 0; i < num_of_clusters; i++)
+             contents.addElement(new Cluster(i, null, null, null, 0, 1));
+             // i.e. id = i, similarity between kids = 0, #items = 1
+    Vector contents;
+    ClusterArray(int num_of_clusters)
+    {
     // Fills 'contents' with num_of_clusters single-item clusters.
     // num_of_clusters is also used as the Vector's initial capacity.
+    contents = new Vector(num_of_clusters);
+    for(int i = 0; i < num_of_clusters; i++)
         // arguments: id = i, no parent, no children, children_sim = 0, items = 1
+        contents.addElement(new Cluster(i, null, null, null, 0, 1));
+    }
+    // counts the number of clusters that are less dissimilar than disim
+    private void sweep_tree (Cluster cluster, double disim, int min_c_items, int[] clusters)
+    {
+    cluster.split = 0;
+    if (cluster.id == -1){
+        if (cluster.children_sim <= disim && cluster.items >= min_c_items){
+        cluster.split = 1;
+        contents.addElement(cluster);
+        clusters[0]++;
+        }else{
+        sweep_tree (cluster.child_1, disim, min_c_items, clusters);
+        sweep_tree (cluster.child_2, disim, min_c_items, clusters);
+        }
+        }else{
+        if (cluster.items >= min_c_items){
+        cluster.split = 1;
+        contents.addElement(cluster);
+        clusters[0]++;
+        }
+    }
+    // counts the number of clusters that are less dissimilar than disim
+    private void sweep_tree (Cluster cluster, double disim, int min_c_items, TheInt clusters)
+    {
+        cluster._split = 0;
+        if (cluster._id == -1)
+        {
+            if (cluster._children_sim <= disim && cluster._items >= min_c_items)
+            {
+                cluster._split = 1;
+                contents.addElement(cluster);
+                clusters.val++;
+            }
+            else
+            {
+                sweep_tree (cluster._child_1, disim, min_c_items, clusters);
+                sweep_tree (cluster._child_2, disim, min_c_items, clusters);
+            }
+        }
+        else
+        {
+            if (cluster._items >= min_c_items)
+            {
+                cluster._split = 1;
+                contents.addElement(cluster);
+                clusters.val++;
+            }
+        }
+    }
+    }
+    double split (Cluster cluster, double[] heights, int n, int target,
+                   int min_c_items, TheInt clusters)
+    {
+  /*
+     Try to split 'cluster', made up of 'n' vectors, into 'target'
+     clusters by iteratively incrementing the minimum required
+     similarity between the cluster's children. The result is an
+     array of clusters, 'cluster_ary', which contains 'clusters'
+     clusters.  Obviously the final number of clusters may not be the
+     same as the desired number of clusters. The function returns the
+     threshold similarity which resulted in the clusters being split
+     into 'clusters' clusters.
+  */
+      int i;
+      if (target < 2)
+              target = 2;
+      if (target > n)
+              target = n;
+      // height store minimum similarity values
+      // this is then the iterative incrementing loop
+      for (i = target - 2; i <= n - 1; i++)
+      {
+          clusters.val = 0; // simple class for an integer to
+                            // be able to pass it by reference (below)
+          contents.removeAllElements();
+          sweep_tree (cluster, heights[i], min_c_items, clusters);
+          if (clusters.val >= target)
+                  break;
+      }
+      // if target - 2 > n - 1, i.e. if the previous loop has not been
+      // entered at all
+      if (i > n - 1)
+      {
+            /* ignore the min_c_size requirement */
+            i = target - 1;
+            clusters.val = 0;
+            contents.removeAllElements();
+            sweep_tree (cluster, heights[i], 1, clusters);
+      }
+      return heights[i];
+    double split (Cluster cluster, double[] heights, int n, int target, int min_c_items, int[] clusters)
+    {
+    /*
+      Try to split 'cluster', made up of 'n' vectors, into 'target'
+      clusters by iteratively incrementing the minimum required
+      similarity between the cluster's children. The resulting
+      clusters are collected in 'contents' (called 'cluster_ary' in
+      earlier versions of this comment, presumably) and their number
+      is written to clusters[0], a one-element array used as an
+      in/out counter.  Obviously the final number of clusters may not
+      be the same as the desired number of clusters. The function
+      returns heights[i] for the last threshold index i it tried.
+      'target' is clamped to the range 2..n, with the upper clamp
+      applied last.
+    */
+    int i;
+    if (target < 2) target = 2;
+    if (target > n) target = n;
+    // 'heights' supplies the candidate thresholds; each pass empties 'contents',
+    // re-sweeps the tree with heights[i], and stops once 'target' clusters exist
+    for (i = target - 2; i <= n - 1; i++){
+        clusters[0] = 0;
+        contents.removeAllElements();
+        sweep_tree (cluster, heights[i], min_c_items, clusters);
+        if (clusters[0] >= target) break;
+    }
+    // i == n here means the loop tried every remaining height and never reached
+    // 'target' clusters (the clamps above keep target - 2 below n, so it did run)
+    if (i > n - 1){
+        /* retry once at index target - 1, ignoring the min_c_items requirement (minimum size 1) */
+        i = target - 1;
+        clusters[0] = 0;
+        contents.removeAllElements();
+        sweep_tree (cluster, heights[i], 1, clusters);
+    }
+    return heights[i];
+    }
+}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Clustering.java

-              r8189
+              r8281
 package vishnu.cluster;
+/* Related word hierarchical clustering using a given triangle similarity matrix */
+import java.util.*;
-import java.util.*;
 public class Clustering
 …
     public static final int BUCKSHOT = 0;
     public static final int COMPLETE = 1;
     private Vector [] theResult;
     private double [][] theCentroids;
     public Vector [] getTheResults()
+    private Vector [] clusters;
+    private double [][] centroids;
+    public Vector [] getClusters()
+    {
         return theResult;
+        return clusters;
+    }
     public double[][] getTheCentroids()
+    public double[][] getCentroids()
+    {
         return theCentroids;
+        return centroids;
+    }
+    public Clustering (int linkage, int [][]m,
+               int num_of_vectors,
+               int arraySize, int min_c_items,
+               int target_clusters, int limit)
+    public Clustering (int linkage, float[][] KDMatrix, int min_c_items, int target_clusters, int limit)
+    {
         SimMatrix sim_matrix;
     LinkFunc linkage_func=null;
+    // number of keywords
         int VECTOR_SIZE = arraySize;
+        int wordCount = KDMatrix[0].length;
+    int hitDocs = KDMatrix.length;
     Cluster toplevel_cluster;
         ClusterArray splitclusters = null;
     TheInt clusters = new TheInt(0);
+    int[] n_clusters = new int[]{0};
         double threshold;
 …
     double[] heights;
+        double [][] vectors = intToDoubleMatrix(m);
         int [] coding = new int[m.length];
+        int[] coding = new int[hitDocs];
+        // populate array with numbers from 0 to #docs-1
         for (int c=0;c<m.length;c++) coding[c]=c;
+        for (int c = 0; c < hitDocs; c++) coding[c] = c;
+    int i, j;
+    switch (linkage){
+    case COMPLETE_LINKAGE:
+        linkage_func = new MaxLink();
+        break;
+    case AVERAGE_LINKAGE:
+        linkage_func = new AveLink();
+        break;
+    case SINGLE_LINKAGE:
+        linkage_func = new MinLink();
+        break;
+    default:
+        System.exit(0);
+    }
+        System.out.println("target_clusters " + target_clusters + " limit " + limit);
+    switch (linkage)
+        {
+            case COMPLETE_LINKAGE:
+            linkage_func = new MaxLink();
+            break;
+            case AVERAGE_LINKAGE:
+            linkage_func = new AveLink();
+            break;
+            case SINGLE_LINKAGE:
+            linkage_func = new MinLink();
+            break;
+            default:
+            System.out.println("clustering.c: main(): linkage type "+
+                   linkage +" not defined.");
+                System.exit(0);
+        }
+        /**** make sure upper bound does not exceed #docs ****/
+    if(limit > hitDocs) limit = hitDocs;
+    limit = (int)hitDocs/2;
-        long startTime = System.currentTimeMillis();
-        long readTime = System.currentTimeMillis();
-        // if fewer documents than upper cluster limit
-        // I think this is an error, should be vectors.length, viz. #documents
-        // which was never spotted as limit is always smaller
-        // than #keywords
-    if(limit > vectors[0].length) limit = vectors[0].length;
+        // contains distance matrix between documents, built using
+        // the double vector[i][] list of keyword frequencies
+        sim_matrix = new SimMatrix(vectors, limit, sim_func, VECTOR_SIZE);
+        long simMatixTime = System.currentTimeMillis();
+        System.out.println("hierarchical clustering " + limit + "...." );
+        //System.out.flush();
+        /* an array of heights sorted by descending disim */
+        // as many heights as there are docs
+        heights = new double[m.length];//[limit - 1];
+    /**** get similarity matrix of the first limit docs with each other ****/
+        sim_matrix = new SimMatrix(KDMatrix, limit, sim_func);
+    heights = new double[hitDocs];
         Hierarchical hierarchical = new Hierarchical(limit);
+        toplevel_cluster = hierarchical.clustering (sim_matrix, limit, heights,
+                                                    linkage_func);
+    double d[] = toplevel_cluster.getCentroid();
+    if( d != null )
+        System.out.println("First element of centroid: " + d[0]);
+    else
+        System.out.println("Centroid null");
+        toplevel_cluster = hierarchical.clustering(sim_matrix, limit, heights, linkage_func);
+    if( toplevel_cluster == null )
+        return;
+    long hierarchicalTime = System.currentTimeMillis();
+        System.out.println("Clustered items: " + toplevel_cluster.items);
+    Cluster c1 = toplevel_cluster.child_1;
+    Cluster c2 = toplevel_cluster.child_2;
+    System.out.println("Child1: " + c1.items);
+    System.out.println("Child2: " + c2.items);
+    splitclusters = new ClusterArray(0);
+    System.out.println("split into");
+    //System.out.flush();
+        splitclusters=new ClusterArray(0);
+    threshold = splitclusters.split(toplevel_cluster, heights, limit,
+                    target_clusters,
+                    min_c_items, clusters);
+    threshold = splitclusters.split(toplevel_cluster, heights, limit, target_clusters, min_c_items, n_clusters);
+        long splitTime = System.currentTimeMillis();
+        System.out.println(clusters.val + " clusters threshold (" +
+                                              threshold  + ").... ");
+        //System.out.flush();
+        System.out.println(n_clusters[0] + " clusters");
+    System.out.println("Threshold: " + threshold);
+        System.out.println("Target_clusters " + target_clusters);
+    System.out.println("Limit " + limit);
+    System.out.println("#docs: " + hitDocs);
+        Iterative iter = new Iterative(wordCount);
+    iter.clustering(splitclusters, n_clusters, KDMatrix, limit, hitDocs, sim_func);
+    System.out.println("Finished iterative clustering");
+    System.out.println("iterative clustering.............");
+        System.out.println("target_clusters " + target_clusters + " limit " +
+                           limit + " vectors length " + vectors.length);
+    System.out.flush();
+    System.out.println("Cluster Length: " + clusters.val);
+        Iterative iter = new Iterative(VECTOR_SIZE);
+    iter.clustering (splitclusters, clusters, vectors, limit,
+                          vectors.length, sim_func);
+        long iterativeTime = System.currentTimeMillis();
+        clusters = new Vector[n_clusters[0]];
+        centroids = new double[n_clusters[0]][wordCount];
+        theResult = new Vector [clusters.val];
+        theCentroids = new double[clusters.val][arraySize];
+    for (i = 0; i < clusters.val; i++)
+        {
+        theResult[i]=new Vector();
+        Cluster clust = (Cluster)splitclusters.contents.elementAt(i);
+        clust.addChildrenToOutputVector(coding,theResult[i]);
+        for (j = 0; j < clust._iter_items_num; j++)
+            {
+                TheInt ii = (TheInt)clust._iter_items.elementAt(j);
+                Integer jj = new Integer (coding[ii.val]);
+                theResult[i].addElement(jj);
+            }
+            double [] centroid = clust.getCentroid();
+            for (j = 0; j < arraySize; j++)
+            {
+                  theCentroids[i][j]=centroid[j];
+            }
+        }
+        long printTime = System.currentTimeMillis();
+        System.out.println(" Vector size  " + vectors.length + " limit " + limit + " Clusters # " + clusters.val);
+        double printTimeSecs = (printTime-iterativeTime)/1000.0;
+        double iterativeTimeSecs = (iterativeTime - splitTime)/1000.0;
+        double splitTimeSecs = (splitTime - hierarchicalTime)/1000.0;
+        double hierarchicalTimeSecs = (hierarchicalTime - simMatixTime)/1000.0;
+        double simMatixTimeSecs = (simMatixTime - readTime)/1000.0;
+        double readTimeSecs = (readTime - startTime)/1000.0;
+        System.out.println("Timings:");
+        //System.out.println("Read from file: "+ readTimeSecs+" seconds");
+        System.out.println("Create Similarity matrix: "+ simMatixTimeSecs+" seconds");
+        System.out.println("Hierarchical Clustering: "+ hierarchicalTimeSecs+" seconds");
+        System.out.println("Split Clusters: "+ splitTimeSecs+" seconds");
+        System.out.println("Iterative Clustering: "+ iterativeTimeSecs+" seconds");
+        System.out.println("Build results: "+ printTimeSecs+" seconds");
+    for (int i = 0; i < n_clusters[0]; i++){
+        clusters[i]=new Vector();
+        Cluster clust = (Cluster)splitclusters.contents.elementAt(i);
+        clust.addChildrenToOutputVector(coding,clusters[i]);
+        for (int j = 0; j < clust.iter_items_num; j++){
+        Integer ii = (Integer)clust.iter_items.elementAt(j);
+        Integer jj = new Integer (coding[ii.intValue()]);
+        clusters[i].addElement(jj);
+        }
+        float [] centroid = clust.getCentroid();
+        for (int j = 0; j < wordCount; j++){
+        centroids[i][j]=centroid[j];
+        }
+    }
+    }
-    static double[][] intToDoubleMatrix(int [][] m)
+    {
-        int h=m.length;
-        double [][] res = null;
-        if (h > 0)
+        {
-          int w = m[0].length;
-          res = new double[h][w];
-          for(int c=0;c<h;c++)
-            for(int d = 0; d<w; d++)
-              res[c][d]=(double)m[c][d];
+        }
-        return res;
+   }
+}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Distance.java

r8189	r8281
5	5	public abstract class Distance
6	6	{
7		public abstract ~~double get(double[] v1, double[] v2, int size~~);
	7	public abstract float get(float[] v1, float[] v2);
8	8	}
9	9

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/EuclideanDistance.java

r8189	r8281
3	3	public class EuclideanDistance extends Distance
4	4	{
5		public ~~double get(double[] v1, double[] v2, int size~~)
	5	public float get(float[] v1, float[] v2)
6	6	{
7		double sum = 0, x;
8		//System.out.println("Array length "+ v1.length);
9		for(int c=0;c<v1.length;c++)
10		{
11		x = v1[c] - v2[c];
	7	float sum = 0;
	8
	9	for(int c=0;c<v1.length;c++){
	10	float x = v1[c] - v2[c];
12	11	sum += x*x;
13	12	}
14		return Math.sqrt(sum);
	13	return (float)Math.sqrt(sum);
15	14	}
16	15	}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Hierarchical.java

-              r8189
+              r8281
 public class Hierarchical
+{
+    public double[] minrow;
+    public int[]    colind;
+    //public SimMatrix smatrix;
+    public TriangleIndex triangle;
+    public Hierarchical(int matrix_rows)
+    {
+        // create Triangle with "limit" arg = maximum #clusters
+        // an object to help with indexing the cells
+        // in a triangular matrix
+        triangle = new TriangleIndex(matrix_rows);
+    }
+    public float[] minrow;
+    public int[] colind;
+    public TriangleIndex triangle;
+    public Hierarchical(int matrix_rows){
+    // Build the TriangleIndex helper from matrix_rows (the original note says
+    // this is the clustering "limit"); the methods of this class use it to
+    // address cells of the triangular matrix (index(), i_ind, j_ind).
+    triangle = new TriangleIndex(matrix_rows);
+    }
     // set each minrow[i] to the value of the minimum element of that row
     void set_minrow (SimMatrix m, int row)
+    {
         // triangular matrix without diagonal, hence zeroth row is all zero
+    void set_minrow (SimMatrix m, int row)
+    {
+    // triangular matrix without diagonal, hence zeroth row is all zero
         if(row == 0) return;
         int i;
         int index_start, index_end;
+    int i;
+    int index_start, index_end;
         // first column
+        index_start = triangle.index (row, 0);
+        // last column (row(!)-1 because as many rows as columns)
+        index_end   = triangle.index (row, row - 1);
+        minrow[row] = m.matrix[index_start];
+        colind[row] = index_start;
+        for (i = index_start + 1; i <= index_end; i++)
+        {
+            if (m.matrix[i] < minrow[row])
+            {
+                minrow[row] = m.matrix[i];
+                colind[row] = i;
+            }
+        }
+    index_start = triangle.index (row, 0);
+    // last column (row(!)-1 because as many rows as columns)
+    index_end   = triangle.index (row, row - 1);
+    minrow[row] = m.matrix[index_start];
+    colind[row] = index_start;
+    for (i = index_start + 1; i <= index_end; i++){
+        if (m.matrix[i] < minrow[row]){
+        minrow[row] = m.matrix[i];
+        colind[row] = i;
+        }
+    }
+    }
     // return the ROW in which the minimum dissimilarity resides
+    int get_minrow (int last_row)
+    {
+        int i;
+        int row;
+        double value;
+        row   = 1;
+        value = minrow[1];
+        for (i = 2; i <= last_row; i++)
+        {
+            if (minrow[i] < value)
+            {
+                row = i;
+                value = minrow[i];
+            }
+    int get_minrow (int last_row){
+    int i;
+    int row;
+    float value;
+    row   = 1;
+    value = minrow[1];
+    for (i = 2; i <= last_row; i++){
+        if (minrow[i] < value){
+        row = i;
+        value = minrow[i];
+        }
+        }
+        return row;
+    return row;
+    }
+    // num_of_rows is "limit" = max docs to be clustered in hierarchical clustering
+    Cluster clustering (SimMatrix sim_matrix, int num_of_rows, double[] heights, LinkFunc linkage_func)
+    {
+    Cluster child_1;
+    Cluster child_2;
+    Cluster parent=null;
+    ClusterArray cluster_ary;
+        int min_row, index, row, col, i;
+    double sim;
+        int last_row_index, sim_matrix_size;
+        sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2;
+    // Place each of the documents in a cluster of each own
+    // there are vector.length docs but only num_of_rows = limit places in the array
+    // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest
+    cluster_ary = new ClusterArray (num_of_rows);
+    last_row_index = num_of_rows - 1;
+    // minimum distance for each row of the sim_matrix
+    minrow = new float[num_of_rows];
+    // column index for minimum distance entry
+    colind = new int[num_of_rows];
+    for (i = 1; i < num_of_rows; i++) set_minrow(sim_matrix, i);
+    // do all the hierarchical clustering by running over all rows, updating the matrix
+    // as a new cluster is formed each time
+    for (i = 0; i < num_of_rows - 1; i++){
+        // find the row with the lowest disimilarity
+        min_row = get_minrow(last_row_index);
+        sim = minrow[min_row];
+        System.out.println("Min sim: " + sim);
+        // the column with the lowest distance in that row
+        index  = colind[min_row];
+        // array of heights, DESCENDING disim, i.e. sim goes last
+        heights[num_of_rows - 2 - i] = sim;
+        row = triangle.i_ind[index];
+        col = triangle.j_ind[index];
+        // merges rows and cols to create a smaller sim_matrix
+        merge (row, col, sim_matrix, last_row_index, linkage_func);
+        System.out.println("Merging: " + row + " and " + col);
+        sim_matrix_size -= last_row_index;
+        child_1 = (Cluster)cluster_ary.contents.elementAt(row);
+        child_2 = (Cluster)cluster_ary.contents.elementAt(col);
+        // parents get id = -1 to indicate that they are not representing single docs
+        // all leaves have an id (1 - num_rows) that points to the doc they contain
+        int sum = child_1.items + child_2.items;
+        System.out.println("Create new parent with " + child_1.items + " + " + child_2.items + " leaves");
+        parent = new Cluster (-1, null, child_1, child_2, sim, child_1.items + child_2.items);
+        child_1.parent = parent;
+        child_2.parent = parent;
+        cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row);
+        cluster_ary.contents.setElementAt(parent,col);
+        last_row_index--;
+    }
+    // num_of_rows is "limit" = max #clusters
+    Cluster clustering (SimMatrix sim_matrix, int num_of_rows,
+                            double[] heights, LinkFunc linkage_func)
+    {
+        Cluster child_1;
+        Cluster child_2;
+        Cluster parent=null;
+        ClusterArray cluster_ary;
+        int min_row, index, row, col, i;
+        double sim;
+        int last_row_index, sim_matrix_size;
+        sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2;
+        // Place each of the documents in a cluster of each own
+        // there are vector.length docs but only num_of_rows = limit places in the array
+        // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest
+        cluster_ary = new ClusterArray (num_of_rows);
+        last_row_index = num_of_rows - 1;
+        // a variable that is a double and exists for each row - what can this be?
+        // the lowest dissimilarity score?
+        minrow = new double[num_of_rows];
+        // a variable that is an int and exists for each row - what can that be?
+        // the document index of the document with that lowest score?
+        colind = new int[num_of_rows];
+        for (i = 1; i < num_of_rows; i++)
+        {
+            set_minrow(sim_matrix, i);
+        }
+        // do all the hierarchical clustering by running over all rows, updating the matrix
+        // as a new cluster is formed each time
+        for (i = 0; i < num_of_rows - 1; i++)
+        {
+            // find the row with the lowest disimilarity
+            min_row = get_minrow(last_row_index);
+            sim = minrow[min_row];
+            // the column with the lowest dis... (sim) in that row
+            index  = colind[min_row];
+            // array of heights, DESCENDING disim, i.e. sim goes last
+                heights[num_of_rows - 2 - i] = sim;
+                row = triangle.i_ind[index];
+                col = triangle.j_ind[index];
+                merge (row, col, sim_matrix, last_row_index, linkage_func);
+                sim_matrix_size -= last_row_index;
+                child_1 = (Cluster)cluster_ary.contents.elementAt(row);
+                child_2 = (Cluster)cluster_ary.contents.elementAt(col);
+                // parents get id = -1 to indicate that they are not representing single docs
+                // all leaves have an id (1 - num_rows) that points to the doc they contain
+                parent = new Cluster (-1, null, child_1, child_2, sim, child_1._items + child_2._items);
+                child_1._parent = parent;
+                child_2._parent = parent;
+                cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row);
+                cluster_ary.contents.setElementAt(parent,col);
+                last_row_index--;
+            }
+        return parent;
+        }
+    return parent;
+    }
     // form a new cluster from the two most similar documents
     void merge (int i, int j, SimMatrix m, int n, LinkFunc linkage_func)
+    {
+  /* i and j are the matrix elements with maximum similarity         */
+  /* matrix is the triangle similarity matrix with last row index n  */
+  /* linkage_func is a pointer to the linkage function               */
+  /* i > j */
+        int c, k, index1, index2;
+    {
+    /* i and j are the matrix elements with maximum similarity         */
+    /* matrix is the triangle similarity matrix with last row index n  */
+    /* linkage_func is a pointer to the linkage function               */
+    /* i > j */
+    int c, k, index1, index2;
         // go over all rows
+        for (k = 0; k <= n; k++)
+        {
+    for (k = 0; k <= n; k++){
             // if k < j the affected elements are on row j
+            if (k < j)
+            {
+                index1 = triangle.index (k, j);
+                index2 = triangle.index (k, i);
+                m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+      //printf ("1 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+        }
+            else if (k > j && k < i)
+        {
+      /* if j < k < i then only one element is updated per row */
+            index1 = triangle.index (k, j);
+            index2 = triangle.index (k, i);
+            m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+      //printf ("2 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, m.matrix[index2]);
+            if (index1 == colind[k])
+                {
+          /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+            {
+                minrow[k] = m.matrix[index1];
+                colind[k] = index1;
+            }
+                }
+        }
+            else if (k > i && k < n)
+        {
+            index1 = triangle.index (k, j);
+            index2 = triangle.index (k, i);
+            m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+      //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+            if (index1 == colind[k])
+                {
+          /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+            {
+                minrow[k] = m.matrix[index1];
+                            colind[k] = index1;
+            }
+                }
+            index1 = triangle.index (k, i);
+            index2 = triangle.index (k, n);
+            m.matrix[index1] = m.matrix[index2];
+      //printf (" copy [%d, %d] to [%d, %d] = %f\n", k, n, k, i, matrix[index2]);
+            if (index1 == colind[k])
+                {
+          /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+            {
+                minrow[k] = m.matrix[index1];
+                colind[k] = index1;
+            }
+                }
+        }
+                else if (k == n  && i != n)
+        {
+            index1 = triangle.index (k, j);
+            index2 = triangle.index (k, i);
+            m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+            //printf ("5 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+            if (index1 == colind[k])
+                {
+                /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+                set_minrow(m, k);
+                }
+            else
+                {
+          /* if the changed element is smaller than the current minimum update minrow and colind */
+                if (m.matrix[index1] < minrow[k])
+                {
+                    minrow[k] = m.matrix[index1];
+                    colind[k] = index1;
+                }
+                }
+            for (c = 0; c < i; c++)
+                {
+                index1 = triangle.index (c, i);
+                index2 = triangle.index (c, n);
+                m.matrix[index1] = m.matrix[index2];
+                //printf ("copy [%d, %d] to [%d, %d] = %f\n", c, n, c, i, matrix[index2]);
+                }
+        }
+        if (k < j){
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        }else if (k > j && k < i){
+        /* if j < k < i then only one element is updated per row */
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        }else if (k > i && k < n){
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        //printf ("3 - [%d, %d] = link ([%d, %d], [%d, %d]) = %f\n", k, j, k, i, k, j, matrix[index2]);
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        index1 = triangle.index (k, i);
+        index2 = triangle.index (k, n);
+        m.matrix[index1] = m.matrix[index2];
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        }else if(k == n  && i != n){
+        index1 = triangle.index (k, j);
+        index2 = triangle.index (k, i);
+        m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
+        if (index1 == colind[k]){
+            /* the previous minimum of the row has been changed so recaculate minimum of whole row */
+            set_minrow(m, k);
+        }else{
+            /* if the changed element is smaller than the current minimum update minrow and colind */
+            if (m.matrix[index1] < minrow[k]){
+            minrow[k] = m.matrix[index1];
+            colind[k] = index1;
+            }
+        }
+        for (c = 0; c < i; c++){
+            index1 = triangle.index (c, i);
+            index2 = triangle.index (c, n);
+            m.matrix[index1] = m.matrix[index2];
+        }
+        }
+        }
+        set_minrow(m, i);
+        set_minrow(m, j);
+    }
+    set_minrow(m, i);
+    set_minrow(m, j);
+    }
+}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Iterative.java

-              r8189
+              r8281
+{
     Vector vector;
     int VECTOR_SIZE;
+    int wordCount;
     Iterative(int vsize)
+    {
         VECTOR_SIZE = vsize;
+    wordCount = vsize;
+    }
     public void printVector(double[] a)
+    {
         int i;
         for (i = 0; i < VECTOR_SIZE; i++)
+    for (i = 0; i < wordCount; i++)
             System.out.println("v[" + i + "] = " + a[i]);
+    }
+    void cluster_center (Cluster cluster, double [][] vectors, double[] centroid)
+    void cluster_center (Cluster cluster, float [][] vectors, float[] centroid) // adds into 'centroid' the vector of every leaf reachable from 'cluster'
+    {
+        int i;
+        if (cluster.id == -1){ // id -1: no document of its own here, so descend into both children
+            cluster_center (cluster.child_1, vectors, centroid);
+            cluster_center (cluster.child_2, vectors, centroid);
+        }else{
+            float[] doubleArray = vectors[cluster.id]; // any other id is used as the row index into 'vectors' (despite the name, this is a float[])
+            for (i = 0; i < doubleArray.length; i++)
+        centroid[i] += doubleArray[i]; // running sum, not an average: uses +=, so the caller chooses the starting values
+        }
+    }
+    void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, float [][] vectors) // (re)computes clust.centroid for the first num_of_clusters entries of clusters.contents
+    {
+    int i, j; double ssum;
+    Cluster clust;
+    /* for each of the first num_of_clusters clusters: sum its leaf vectors, then scale the sum to unit length */
+    for (i = 0; i < num_of_clusters; i++){
+        clust = (Cluster)clusters.contents.elementAt(i);
+        clust.centroid = new float[wordCount]; // fresh zero-filled array, replacing any previous centroid
+        cluster_center (clust, vectors, clust.centroid); // accumulates the leaf vectors into the new array
+        for (ssum = j = 0; j < wordCount; j++) // ssum = sum of squared components
+        ssum += clust.centroid[j] * clust.centroid[j];
+        if(ssum>0)                // normalise only when the sum is non-zero (avoids dividing by zero)
+        for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < wordCount; j++) // ssum is reused here as the scale factor 1/sqrt(sum of squares)
+            clust.centroid[j] *= ssum;
+    }
+    }
+    void place_item (ClusterArray clusters, int num_of_clusters, float [][] vectors, int item_id,  Distance sim_func) // records item_id in the cluster whose centroid gives the smallest sim_func.get value
+    {
+        Cluster clust = (Cluster)clusters.contents.elementAt(0); // cluster 0 is the initial best candidate
+    double minDist = sim_func.get(clust.centroid,vectors[item_id]);
+    int x_minDist = 0; // index of the best cluster found so far
+    for (int i = 1; i < num_of_clusters; i++){
+            clust = (Cluster)clusters.contents.elementAt(i);
+            double dist = sim_func.get(clust.centroid,vectors[item_id]);
+            if (dist < minDist){ // strict '<': on a tie the lower-indexed cluster is kept
+        x_minDist = i;
+        minDist = dist;
+        }
+        }
+    //System.out.println("in " + x_minDist);
+    clust = (Cluster)clusters.contents.elementAt(x_minDist);
+        clust.iter_items.addElement(new Integer(item_id)); // the item is stored as a boxed Integer id; the centroid is not modified here
+    clust.iter_items_num++; // counter kept alongside the iter_items vector
+    }
+    // add all documents not yet added to the existing cluster hierarchy (which is based on the
+    // first "limit" documents). Hence there are vector.length - limit docs to add;
+    // consequently the index runs from limit (start_index) up to, but excluding, vector.length (end_index)
+    void clustering (ClusterArray clusters, int[] num_of_clusters, float[][] vectors, int start_index, int end_index,Distance sim_func)
+    {
         int i;
+        if (cluster._id == -1)
+        {
+            cluster_center (cluster._child_1, vectors, centroid);
             cluster_center (cluster._child_2, vectors, centroid);
+        }
+        else
+        {
+            double[] doubleArray = vectors[cluster._id];
+            for (i = 0; i < doubleArray.length; i++)
                 centroid[i] += doubleArray[i];
+        }
+    System.out.println("Start: " + start_index + " Finish: " + end_index);
+    if (start_index >= end_index)
+        return;
+    compute_cluster_centers(clusters, num_of_clusters[0], vectors);
+    for (i = start_index; i < end_index; i++){
+        System.out.println("Placing document " + i);
+        place_item(clusters, num_of_clusters[0], vectors, i, sim_func);
+    }
+    }
-    void compute_cluster_centers (ClusterArray clusters, int num_of_clusters, double [][] vectors)
+    {
-        int i, j; double ssum;
-        Cluster clust;
-        for (i = 0; i < num_of_clusters; i++)
+        {
-            clust = (Cluster)clusters.contents.elementAt(i);
-            clust._centroid = new double[VECTOR_SIZE];
-            cluster_center (clust, vectors, clust._centroid);
-            for (ssum = j = 0; j < VECTOR_SIZE; j++)
-            ssum += clust._centroid[j] * clust._centroid[j];
-            if(ssum>0)                // normalise
-            for(ssum = 1.0/Math.sqrt(ssum), j = 0; j < VECTOR_SIZE; j++)
-            clust._centroid[j] *= ssum;
+        }
+    }
-    void place_item (ClusterArray clusters,
-                int num_of_clusters,
-                double [][] vectors,
-                int item_id,  Distance sim_func)
+    {
-        double max_sim, sim;
-        int max_sim_index;
-        int i;
-        max_sim_index = 0;
-        Cluster clust = (Cluster)clusters.contents.elementAt(0);
-        max_sim = sim_func.get(clust._centroid,
-              vectors[item_id], VECTOR_SIZE);
-        for (i = 1; i < num_of_clusters; i++)
+        {
-            clust = (Cluster)clusters.contents.elementAt(i);
-            sim = sim_func.get(clust._centroid,
-            vectors[item_id], VECTOR_SIZE);
-            if (sim < max_sim)
+            {
-                max_sim_index = i;
-                max_sim = sim;
+            }
+        }
-        clust = (Cluster)clusters.contents.elementAt(max_sim_index);
-        clust._iter_items.addElement(new TheInt(item_id));
-        clust._iter_items_num++;
+    }
-    // add all documents not yet added to the existing cluster hierarchy (which is based on the
-    // limit first documents. Hence there are vector.length - limit docs to add
-    // consequently index runs from limit (start_index) to vector.length (end_index)
-    void clustering (ClusterArray clusters, TheInt num_of_clusters,
-                      double[][] vectors, int start_index, int end_index,
-                      Distance sim_func)
+    {
-        int i;
-        if (start_index >= end_index)
-                return;
-        compute_cluster_centers(clusters, num_of_clusters.val, vectors);
-        for (i = start_index; i < end_index; i++)
-                place_item(clusters, num_of_clusters.val, vectors, i, sim_func);
+    }
+}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/LinkFunc.java

r8189	r8281
5	5	public abstract class LinkFunc
6	6	{
7		abstract ~~double link(double a, double~~ b);
	7	abstract float link(float a, float b);
8	8	}

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/MaxLink.java

r8189	r8281
3	3	public class MaxLink extends LinkFunc
4	4	{
5		public ~~double link(double a, double~~ b)
	5	public float link(float a, float b)
6	6	{
7	7	return (a > b? a*99+b: b*99+a)/100;

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/MinLink.java

r8189	r8281
3	3	public class MinLink extends LinkFunc
4	4	{
5		public ~~double link(double a, double~~ b)
	5	public float link(float a, float b)
6	6	{
7	7	return (a < b? a*999+b: b*999+a)/1000;

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/SimMatrix.java

-              r8189
+              r8281
 import java.util.*;
-// kainof DistanceMatrix
 public class SimMatrix
+{
+  public double[] matrix;
+    public float[] matrix;
+  // "vectors" is the expanded SparseMatrix, i.e. doc x keywords
+  // VECTOR_SIZE = #keywords
+  public SimMatrix(double[][] vectors,
+      int limit, Distance simfunc, int VECTOR_SIZE)
+  {
+    int i,j;
+    double iarray[];
+    // size = #docs
+    int size=vectors.length, index=0;
+    // to exclude diagonal and conjugates
+    // - - - - -
+    //         |
+    // *       |
+    // * *     |
+    // * * *   |
+    int realSize = (size * (size - 1)/2);
+    matrix=new double[realSize];
+    System.out.println("Similarity matrix size: " + size);
+    for(i = 1; i < size; i++)
+    public SimMatrix(float[][] KDMatrix, int limit, Distance simfunc){ // fills 'matrix' with simfunc.get(row i, row j) for every pair j < i
+    float iarray[];
+    int hitDocs = KDMatrix.length; // number of rows; each row is compared with every earlier row
+    int wordCount = KDMatrix[0].length; // NOTE(review): not used below, and KDMatrix[0] throws if KDMatrix has no rows
+    int index = 0; // next free slot in 'matrix'
+    // only the strict lower triangle is stored (no diagonal, no mirrored duplicates):
+    // - - - - -
+    //         |
+    // *       |
+    // * *     |
+    // * * *   |
+    int realSize = (hitDocs * (hitDocs - 1)/2);
+    matrix = new float[realSize];
+    // NOTE(review): 'limit' is never used in this constructor - all rows of KDMatrix are
+    // compared, not only the first 'limit' (the original author also asked: mistake?)
+    for(int i = 1; i < hitDocs; i++){
+        iarray = KDMatrix[i];
+        for(int j = 0; j < i; j++){
+        matrix[index] = simfunc.get(iarray,KDMatrix[j]); // pair (i, j) lands at index i*(i-1)/2 + j
+            index++;
+        }
+    }
+    }
+    public void printMatrix()
+    {
+        iarray=vectors[i];
+        for(j = 0; j < i; j++)
+        {
+            matrix[index] = simfunc.get(iarray,
+                            vectors[j],
+                            VECTOR_SIZE);
+            index++;
+        }
+    int countA = 0;
+    int countB = 0;
+    for( int i = 0; i < matrix.length; i++ ){
+        System.out.print(matrix[i] + " ");
+        if( countA >= countB ){
+        System.out.println();
+        countB++;
+        countA = 0;
+        }
+        else countA++;
+    }
+    }
+  }
+}

Context Navigation

Legend:

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/AveLink.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Cluster.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/ClusterArray.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Clustering.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Distance.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/EuclideanDistance.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Hierarchical.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/Iterative.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/LinkFunc.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/MaxLink.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/MinLink.java

trunk/gsdl3/extensions/vishnu/src/vishnu/cluster/SimMatrix.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/AveLink.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Cluster.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/ClusterArray.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Clustering.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Distance.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/EuclideanDistance.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Hierarchical.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/Iterative.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/LinkFunc.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/MaxLink.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/MinLink.java

trunk/gsdl3/packages/vishnu/src/vishnu/cluster/SimMatrix.java

Download in other formats: