source: trunk/greenstone3-extensions/vishnu/src/vishnu/testvis/dendro/Hierarchical.java@ 8189

Last change on this file since 8189 was 8189, checked in by kjdon, 20 years ago

first version of Imperial College's Visualiser code

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
Line 
1package vishnu.testvis.dendro;
2
3import vishnu.cluster.*;
4import java.util.*;
5import vishnu.testvis.object.*;
6
7public class Hierarchical
8{
9 public double[] minrow;
10 public int[] colind;
11 //public SimMatrix smatrix;
12 public TriangleIndex triangle;
13 public final double weight = 3.2;
14 public DataManager dataManager;
15
16 public Hierarchical(int matrix_rows)
17 {
18 // create Triangle with "limit" arg = maximum #clusters
19 // an object to help with indexing the cells
20 // in a triangular matrix
21 triangle = new TriangleIndex(matrix_rows);
22
23 }
24
25 // set each minrow[i] to the value of the minimum element of that row
26 void set_minrow (SimMatrix m, int row)
27 {
28 // triangular matrix without diagonal, hence zeroth row is all zero
29 if(row == 0) return;
30
31 int i;
32 int index_start, index_end;
33
34 // first column
35 index_start = triangle.index (row, 0);
36 // last column (row(!)-1 because as many rows as columns)
37 index_end = triangle.index (row, row - 1);
38
39 minrow[row] = m.matrix[index_start];
40 colind[row] = index_start;
41
42 for (i = index_start + 1; i <= index_end; i++)
43 {
44 if (m.matrix[i] < minrow[row])
45 {
46 minrow[row] = m.matrix[i];
47 colind[row] = i;
48 }
49 }
50 }
51
52 // return the ROW in which the minimum disimilarity resides
53 int get_minrow (int last_row)
54 {
55 int i;
56 int row;
57 double value;
58
59 row = 1;
60 value = minrow[1];
61
62 for (i = 2; i <= last_row; i++)
63 {
64 if (minrow[i] < value)
65 {
66 row = i;
67 value = minrow[i];
68 }
69 }
70
71 return row;
72 }
73
74 // num_of_rows is number of documents, or subsample for Buckshot
75 // indices is a vector of the doc ids (made necessary by Buckshot, which uses a subsample of all docs)
76 Cluster clustering (DataManager dm, SimMatrix sim_matrix, int num_of_rows,
77 double[] heights, LinkFunc linkage_func, Vector indices)
78 {
79 Cluster child_1;
80 Cluster child_2;
81 Cluster parent=null;
82 ClusterArray cluster_ary;
83
84
85 int min_row, index, row, col, i;
86 double sim;
87
88 int last_row_index, sim_matrix_size;
89
90 sim_matrix_size = num_of_rows *(num_of_rows - 1) / 2;
91
92 // Place each of the documents in a cluster of each own
93 // there are vector.length docs but only num_of_rows = limit places in the array
94 // may be two-stage clustering process, i.e. cluster subset of docs and then add the rest
95 // cluster_ary = new ClusterArray (dm, num_of_rows);
96
97 cluster_ary = new ClusterArray(dm,indices);
98 last_row_index = num_of_rows - 1;
99
100 // a variable that is a double and exists for each row - what can this be?
101 // the lowest dissimilarity score?
102 minrow = new double[num_of_rows];
103
104 // a variable that is an int and exists for each row - what can that be?
105 // the document index of the document with that lowest score?
106 colind = new int[num_of_rows];
107
108 for (i = 1; i < num_of_rows; i++)
109 {
110 set_minrow(sim_matrix, i);
111 }
112
113 // do all the hierarchical clustering by running over all rows, updating the matrix
114 // as a new cluster is formed each time
115 for (i = 0; i < num_of_rows - 1; i++)
116 {
117 // find the row with the lowest disimilarity
118 min_row = get_minrow(last_row_index);
119 sim = minrow[min_row];
120
121 // the column with the lowest dis... (sim) in that row
122 index = colind[min_row];
123
124 // array of heights, DESCENDING disim, i.e. sim goes last
125 heights[num_of_rows - 2 - i] = sim;
126
127 row = triangle.i_ind[index];
128 col = triangle.j_ind[index];
129
130 newMerge(row,col,sim_matrix,last_row_index,linkage_func,cluster_ary);
131
132 sim_matrix_size -= last_row_index;
133
134 child_1 = (Cluster)cluster_ary.contents.elementAt(row);
135 child_2 = (Cluster)cluster_ary.contents.elementAt(col);
136
137 parent = new Cluster (-1, null, child_1, child_2, sim, child_1._items + child_2._items);
138 Vector vDoc = new Vector();
139 vDoc.addAll(child_1.getDocVect());
140 vDoc.addAll(child_2.getDocVect());
141 parent.setDocVect(vDoc);
142
143 Vector vDocObj = new Vector();
144 vDocObj.addAll(child_1.getDocObjVect());
145 vDocObj.addAll(child_2.getDocObjVect());
146 parent.setDocObjVect(vDocObj);
147
148 int[] wordFreq1 = child_1.getWordFreq();
149 int[] wordFreq2 = child_2.getWordFreq();
150 for( int z = 0; z < wordFreq1.length; z++ )
151 {
152 wordFreq1[z] += wordFreq2[z];
153 }
154 parent.setWordFreq(wordFreq1);
155
156 child_1._parent = parent;
157 child_2._parent = parent;
158 cluster_ary.contents.setElementAt(cluster_ary.contents.elementAt(last_row_index),row);
159 cluster_ary.contents.setElementAt(parent,col);
160 last_row_index--;
161
162 }
163
164 return parent;
165
166 }
167
168 void newMerge(int r, int c, SimMatrix m, int lastRowIndex, LinkFunc linkage_func, ClusterArray clusterArr)
169 {
170 int n = lastRowIndex+1;
171 double newMatrix[][] = new double[n][n];
172 for( int i = 0; i < n; i++ )
173 {
174 for( int j = 0; j < n; j++ )
175 {
176 if( i != j ) newMatrix[i][j] = (double)((int)(m.matrix[triangle.index(i,j)]*100))/100;
177 else newMatrix[i][j] = 0;
178 }
179 }
180
181 for( int i = 0; i < n; i++ )
182 {
183 int sizeA =((Cluster)clusterArr.contents.elementAt(c))._items + ((Cluster)clusterArr.contents.elementAt(r))._items;
184 int sizeB = ((Cluster)clusterArr.contents.elementAt(i))._items;
185 // large if cluster differ in size
186 double scale = weight * (double)((sizeA+sizeB)/2*(sizeA+sizeB)/2)/(sizeA*sizeA+sizeB*sizeB);
187
188 newMatrix[c][i] = scale*linkage_func.link(newMatrix[c][i],newMatrix[r][i]);
189
190 }
191
192 newMatrix[c][c] = 0;
193 newMatrix[c][r] = newMatrix[c][n-1];
194 for( int i = 0; i < n; i++ )
195 {
196 newMatrix[i][c] = newMatrix[c][i];
197 }
198
199
200 for( int i = 0; i < n; i++ )
201 {
202 if( i != c ) newMatrix[r][i] = newMatrix[n-1][i];
203 }
204 newMatrix[r][r] = 0;
205 for( int i = 0; i < n; i++ )
206 {
207 newMatrix[i][r] = newMatrix[r][i];
208 }
209
210 int index = 0;
211 for( int i = 0; i < n; i++ )
212 {
213 for( int j = 0; j < n; j++ )
214 {
215 if( j < i )
216 {
217 m.matrix[index] = newMatrix[i][j];
218 index++;
219 }
220
221 }
222
223 }
224
225 for (int i = 1; i < n; i++)
226 {
227 set_minrow(m, i);
228 }
229
230
231 }
232
233 // form a new cluster from the two most similar documents
234 void merge (int i, int j, SimMatrix m, int n, LinkFunc linkage_func)
235 {
236 /* i and j are the matrix elements with maximum similarity */
237 /* matrix is the triangle similarity matrix with last row index n */
238 /* linkage_func is a pointer to the linkage function */
239
240 /* i > j */
241
242 int c, k, index1, index2;
243
244 // go over all rows
245 for (k = 0; k <= n; k++)
246 {
247 // if k < j the affected elements are on row j
248 if (k < j)
249 {
250 // index1 is the smaller of the two, as j < i
251 index1 = triangle.index (k, j);
252 index2 = triangle.index (k, i);
253 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
254 }
255 else if (k > j && k < i)
256 {
257 // if j < k < i then only one element is updated per row
258 index1 = triangle.index (k, j);
259 index2 = triangle.index (k, i);
260 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
261 if (index1 == colind[k])
262 {
263 // the previous minimum of the row has been changed so recaculate minimum of whole row
264 set_minrow(m, k);
265 }
266 else
267 {
268 // if the changed element is smaller than the current minimum update minrow and colind
269 if (m.matrix[index1] < minrow[k])
270 {
271 minrow[k] = m.matrix[index1];
272 colind[k] = index1;
273 }
274 }
275 }
276 else if (k > i && k < n)
277 {
278 index1 = triangle.index (k, j);
279 index2 = triangle.index (k, i);
280 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
281
282 if (index1 == colind[k])
283 {
284 // the previous minimum of the row has been changed so recaculate minimum of whole row */
285 set_minrow(m, k);
286 }
287 else
288 {
289 // if the changed element is smaller than the current minimum update minrow and colind */
290 if (m.matrix[index1] < minrow[k])
291 {
292 minrow[k] = m.matrix[index1];
293 colind[k] = index1;
294 }
295 }
296
297 index1 = triangle.index (k, i);
298 index2 = triangle.index (k, n);
299 m.matrix[index1] = m.matrix[index2];
300 if (index1 == colind[k])
301 {
302 // the previous minimum of the row has been changed so recaculate minimum of whole row
303 set_minrow(m, k);
304 }
305 else
306 {
307 // if the changed element is smaller than the current minimum update minrow and colind */
308 if (m.matrix[index1] < minrow[k])
309 {
310 minrow[k] = m.matrix[index1];
311 colind[k] = index1;
312 }
313 }
314 }
315 else if (k == n && i != n)
316 {
317 index1 = triangle.index (k, j);
318 index2 = triangle.index (k, i);
319 m.matrix[index1] = linkage_func.link(m.matrix[index1], m.matrix[index2]);
320 if (index1 == colind[k])
321 {
322 // the previous minimum of the row has been changed so recaculate minimum of whole row */
323 set_minrow(m, k);
324 }
325 else
326 {
327 // if the changed element is smaller than the current minimum update minrow and colind */
328 if (m.matrix[index1] < minrow[k])
329 {
330 minrow[k] = m.matrix[index1];
331 colind[k] = index1;
332 }
333 }
334
335 for (c = 0; c < i; c++)
336 {
337 index1 = triangle.index (c, i);
338 index2 = triangle.index (c, n);
339 m.matrix[index1] = m.matrix[index2];
340 }
341 }
342
343 }
344 set_minrow(m, i);
345 set_minrow(m, j);
346 }
347
348}
349
350
351
352
353
354
355
356
Note: See TracBrowser for help on using the repository browser.