source: gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBWrapper.java@ 36857

Last change on this file since 36857 was 36857, checked in by davidb, 19 months ago

Service now showing some basic functionality to retrieve songs

File size: 10.7 KB
Line 
1/*
2 * WekaDBWrapper.java
3 * Copyright (C) 2011 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.util;
20
21import java.io.*;
22import java.util.Vector;
23import java.util.Collections;
24import java.util.regex.Pattern;
25import java.util.regex.Matcher;
26
27import weka.core.Attribute;
28import weka.core.Instance;
29import weka.core.Instances;
30
31import org.apache.log4j.*;
32
33import org.greenstone.gsdl3.util.WekaFindInstanceKNN;
34
35/** Java wrapper class for access to the Weka
36 * Devised (in the first instance) to operate as: java -jar weka.jar <arg1> <arg2>
37 *
38 * Inspired by MGSearchWrapper.java
39 */
40
41public class WekaDBWrapper
42{
43 /** the query result, filled in by runQuery */
44 protected Vector query_result_;
45
46 protected int offset_ = 100;
47 protected int length_ = 20;
48
49 // Approximate matching not yet utilized
50 protected double radius_;
51
52 protected int max_docs_;
53
54 static Logger logger = Logger.getLogger (org.greenstone.gsdl3.util.WekaDBWrapper.class.getName ());
55
56 public WekaDBWrapper() {
57 query_result_ = null;
58 }
59
60 // query param methods
61
62 /** start point (offset) into the array of feature vectors for a track
63 - 100 by default which equals 10 seconds (assuming 0.1 frame size) */
64 public void setOffset(int offset) {
65 offset_ = offset;
66 }
67
68 /** the number of consecutive frames used in match
69 - 20 by default which equals 2 seconds (assuming 0.1 frame size) */
70 public void setLength(int length) {
71 length_ = length;
72 }
73
74 /** distance used in approximate matching support - default is 50 */
75 public void setRadius(double radius) {
76 radius_ = radius;
77 }
78
79 public void setMaxDocs(int max_docs) {
80 max_docs_ = max_docs;
81 }
82
83 /** returns a string with all the current query param settings */
84 // the following was in MG version, do we need this in WekaDB version? // ****
85 //public String getQueryParams() {}
86
87
88 protected boolean addQueryResult(boolean first_entry, String doc_id,
89 Vector<Double> rankVector, Vector<Integer> offsetVector)
90 {
91
92 if (first_entry) {
93 WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rankVector,offsetVector);
94 query_result_.add(wekaDB_doc_info);
95 first_entry = false;
96 }
97 else {
98 double rank = rankVector.get(0);
99 int offset = offsetVector.get(0);
100 WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rank,offset);
101
102 query_result_.add(wekaDB_doc_info);
103 }
104
105 return first_entry;
106 }
107
108
109 /** actually carry out the query.
110 Use the set methods to set query results.
111 Writes the result to query_result.
112 * - maintains state between requests as can be slow
113 * base_dir and index_path should join together to provide
114 * the absolute location of the mg index files eg ..../index/dtx/demo
115 * base_dir must end with a file separator (OS dependant)
116 */
117
118
119 public void runQuery(String wekaDB_index_dir, String knn_model_file,
120 String assoc_index_dir, String query_string) {
121
122 // combine index_dir with audiodb fileanem
123
124 String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
125
126 //String full_chr12_filename = assoc_index_dir + File.separatorChar
127 // + query_string + File.separatorChar + "doc.chr12";
128
129 System.err.println("**** full knn model filename = " + full_knn_model_filename);
130
131 // Example returned result from Weka KNN
132 // => first line is the input instance ('filename+segment',Arousal,Valence)
133 // following (indented lines) nearest neighbour matches in same format
134 //
135 // ds_22716_5743-6,-0.549489,-0.118439
136 // ds_22716_5743-6,-0.549489,-0.118439
137 // ds_31008_6550-30,-0.549489,-0.118439
138 // ds_72651_26831-6,-0.549489,-0.118439
139 // ds_26196_9214-18,-0.549489,-0.118439
140
141
142 WekaFindInstanceKNN.init(full_knn_model_filename);
143
144 //Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours("ds_22716_5743",6,-0.549489,-0.118439,3);
145 String doc_id = query_string;
146
147 int segment = 6;
148 //String segment_str = Integer.toString(segment);
149 String doc_id_segment = doc_id +"-" + segment;
150
151 double arousal_val = -0.549489;
152 double valence_val = -0.118439;
153 int k_nearest_num = 3;
154
155 System.err.println("doc_id_segment = " + doc_id_segment);
156
157
158 Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours(doc_id_segment,arousal_val,valence_val,k_nearest_num);
159
160
161 query_result_ = new Vector();
162
163 /*
164 WekaDBDocInfo wekaDB_doc_info1 = new WekaDBDocInfo("ds_20415_2337",0.9,4);
165 query_result_.add(wekaDB_doc_info1);
166
167 WekaDBDocInfo wekaDB_doc_info2 = new WekaDBDocInfo("ds_51017_15513",0.87,1);
168 query_result_.add(wekaDB_doc_info2);
169
170 WekaDBDocInfo wekaDB_doc_info3 = new WekaDBDocInfo("ds_20415_2337",0.82,6);
171 query_result_.add(wekaDB_doc_info3);
172 */
173
174 int clamped_k_nearest_num = Math.min(k_nearest_num,nearest_instances.size());
175
176 for (int i=0; i<clamped_k_nearest_num; i++) {
177 Instance instance = nearest_instances.instance(i);
178 logger.info("\tProcessing returned instance: " + instance);
179
180 //Attribute doc_id_segment_att = instance.attribute(0);
181 //String matching_doc_id_segment = instance.attribute(0).value(0);
182 String matching_doc_id_segment = instance.stringValue(0);
183 //double arousal_val = instance.attribute(1);
184 //double valence_val = instance.attribute(2);
185
186
187 Pattern p = Pattern.compile("^(\\w+)-(\\d+)$");
188 Matcher m = p.matcher(matching_doc_id_segment);
189 if (m.matches()) {
190
191 String matching_doc_id = m.group(1);
192 int matching_segment_offset = Integer.parseInt(m.group(2));
193
194 double matching_rank = 0.9;
195
196 logger.info("\tAdding in: matching_doc_id = " + matching_doc_id);
197 WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,matching_rank,matching_segment_offset);
198 query_result_.add(wekaDB_doc_info);
199 }
200 else {
201 logger.error("Returned AV k-nearest neighbour match '"+doc_id_segment+"' could not be parsed as <doc-id>-<segment>" );
202 }
203 }
204
205 /*
206 int num_matches_within_track = 6;
207
208
209 first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
210 // and now reset vectors to empty to be ready for next chain of values
211 rankVector = new Vector<Double>();
212 offsetVector = new Vector<Integer>();
213
214
215 rankVector.add(rank);
216 offsetVector.add(target_frame);
217
218 addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
219 */
220
221 }
222
223 public void runQueryOLD(String wekaDB_index_dir, String knn_model_file,
224 String assoc_index_dir, String query_string) {
225
226 // combine index_dir with audiodb fileanem
227
228 String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
229 String full_chr12_filename = assoc_index_dir + File.separatorChar
230 + query_string + File.separatorChar + "doc.chr12";
231
232 int num_matches_within_track = 6;
233
234 // ****
235 String [] cmd_array = new String[] {
236 "java", "-jar", "weka.jar",
237 "-d", full_knn_model_filename,
238 "-Q", "nsequence",
239 "-p", String.format("%d",offset_),
240 "-n", String.format("%d",num_matches_within_track),
241 "-l", String.format("%d",length_),
242 "-r", String.format("%d",max_docs_),
243 "-f", full_chr12_filename
244 };
245
246 System.err.println("**** cmd_array = " + String.join(" ", cmd_array));
247
248 Runtime runtime = Runtime.getRuntime();
249 try {
250 Process wekaDB_proc = runtime.exec(cmd_array);
251 //int exitVal = wekaDB_proc.waitFor();
252 //System.err.println("*** exit status = " + exitVal);
253
254 InputStream wis = wekaDB_proc.getInputStream();
255 InputStreamReader wisr = new InputStreamReader(wis);
256 BufferedReader wbr = new BufferedReader(wisr);
257
258 query_result_ = new Vector();
259
260 boolean first_entry = true;
261 int line_count = 0;
262
263 String root_doc_id = null;
264 Vector<Double> rankVector = new Vector<Double>();
265 Vector<Integer> offsetVector = new Vector<Integer>();
266
267 // Example output
268 // D8 0.00105175
269 // 1.69786e-16 392 392
270 // 0.00113568 392 673
271 // 0.00127239 392 910
272 // 0.00139736 392 481
273 // 0.00145331 392 303
274 // D2 0.00429758
275 // 0.00403335 392 865
276 // 0.00411288 392 458
277 // 0.00442461 392 866
278 // 0.00444272 392 864
279 // 0.00447434 392 424
280 // ...
281
282 String line;
283 while ((line = wbr.readLine()) != null) {
284 String[] tokens = line.split("\\s+");
285 line_count++;
286
287 if (tokens.length==2) {
288 // processing a top-level doc line
289
290 if (line_count>1) {
291 // struck new top-level entry => store vector vals for previous block
292
293 first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
294 // and now reset vectors to empty to be ready for next chain of values
295 rankVector = new Vector<Double>();
296 offsetVector = new Vector<Integer>();
297 }
298
299 root_doc_id = tokens[0];
300 }
301 else {
302 // should be 3 items
303 double euclidean_dist = Double.parseDouble(tokens[0]);
304 int src_frame = Integer.parseInt(tokens[1]);
305 int target_frame = Integer.parseInt(tokens[2]);
306
307 // ****
308
309 // enforce 1.0 as upper limit due to rounding errors
310 // in audioDB distance calculations
311 double rank = Math.min(1.0 - euclidean_dist,1.0);
312
313 if ((line_count==2) && (src_frame==target_frame)) {
314 // Found match with self
315 continue;
316 }
317
318 rankVector.add(rank);
319 offsetVector.add(target_frame);
320 }
321
322 }
323
324 addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
325
326 wbr.close();
327
328 // sort query_result_ on 'rank' field
329 // note: compareTo() method impelemented to sort into descending order
330
331 Collections.sort(query_result_);
332
333
334 }
335 catch (Exception e) {
336 logger.error("Failed to execute the following command: " + String.join(" ", cmd_array));
337 e.printStackTrace();
338 }
339
340 }
341
342
343 /** get the result out of the wrapper */
344 public Vector getQueryResult()
345 {
346 return query_result_;
347 }
348}
349
Note: See TracBrowser for help on using the repository browser.