1 | /*
|
---|
2 | * WekaDBWrapper.java
|
---|
3 | * Copyright (C) 2011 New Zealand Digital Library, http://www.nzdl.org
|
---|
4 | *
|
---|
5 | * This program is free software; you can redistribute it and/or modify
|
---|
6 | * it under the terms of the GNU General Public License as published by
|
---|
7 | * the Free Software Foundation; either version 2 of the License, or
|
---|
8 | * (at your option) any later version.
|
---|
9 | *
|
---|
10 | * This program is distributed in the hope that it will be useful,
|
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | * GNU General Public License for more details.
|
---|
14 | *
|
---|
15 | * You should have received a copy of the GNU General Public License
|
---|
16 | * along with this program; if not, write to the Free Software
|
---|
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
18 | */
|
---|
19 | package org.greenstone.gsdl3.util;
|
---|
20 |
|
---|
21 | import java.io.*;
|
---|
22 | import java.util.Vector;
|
---|
23 | import java.util.Collections;
|
---|
24 | import java.util.regex.Pattern;
|
---|
25 | import java.util.regex.Matcher;
|
---|
26 |
|
---|
27 | import weka.core.Attribute;
|
---|
28 | import weka.core.Instance;
|
---|
29 | import weka.core.Instances;
|
---|
30 |
|
---|
31 | import org.apache.log4j.*;
|
---|
32 |
|
---|
33 | import org.greenstone.gsdl3.util.WekaFindInstanceKNN;
|
---|
34 |
|
---|
35 | /** Java wrapper class for access to the Weka
|
---|
36 | * Devised (in the first instance) to operate as: java -jar weka.jar <arg1> <arg2>
|
---|
37 | *
|
---|
38 | * Inspired by MGSearchWrapper.java
|
---|
39 | */
|
---|
40 |
|
---|
41 | public class WekaDBWrapper
|
---|
42 | {
|
---|
43 | /** the query result, filled in by runQuery */
|
---|
44 | protected Vector query_result_;
|
---|
45 |
|
---|
46 | protected int offset_ = 100;
|
---|
47 | protected int length_ = 20;
|
---|
48 |
|
---|
49 | // Approximate matching not yet utilized
|
---|
50 | protected double radius_;
|
---|
51 |
|
---|
52 | protected int max_docs_;
|
---|
53 |
|
---|
54 | static Logger logger = Logger.getLogger (org.greenstone.gsdl3.util.WekaDBWrapper.class.getName ());
|
---|
55 |
|
---|
56 | public WekaDBWrapper() {
|
---|
57 | query_result_ = null;
|
---|
58 | }
|
---|
59 |
|
---|
60 | // query param methods
|
---|
61 |
|
---|
62 | /** start point (offset) into the array of feature vectors for a track
|
---|
63 | - 100 by default which equals 10 seconds (assuming 0.1 frame size) */
|
---|
64 | public void setOffset(int offset) {
|
---|
65 | offset_ = offset;
|
---|
66 | }
|
---|
67 |
|
---|
68 | /** the number of consecutive frames used in match
|
---|
69 | - 20 by default which equals 2 seconds (assuming 0.1 frame size) */
|
---|
70 | public void setLength(int length) {
|
---|
71 | length_ = length;
|
---|
72 | }
|
---|
73 |
|
---|
74 | /** distance used in approximate matching support - default is 50 */
|
---|
75 | public void setRadius(double radius) {
|
---|
76 | radius_ = radius;
|
---|
77 | }
|
---|
78 |
|
---|
79 | public void setMaxDocs(int max_docs) {
|
---|
80 | max_docs_ = max_docs;
|
---|
81 | }
|
---|
82 |
|
---|
83 | /** returns a string with all the current query param settings */
|
---|
84 | // the following was in MG version, do we need this in WekaDB version? // ****
|
---|
85 | //public String getQueryParams() {}
|
---|
86 |
|
---|
87 |
|
---|
88 | protected boolean addQueryResult(boolean first_entry, String doc_id,
|
---|
89 | Vector<Double> rankVector, Vector<Integer> offsetVector)
|
---|
90 | {
|
---|
91 |
|
---|
92 | if (first_entry) {
|
---|
93 | WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rankVector,offsetVector);
|
---|
94 | query_result_.add(wekaDB_doc_info);
|
---|
95 | first_entry = false;
|
---|
96 | }
|
---|
97 | else {
|
---|
98 | double rank = rankVector.get(0);
|
---|
99 | int offset = offsetVector.get(0);
|
---|
100 | WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rank,offset);
|
---|
101 |
|
---|
102 | query_result_.add(wekaDB_doc_info);
|
---|
103 | }
|
---|
104 |
|
---|
105 | return first_entry;
|
---|
106 | }
|
---|
107 |
|
---|
108 |
|
---|
109 | /** actually carry out the query.
|
---|
110 | Use the set methods to set query results.
|
---|
111 | Writes the result to query_result.
|
---|
112 | * - maintains state between requests as can be slow
|
---|
113 | * base_dir and index_path should join together to provide
|
---|
114 | * the absolute location of the mg index files eg ..../index/dtx/demo
|
---|
115 | * base_dir must end with a file separator (OS dependant)
|
---|
116 | */
|
---|
117 |
|
---|
118 |
|
---|
119 | public void runQuery(String wekaDB_index_dir, String knn_model_file,
|
---|
120 | String assoc_index_dir, String query_string) {
|
---|
121 |
|
---|
122 | // combine index_dir with audiodb fileanem
|
---|
123 |
|
---|
124 | String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
|
---|
125 |
|
---|
126 | //String full_chr12_filename = assoc_index_dir + File.separatorChar
|
---|
127 | // + query_string + File.separatorChar + "doc.chr12";
|
---|
128 |
|
---|
129 | System.err.println("**** full knn model filename = " + full_knn_model_filename);
|
---|
130 |
|
---|
131 | // Example returned result from Weka KNN
|
---|
132 | // => first line is the input instance ('filename+segment',Arousal,Valence)
|
---|
133 | // following (indented lines) nearest neighbour matches in same format
|
---|
134 | //
|
---|
135 | // ds_22716_5743-6,-0.549489,-0.118439
|
---|
136 | // ds_22716_5743-6,-0.549489,-0.118439
|
---|
137 | // ds_31008_6550-30,-0.549489,-0.118439
|
---|
138 | // ds_72651_26831-6,-0.549489,-0.118439
|
---|
139 | // ds_26196_9214-18,-0.549489,-0.118439
|
---|
140 |
|
---|
141 |
|
---|
142 | WekaFindInstanceKNN.init(full_knn_model_filename);
|
---|
143 |
|
---|
144 | //Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours("ds_22716_5743",6,-0.549489,-0.118439,3);
|
---|
145 | String doc_id = query_string;
|
---|
146 |
|
---|
147 | int segment = 6;
|
---|
148 | //String segment_str = Integer.toString(segment);
|
---|
149 | String doc_id_segment = doc_id +"-" + segment;
|
---|
150 |
|
---|
151 | double arousal_val = -0.549489;
|
---|
152 | double valence_val = -0.118439;
|
---|
153 | int k_nearest_num = 3;
|
---|
154 |
|
---|
155 | System.err.println("doc_id_segment = " + doc_id_segment);
|
---|
156 |
|
---|
157 |
|
---|
158 | Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours(doc_id_segment,arousal_val,valence_val,k_nearest_num);
|
---|
159 |
|
---|
160 |
|
---|
161 | query_result_ = new Vector();
|
---|
162 |
|
---|
163 | /*
|
---|
164 | WekaDBDocInfo wekaDB_doc_info1 = new WekaDBDocInfo("ds_20415_2337",0.9,4);
|
---|
165 | query_result_.add(wekaDB_doc_info1);
|
---|
166 |
|
---|
167 | WekaDBDocInfo wekaDB_doc_info2 = new WekaDBDocInfo("ds_51017_15513",0.87,1);
|
---|
168 | query_result_.add(wekaDB_doc_info2);
|
---|
169 |
|
---|
170 | WekaDBDocInfo wekaDB_doc_info3 = new WekaDBDocInfo("ds_20415_2337",0.82,6);
|
---|
171 | query_result_.add(wekaDB_doc_info3);
|
---|
172 | */
|
---|
173 |
|
---|
174 | int clamped_k_nearest_num = Math.min(k_nearest_num,nearest_instances.size());
|
---|
175 |
|
---|
176 | for (int i=0; i<clamped_k_nearest_num; i++) {
|
---|
177 | Instance instance = nearest_instances.instance(i);
|
---|
178 | logger.info("\tProcessing returned instance: " + instance);
|
---|
179 |
|
---|
180 | //Attribute doc_id_segment_att = instance.attribute(0);
|
---|
181 | //String matching_doc_id_segment = instance.attribute(0).value(0);
|
---|
182 | String matching_doc_id_segment = instance.stringValue(0);
|
---|
183 | //double arousal_val = instance.attribute(1);
|
---|
184 | //double valence_val = instance.attribute(2);
|
---|
185 |
|
---|
186 |
|
---|
187 | Pattern p = Pattern.compile("^(\\w+)-(\\d+)$");
|
---|
188 | Matcher m = p.matcher(matching_doc_id_segment);
|
---|
189 | if (m.matches()) {
|
---|
190 |
|
---|
191 | String matching_doc_id = m.group(1);
|
---|
192 | int matching_segment_offset = Integer.parseInt(m.group(2));
|
---|
193 |
|
---|
194 | double matching_rank = 0.9;
|
---|
195 |
|
---|
196 | logger.info("\tAdding in: matching_doc_id = " + matching_doc_id);
|
---|
197 | WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,matching_rank,matching_segment_offset);
|
---|
198 | query_result_.add(wekaDB_doc_info);
|
---|
199 | }
|
---|
200 | else {
|
---|
201 | logger.error("Returned AV k-nearest neighbour match '"+doc_id_segment+"' could not be parsed as <doc-id>-<segment>" );
|
---|
202 | }
|
---|
203 | }
|
---|
204 |
|
---|
205 | /*
|
---|
206 | int num_matches_within_track = 6;
|
---|
207 |
|
---|
208 |
|
---|
209 | first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
210 | // and now reset vectors to empty to be ready for next chain of values
|
---|
211 | rankVector = new Vector<Double>();
|
---|
212 | offsetVector = new Vector<Integer>();
|
---|
213 |
|
---|
214 |
|
---|
215 | rankVector.add(rank);
|
---|
216 | offsetVector.add(target_frame);
|
---|
217 |
|
---|
218 | addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
219 | */
|
---|
220 |
|
---|
221 | }
|
---|
222 |
|
---|
223 | public void runQueryOLD(String wekaDB_index_dir, String knn_model_file,
|
---|
224 | String assoc_index_dir, String query_string) {
|
---|
225 |
|
---|
226 | // combine index_dir with audiodb fileanem
|
---|
227 |
|
---|
228 | String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
|
---|
229 | String full_chr12_filename = assoc_index_dir + File.separatorChar
|
---|
230 | + query_string + File.separatorChar + "doc.chr12";
|
---|
231 |
|
---|
232 | int num_matches_within_track = 6;
|
---|
233 |
|
---|
234 | // ****
|
---|
235 | String [] cmd_array = new String[] {
|
---|
236 | "java", "-jar", "weka.jar",
|
---|
237 | "-d", full_knn_model_filename,
|
---|
238 | "-Q", "nsequence",
|
---|
239 | "-p", String.format("%d",offset_),
|
---|
240 | "-n", String.format("%d",num_matches_within_track),
|
---|
241 | "-l", String.format("%d",length_),
|
---|
242 | "-r", String.format("%d",max_docs_),
|
---|
243 | "-f", full_chr12_filename
|
---|
244 | };
|
---|
245 |
|
---|
246 | System.err.println("**** cmd_array = " + String.join(" ", cmd_array));
|
---|
247 |
|
---|
248 | Runtime runtime = Runtime.getRuntime();
|
---|
249 | try {
|
---|
250 | Process wekaDB_proc = runtime.exec(cmd_array);
|
---|
251 | //int exitVal = wekaDB_proc.waitFor();
|
---|
252 | //System.err.println("*** exit status = " + exitVal);
|
---|
253 |
|
---|
254 | InputStream wis = wekaDB_proc.getInputStream();
|
---|
255 | InputStreamReader wisr = new InputStreamReader(wis);
|
---|
256 | BufferedReader wbr = new BufferedReader(wisr);
|
---|
257 |
|
---|
258 | query_result_ = new Vector();
|
---|
259 |
|
---|
260 | boolean first_entry = true;
|
---|
261 | int line_count = 0;
|
---|
262 |
|
---|
263 | String root_doc_id = null;
|
---|
264 | Vector<Double> rankVector = new Vector<Double>();
|
---|
265 | Vector<Integer> offsetVector = new Vector<Integer>();
|
---|
266 |
|
---|
267 | // Example output
|
---|
268 | // D8 0.00105175
|
---|
269 | // 1.69786e-16 392 392
|
---|
270 | // 0.00113568 392 673
|
---|
271 | // 0.00127239 392 910
|
---|
272 | // 0.00139736 392 481
|
---|
273 | // 0.00145331 392 303
|
---|
274 | // D2 0.00429758
|
---|
275 | // 0.00403335 392 865
|
---|
276 | // 0.00411288 392 458
|
---|
277 | // 0.00442461 392 866
|
---|
278 | // 0.00444272 392 864
|
---|
279 | // 0.00447434 392 424
|
---|
280 | // ...
|
---|
281 |
|
---|
282 | String line;
|
---|
283 | while ((line = wbr.readLine()) != null) {
|
---|
284 | String[] tokens = line.split("\\s+");
|
---|
285 | line_count++;
|
---|
286 |
|
---|
287 | if (tokens.length==2) {
|
---|
288 | // processing a top-level doc line
|
---|
289 |
|
---|
290 | if (line_count>1) {
|
---|
291 | // struck new top-level entry => store vector vals for previous block
|
---|
292 |
|
---|
293 | first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
294 | // and now reset vectors to empty to be ready for next chain of values
|
---|
295 | rankVector = new Vector<Double>();
|
---|
296 | offsetVector = new Vector<Integer>();
|
---|
297 | }
|
---|
298 |
|
---|
299 | root_doc_id = tokens[0];
|
---|
300 | }
|
---|
301 | else {
|
---|
302 | // should be 3 items
|
---|
303 | double euclidean_dist = Double.parseDouble(tokens[0]);
|
---|
304 | int src_frame = Integer.parseInt(tokens[1]);
|
---|
305 | int target_frame = Integer.parseInt(tokens[2]);
|
---|
306 |
|
---|
307 | // ****
|
---|
308 |
|
---|
309 | // enforce 1.0 as upper limit due to rounding errors
|
---|
310 | // in audioDB distance calculations
|
---|
311 | double rank = Math.min(1.0 - euclidean_dist,1.0);
|
---|
312 |
|
---|
313 | if ((line_count==2) && (src_frame==target_frame)) {
|
---|
314 | // Found match with self
|
---|
315 | continue;
|
---|
316 | }
|
---|
317 |
|
---|
318 | rankVector.add(rank);
|
---|
319 | offsetVector.add(target_frame);
|
---|
320 | }
|
---|
321 |
|
---|
322 | }
|
---|
323 |
|
---|
324 | addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
325 |
|
---|
326 | wbr.close();
|
---|
327 |
|
---|
328 | // sort query_result_ on 'rank' field
|
---|
329 | // note: compareTo() method impelemented to sort into descending order
|
---|
330 |
|
---|
331 | Collections.sort(query_result_);
|
---|
332 |
|
---|
333 |
|
---|
334 | }
|
---|
335 | catch (Exception e) {
|
---|
336 | logger.error("Failed to execute the following command: " + String.join(" ", cmd_array));
|
---|
337 | e.printStackTrace();
|
---|
338 | }
|
---|
339 |
|
---|
340 | }
|
---|
341 |
|
---|
342 |
|
---|
343 | /** get the result out of the wrapper */
|
---|
344 | public Vector getQueryResult()
|
---|
345 | {
|
---|
346 | return query_result_;
|
---|
347 | }
|
---|
348 | }
|
---|
349 |
|
---|