[35177] | 1 | /*
|
---|
| 2 | * WekaDBWrapper.java
|
---|
| 3 | * Copyright (C) 2011 New Zealand Digital Library, http://www.nzdl.org
|
---|
| 4 | *
|
---|
| 5 | * This program is free software; you can redistribute it and/or modify
|
---|
| 6 | * it under the terms of the GNU General Public License as published by
|
---|
| 7 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 8 | * (at your option) any later version.
|
---|
| 9 | *
|
---|
| 10 | * This program is distributed in the hope that it will be useful,
|
---|
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 13 | * GNU General Public License for more details.
|
---|
| 14 | *
|
---|
| 15 | * You should have received a copy of the GNU General Public License
|
---|
| 16 | * along with this program; if not, write to the Free Software
|
---|
| 17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 18 | */
|
---|
| 19 | package org.greenstone.gsdl3.util;
|
---|
| 20 |
|
---|
| 21 | import java.io.*;
|
---|
| 22 | import java.util.Vector;
|
---|
| 23 | import java.util.Collections;
|
---|
[36857] | 24 | import java.util.regex.Pattern;
|
---|
| 25 | import java.util.regex.Matcher;
|
---|
[35177] | 26 |
|
---|
[36857] | 27 | import weka.core.Attribute;
|
---|
| 28 | import weka.core.Instance;
|
---|
| 29 | import weka.core.Instances;
|
---|
| 30 |
|
---|
[35177] | 31 | import org.apache.log4j.*;
|
---|
| 32 |
|
---|
[36857] | 33 | import org.greenstone.gsdl3.util.WekaFindInstanceKNN;
|
---|
| 34 |
|
---|
[35177] | 35 | /** Java wrapper class for access to the Weka
|
---|
| 36 | * Devised (in the first instance) to operate as: java -jar weka.jar <arg1> <arg2>
|
---|
| 37 | *
|
---|
| 38 | * Inspired by MGSearchWrapper.java
|
---|
| 39 | */
|
---|
| 40 |
|
---|
| 41 | public class WekaDBWrapper
|
---|
| 42 | {
|
---|
| 43 | /** the query result, filled in by runQuery */
|
---|
| 44 | protected Vector query_result_;
|
---|
| 45 |
|
---|
| 46 | protected int offset_ = 100;
|
---|
| 47 | protected int length_ = 20;
|
---|
| 48 |
|
---|
| 49 | // Approximate matching not yet utilized
|
---|
| 50 | protected double radius_;
|
---|
| 51 |
|
---|
| 52 | protected int max_docs_;
|
---|
| 53 |
|
---|
| 54 | static Logger logger = Logger.getLogger (org.greenstone.gsdl3.util.WekaDBWrapper.class.getName ());
|
---|
| 55 |
|
---|
| 56 | public WekaDBWrapper() {
|
---|
| 57 | query_result_ = null;
|
---|
| 58 | }
|
---|
| 59 |
|
---|
| 60 | // query param methods
|
---|
| 61 |
|
---|
| 62 | /** start point (offset) into the array of feature vectors for a track
|
---|
| 63 | - 100 by default which equals 10 seconds (assuming 0.1 frame size) */
|
---|
| 64 | public void setOffset(int offset) {
|
---|
| 65 | offset_ = offset;
|
---|
| 66 | }
|
---|
| 67 |
|
---|
| 68 | /** the number of consecutive frames used in match
|
---|
| 69 | - 20 by default which equals 2 seconds (assuming 0.1 frame size) */
|
---|
| 70 | public void setLength(int length) {
|
---|
| 71 | length_ = length;
|
---|
| 72 | }
|
---|
| 73 |
|
---|
| 74 | /** distance used in approximate matching support - default is 50 */
|
---|
| 75 | public void setRadius(double radius) {
|
---|
| 76 | radius_ = radius;
|
---|
| 77 | }
|
---|
| 78 |
|
---|
| 79 | public void setMaxDocs(int max_docs) {
|
---|
| 80 | max_docs_ = max_docs;
|
---|
| 81 | }
|
---|
| 82 |
|
---|
| 83 | /** returns a string with all the current query param settings */
|
---|
| 84 | // the following was in MG version, do we need this in WekaDB version? // ****
|
---|
| 85 | //public String getQueryParams() {}
|
---|
| 86 |
|
---|
| 87 |
|
---|
| 88 | protected boolean addQueryResult(boolean first_entry, String doc_id,
|
---|
| 89 | Vector<Double> rankVector, Vector<Integer> offsetVector)
|
---|
| 90 | {
|
---|
| 91 |
|
---|
| 92 | if (first_entry) {
|
---|
| 93 | WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rankVector,offsetVector);
|
---|
| 94 | query_result_.add(wekaDB_doc_info);
|
---|
| 95 | first_entry = false;
|
---|
| 96 | }
|
---|
| 97 | else {
|
---|
| 98 | double rank = rankVector.get(0);
|
---|
| 99 | int offset = offsetVector.get(0);
|
---|
| 100 | WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rank,offset);
|
---|
| 101 |
|
---|
| 102 | query_result_.add(wekaDB_doc_info);
|
---|
| 103 | }
|
---|
| 104 |
|
---|
| 105 | return first_entry;
|
---|
| 106 | }
|
---|
| 107 |
|
---|
| 108 |
|
---|
| 109 | /** actually carry out the query.
|
---|
| 110 | Use the set methods to set query results.
|
---|
| 111 | Writes the result to query_result.
|
---|
| 112 | * - maintains state between requests as can be slow
|
---|
| 113 | * base_dir and index_path should join together to provide
|
---|
| 114 | * the absolute location of the mg index files eg ..../index/dtx/demo
|
---|
| 115 | * base_dir must end with a file separator (OS dependant)
|
---|
| 116 | */
|
---|
[36857] | 117 |
|
---|
| 118 |
|
---|
[35177] | 119 | public void runQuery(String wekaDB_index_dir, String knn_model_file,
|
---|
| 120 | String assoc_index_dir, String query_string) {
|
---|
| 121 |
|
---|
| 122 | // combine index_dir with audiodb fileanem
|
---|
| 123 |
|
---|
| 124 | String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
|
---|
[36857] | 125 |
|
---|
| 126 | //String full_chr12_filename = assoc_index_dir + File.separatorChar
|
---|
| 127 | // + query_string + File.separatorChar + "doc.chr12";
|
---|
| 128 |
|
---|
| 129 | System.err.println("**** full knn model filename = " + full_knn_model_filename);
|
---|
| 130 |
|
---|
| 131 | // Example returned result from Weka KNN
|
---|
| 132 | // => first line is the input instance ('filename+segment',Arousal,Valence)
|
---|
| 133 | // following (indented lines) nearest neighbour matches in same format
|
---|
| 134 | //
|
---|
| 135 | // ds_22716_5743-6,-0.549489,-0.118439
|
---|
| 136 | // ds_22716_5743-6,-0.549489,-0.118439
|
---|
| 137 | // ds_31008_6550-30,-0.549489,-0.118439
|
---|
| 138 | // ds_72651_26831-6,-0.549489,-0.118439
|
---|
| 139 | // ds_26196_9214-18,-0.549489,-0.118439
|
---|
| 140 |
|
---|
| 141 |
|
---|
| 142 | WekaFindInstanceKNN.init(full_knn_model_filename);
|
---|
| 143 |
|
---|
| 144 | //Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours("ds_22716_5743",6,-0.549489,-0.118439,3);
|
---|
| 145 | String doc_id = query_string;
|
---|
| 146 |
|
---|
| 147 | int segment = 6;
|
---|
| 148 | //String segment_str = Integer.toString(segment);
|
---|
| 149 | String doc_id_segment = doc_id +"-" + segment;
|
---|
| 150 |
|
---|
| 151 | double arousal_val = -0.549489;
|
---|
| 152 | double valence_val = -0.118439;
|
---|
| 153 | int k_nearest_num = 3;
|
---|
| 154 |
|
---|
| 155 | System.err.println("doc_id_segment = " + doc_id_segment);
|
---|
| 156 |
|
---|
| 157 |
|
---|
| 158 | Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours(doc_id_segment,arousal_val,valence_val,k_nearest_num);
|
---|
| 159 |
|
---|
| 160 |
|
---|
| 161 | query_result_ = new Vector();
|
---|
| 162 |
|
---|
| 163 | /*
|
---|
| 164 | WekaDBDocInfo wekaDB_doc_info1 = new WekaDBDocInfo("ds_20415_2337",0.9,4);
|
---|
| 165 | query_result_.add(wekaDB_doc_info1);
|
---|
| 166 |
|
---|
| 167 | WekaDBDocInfo wekaDB_doc_info2 = new WekaDBDocInfo("ds_51017_15513",0.87,1);
|
---|
| 168 | query_result_.add(wekaDB_doc_info2);
|
---|
| 169 |
|
---|
| 170 | WekaDBDocInfo wekaDB_doc_info3 = new WekaDBDocInfo("ds_20415_2337",0.82,6);
|
---|
| 171 | query_result_.add(wekaDB_doc_info3);
|
---|
| 172 | */
|
---|
| 173 |
|
---|
| 174 | int clamped_k_nearest_num = Math.min(k_nearest_num,nearest_instances.size());
|
---|
| 175 |
|
---|
| 176 | for (int i=0; i<clamped_k_nearest_num; i++) {
|
---|
| 177 | Instance instance = nearest_instances.instance(i);
|
---|
| 178 | logger.info("\tProcessing returned instance: " + instance);
|
---|
| 179 |
|
---|
| 180 | //Attribute doc_id_segment_att = instance.attribute(0);
|
---|
| 181 | //String matching_doc_id_segment = instance.attribute(0).value(0);
|
---|
| 182 | String matching_doc_id_segment = instance.stringValue(0);
|
---|
| 183 | //double arousal_val = instance.attribute(1);
|
---|
| 184 | //double valence_val = instance.attribute(2);
|
---|
| 185 |
|
---|
| 186 |
|
---|
| 187 | Pattern p = Pattern.compile("^(\\w+)-(\\d+)$");
|
---|
| 188 | Matcher m = p.matcher(matching_doc_id_segment);
|
---|
| 189 | if (m.matches()) {
|
---|
| 190 |
|
---|
| 191 | String matching_doc_id = m.group(1);
|
---|
| 192 | int matching_segment_offset = Integer.parseInt(m.group(2));
|
---|
| 193 |
|
---|
| 194 | double matching_rank = 0.9;
|
---|
| 195 |
|
---|
| 196 | logger.info("\tAdding in: matching_doc_id = " + matching_doc_id);
|
---|
| 197 | WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,matching_rank,matching_segment_offset);
|
---|
| 198 | query_result_.add(wekaDB_doc_info);
|
---|
| 199 | }
|
---|
| 200 | else {
|
---|
| 201 | logger.error("Returned AV k-nearest neighbour match '"+doc_id_segment+"' could not be parsed as <doc-id>-<segment>" );
|
---|
| 202 | }
|
---|
| 203 | }
|
---|
| 204 |
|
---|
| 205 | /*
|
---|
| 206 | int num_matches_within_track = 6;
|
---|
| 207 |
|
---|
| 208 |
|
---|
| 209 | first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
| 210 | // and now reset vectors to empty to be ready for next chain of values
|
---|
| 211 | rankVector = new Vector<Double>();
|
---|
| 212 | offsetVector = new Vector<Integer>();
|
---|
| 213 |
|
---|
| 214 |
|
---|
| 215 | rankVector.add(rank);
|
---|
| 216 | offsetVector.add(target_frame);
|
---|
| 217 |
|
---|
| 218 | addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
| 219 | */
|
---|
| 220 |
|
---|
| 221 | }
|
---|
| 222 |
|
---|
| 223 | public void runQueryOLD(String wekaDB_index_dir, String knn_model_file,
|
---|
| 224 | String assoc_index_dir, String query_string) {
|
---|
| 225 |
|
---|
| 226 | // combine index_dir with audiodb fileanem
|
---|
| 227 |
|
---|
| 228 | String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
|
---|
[35177] | 229 | String full_chr12_filename = assoc_index_dir + File.separatorChar
|
---|
| 230 | + query_string + File.separatorChar + "doc.chr12";
|
---|
| 231 |
|
---|
| 232 | int num_matches_within_track = 6;
|
---|
| 233 |
|
---|
| 234 | // ****
|
---|
| 235 | String [] cmd_array = new String[] {
|
---|
[36857] | 236 | "java", "-jar", "weka.jar",
|
---|
[35177] | 237 | "-d", full_knn_model_filename,
|
---|
| 238 | "-Q", "nsequence",
|
---|
| 239 | "-p", String.format("%d",offset_),
|
---|
| 240 | "-n", String.format("%d",num_matches_within_track),
|
---|
| 241 | "-l", String.format("%d",length_),
|
---|
| 242 | "-r", String.format("%d",max_docs_),
|
---|
| 243 | "-f", full_chr12_filename
|
---|
| 244 | };
|
---|
| 245 |
|
---|
| 246 | System.err.println("**** cmd_array = " + String.join(" ", cmd_array));
|
---|
| 247 |
|
---|
| 248 | Runtime runtime = Runtime.getRuntime();
|
---|
| 249 | try {
|
---|
| 250 | Process wekaDB_proc = runtime.exec(cmd_array);
|
---|
| 251 | //int exitVal = wekaDB_proc.waitFor();
|
---|
| 252 | //System.err.println("*** exit status = " + exitVal);
|
---|
| 253 |
|
---|
| 254 | InputStream wis = wekaDB_proc.getInputStream();
|
---|
| 255 | InputStreamReader wisr = new InputStreamReader(wis);
|
---|
| 256 | BufferedReader wbr = new BufferedReader(wisr);
|
---|
| 257 |
|
---|
| 258 | query_result_ = new Vector();
|
---|
| 259 |
|
---|
| 260 | boolean first_entry = true;
|
---|
| 261 | int line_count = 0;
|
---|
| 262 |
|
---|
| 263 | String root_doc_id = null;
|
---|
| 264 | Vector<Double> rankVector = new Vector<Double>();
|
---|
| 265 | Vector<Integer> offsetVector = new Vector<Integer>();
|
---|
| 266 |
|
---|
| 267 | // Example output
|
---|
| 268 | // D8 0.00105175
|
---|
| 269 | // 1.69786e-16 392 392
|
---|
| 270 | // 0.00113568 392 673
|
---|
| 271 | // 0.00127239 392 910
|
---|
| 272 | // 0.00139736 392 481
|
---|
| 273 | // 0.00145331 392 303
|
---|
| 274 | // D2 0.00429758
|
---|
| 275 | // 0.00403335 392 865
|
---|
| 276 | // 0.00411288 392 458
|
---|
| 277 | // 0.00442461 392 866
|
---|
| 278 | // 0.00444272 392 864
|
---|
| 279 | // 0.00447434 392 424
|
---|
| 280 | // ...
|
---|
| 281 |
|
---|
| 282 | String line;
|
---|
| 283 | while ((line = wbr.readLine()) != null) {
|
---|
| 284 | String[] tokens = line.split("\\s+");
|
---|
| 285 | line_count++;
|
---|
| 286 |
|
---|
| 287 | if (tokens.length==2) {
|
---|
| 288 | // processing a top-level doc line
|
---|
| 289 |
|
---|
| 290 | if (line_count>1) {
|
---|
| 291 | // struck new top-level entry => store vector vals for previous block
|
---|
| 292 |
|
---|
| 293 | first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
| 294 | // and now reset vectors to empty to be ready for next chain of values
|
---|
| 295 | rankVector = new Vector<Double>();
|
---|
| 296 | offsetVector = new Vector<Integer>();
|
---|
| 297 | }
|
---|
| 298 |
|
---|
| 299 | root_doc_id = tokens[0];
|
---|
| 300 | }
|
---|
| 301 | else {
|
---|
| 302 | // should be 3 items
|
---|
| 303 | double euclidean_dist = Double.parseDouble(tokens[0]);
|
---|
| 304 | int src_frame = Integer.parseInt(tokens[1]);
|
---|
| 305 | int target_frame = Integer.parseInt(tokens[2]);
|
---|
| 306 |
|
---|
| 307 | // ****
|
---|
| 308 |
|
---|
| 309 | // enforce 1.0 as upper limit due to rounding errors
|
---|
| 310 | // in audioDB distance calculations
|
---|
| 311 | double rank = Math.min(1.0 - euclidean_dist,1.0);
|
---|
| 312 |
|
---|
| 313 | if ((line_count==2) && (src_frame==target_frame)) {
|
---|
| 314 | // Found match with self
|
---|
| 315 | continue;
|
---|
| 316 | }
|
---|
| 317 |
|
---|
| 318 | rankVector.add(rank);
|
---|
| 319 | offsetVector.add(target_frame);
|
---|
| 320 | }
|
---|
| 321 |
|
---|
| 322 | }
|
---|
| 323 |
|
---|
| 324 | addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
|
---|
| 325 |
|
---|
| 326 | wbr.close();
|
---|
| 327 |
|
---|
| 328 | // sort query_result_ on 'rank' field
|
---|
| 329 | // note: compareTo() method impelemented to sort into descending order
|
---|
| 330 |
|
---|
| 331 | Collections.sort(query_result_);
|
---|
| 332 |
|
---|
| 333 |
|
---|
| 334 | }
|
---|
| 335 | catch (Exception e) {
|
---|
| 336 | logger.error("Failed to execute the following command: " + String.join(" ", cmd_array));
|
---|
| 337 | e.printStackTrace();
|
---|
| 338 | }
|
---|
| 339 |
|
---|
| 340 | }
|
---|
| 341 |
|
---|
| 342 |
|
---|
| 343 | /** get the result out of the wrapper */
|
---|
| 344 | public Vector getQueryResult()
|
---|
| 345 | {
|
---|
| 346 | return query_result_;
|
---|
| 347 | }
|
---|
| 348 | }
|
---|
| 349 |
|
---|