Context Navigation

source: gs3-extensions/mars-src/trunk/src/java/org/greenstone/gsdl3/util/WekaDBWrapper.java@ 36857

Last change on this file since 36857 was 36857, checked in by davidb, 19 months ago
Service now showing some basic functionality to retrieve songs
File size: 10.7 KB

Line
1	/*
2	* WekaDBWrapper.java
3	* Copyright (C) 2011 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	import java.io.*;
22	import java.util.Vector;
23	import java.util.Collections;
24	import java.util.regex.Pattern;
25	import java.util.regex.Matcher;
26
27	import weka.core.Attribute;
28	import weka.core.Instance;
29	import weka.core.Instances;
30
31	import org.apache.log4j.*;
32
33	import org.greenstone.gsdl3.util.WekaFindInstanceKNN;
34
35	/** Java wrapper class for access to the Weka
36	* Devised (in the first instance) to operate as: java -jar weka.jar <arg1> <arg2>
37	*
38	* Inspired by MGSearchWrapper.java
39	*/
40
41	public class WekaDBWrapper
42	{
43	/** the query result, filled in by runQuery */
44	protected Vector query_result_;
45
46	protected int offset_ = 100;
47	protected int length_ = 20;
48
49	// Approximate matching not yet utilized
50	protected double radius_;
51
52	protected int max_docs_;
53
54	static Logger logger = Logger.getLogger (org.greenstone.gsdl3.util.WekaDBWrapper.class.getName ());
55
56	public WekaDBWrapper() {
57	query_result_ = null;
58	}
59
60	// query param methods
61
62	/** start point (offset) into the array of feature vectors for a track
63	- 100 by default which equals 10 seconds (assuming 0.1 frame size) */
64	public void setOffset(int offset) {
65	offset_ = offset;
66	}
67
68	/** the number of consecutive frames used in match
69	- 20 by default which equals 2 seconds (assuming 0.1 frame size) */
70	public void setLength(int length) {
71	length_ = length;
72	}
73
74	/** distance used in approximate matching support - default is 50 */
75	public void setRadius(double radius) {
76	radius_ = radius;
77	}
78
79	public void setMaxDocs(int max_docs) {
80	max_docs_ = max_docs;
81	}
82
83	/** returns a string with all the current query param settings */
84	// the following was in MG version, do we need this in WekaDB version? // ****
85	//public String getQueryParams() {}
86
87
88	protected boolean addQueryResult(boolean first_entry, String doc_id,
89	Vector<Double> rankVector, Vector<Integer> offsetVector)
90	{
91
92	if (first_entry) {
93	WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rankVector,offsetVector);
94	query_result_.add(wekaDB_doc_info);
95	first_entry = false;
96	}
97	else {
98	double rank = rankVector.get(0);
99	int offset = offsetVector.get(0);
100	WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(doc_id,rank,offset);
101
102	query_result_.add(wekaDB_doc_info);
103	}
104
105	return first_entry;
106	}
107
108
109	/** actually carry out the query.
110	Use the set methods to set query results.
111	Writes the result to query_result.
112	* - maintains state between requests as can be slow
113	* base_dir and index_path should join together to provide
114	* the absolute location of the mg index files eg ..../index/dtx/demo
115	* base_dir must end with a file separator (OS dependant)
116	*/
117
118
119	public void runQuery(String wekaDB_index_dir, String knn_model_file,
120	String assoc_index_dir, String query_string) {
121
122	// combine index_dir with audiodb fileanem
123
124	String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
125
126	//String full_chr12_filename = assoc_index_dir + File.separatorChar
127	// + query_string + File.separatorChar + "doc.chr12";
128
129	System.err.println("**** full knn model filename = " + full_knn_model_filename);
130
131	// Example returned result from Weka KNN
132	// => first line is the input instance ('filename+segment',Arousal,Valence)
133	// following (indented lines) nearest neighbour matches in same format
134	//
135	// ds_22716_5743-6,-0.549489,-0.118439
136	// ds_22716_5743-6,-0.549489,-0.118439
137	// ds_31008_6550-30,-0.549489,-0.118439
138	// ds_72651_26831-6,-0.549489,-0.118439
139	// ds_26196_9214-18,-0.549489,-0.118439
140
141
142	WekaFindInstanceKNN.init(full_knn_model_filename);
143
144	//Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours("ds_22716_5743",6,-0.549489,-0.118439,3);
145	String doc_id = query_string;
146
147	int segment = 6;
148	//String segment_str = Integer.toString(segment);
149	String doc_id_segment = doc_id +"-" + segment;
150
151	double arousal_val = -0.549489;
152	double valence_val = -0.118439;
153	int k_nearest_num = 3;
154
155	System.err.println("doc_id_segment = " + doc_id_segment);
156
157
158	Instances nearest_instances = WekaFindInstanceKNN.kNearestNeighbours(doc_id_segment,arousal_val,valence_val,k_nearest_num);
159
160
161	query_result_ = new Vector();
162
163	/*
164	WekaDBDocInfo wekaDB_doc_info1 = new WekaDBDocInfo("ds_20415_2337",0.9,4);
165	query_result_.add(wekaDB_doc_info1);
166
167	WekaDBDocInfo wekaDB_doc_info2 = new WekaDBDocInfo("ds_51017_15513",0.87,1);
168	query_result_.add(wekaDB_doc_info2);
169
170	WekaDBDocInfo wekaDB_doc_info3 = new WekaDBDocInfo("ds_20415_2337",0.82,6);
171	query_result_.add(wekaDB_doc_info3);
172	*/
173
174	int clamped_k_nearest_num = Math.min(k_nearest_num,nearest_instances.size());
175
176	for (int i=0; i<clamped_k_nearest_num; i++) {
177	Instance instance = nearest_instances.instance(i);
178	logger.info("\tProcessing returned instance: " + instance);
179
180	//Attribute doc_id_segment_att = instance.attribute(0);
181	//String matching_doc_id_segment = instance.attribute(0).value(0);
182	String matching_doc_id_segment = instance.stringValue(0);
183	//double arousal_val = instance.attribute(1);
184	//double valence_val = instance.attribute(2);
185
186
187	Pattern p = Pattern.compile("^(\\w+)-(\\d+)$");
188	Matcher m = p.matcher(matching_doc_id_segment);
189	if (m.matches()) {
190
191	String matching_doc_id = m.group(1);
192	int matching_segment_offset = Integer.parseInt(m.group(2));
193
194	double matching_rank = 0.9;
195
196	logger.info("\tAdding in: matching_doc_id = " + matching_doc_id);
197	WekaDBDocInfo wekaDB_doc_info = new WekaDBDocInfo(matching_doc_id,matching_rank,matching_segment_offset);
198	query_result_.add(wekaDB_doc_info);
199	}
200	else {
201	logger.error("Returned AV k-nearest neighbour match '"+doc_id_segment+"' could not be parsed as <doc-id>-<segment>" );
202	}
203	}
204
205	/*
206	int num_matches_within_track = 6;
207
208
209	first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
210	// and now reset vectors to empty to be ready for next chain of values
211	rankVector = new Vector<Double>();
212	offsetVector = new Vector<Integer>();
213
214
215	rankVector.add(rank);
216	offsetVector.add(target_frame);
217
218	addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
219	*/
220
221	}
222
223	public void runQueryOLD(String wekaDB_index_dir, String knn_model_file,
224	String assoc_index_dir, String query_string) {
225
226	// combine index_dir with audiodb fileanem
227
228	String full_knn_model_filename = wekaDB_index_dir + File.separatorChar + knn_model_file;
229	String full_chr12_filename = assoc_index_dir + File.separatorChar
230	+ query_string + File.separatorChar + "doc.chr12";
231
232	int num_matches_within_track = 6;
233
234	// ****
235	String [] cmd_array = new String[] {
236	"java", "-jar", "weka.jar",
237	"-d", full_knn_model_filename,
238	"-Q", "nsequence",
239	"-p", String.format("%d",offset_),
240	"-n", String.format("%d",num_matches_within_track),
241	"-l", String.format("%d",length_),
242	"-r", String.format("%d",max_docs_),
243	"-f", full_chr12_filename
244	};
245
246	System.err.println("**** cmd_array = " + String.join(" ", cmd_array));
247
248	Runtime runtime = Runtime.getRuntime();
249	try {
250	Process wekaDB_proc = runtime.exec(cmd_array);
251	//int exitVal = wekaDB_proc.waitFor();
252	//System.err.println("*** exit status = " + exitVal);
253
254	InputStream wis = wekaDB_proc.getInputStream();
255	InputStreamReader wisr = new InputStreamReader(wis);
256	BufferedReader wbr = new BufferedReader(wisr);
257
258	query_result_ = new Vector();
259
260	boolean first_entry = true;
261	int line_count = 0;
262
263	String root_doc_id = null;
264	Vector<Double> rankVector = new Vector<Double>();
265	Vector<Integer> offsetVector = new Vector<Integer>();
266
267	// Example output
268	// D8 0.00105175
269	// 1.69786e-16 392 392
270	// 0.00113568 392 673
271	// 0.00127239 392 910
272	// 0.00139736 392 481
273	// 0.00145331 392 303
274	// D2 0.00429758
275	// 0.00403335 392 865
276	// 0.00411288 392 458
277	// 0.00442461 392 866
278	// 0.00444272 392 864
279	// 0.00447434 392 424
280	// ...
281
282	String line;
283	while ((line = wbr.readLine()) != null) {
284	String[] tokens = line.split("\\s+");
285	line_count++;
286
287	if (tokens.length==2) {
288	// processing a top-level doc line
289
290	if (line_count>1) {
291	// struck new top-level entry => store vector vals for previous block
292
293	first_entry = addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
294	// and now reset vectors to empty to be ready for next chain of values
295	rankVector = new Vector<Double>();
296	offsetVector = new Vector<Integer>();
297	}
298
299	root_doc_id = tokens[0];
300	}
301	else {
302	// should be 3 items
303	double euclidean_dist = Double.parseDouble(tokens[0]);
304	int src_frame = Integer.parseInt(tokens[1]);
305	int target_frame = Integer.parseInt(tokens[2]);
306
307	// ****
308
309	// enforce 1.0 as upper limit due to rounding errors
310	// in audioDB distance calculations
311	double rank = Math.min(1.0 - euclidean_dist,1.0);
312
313	if ((line_count==2) && (src_frame==target_frame)) {
314	// Found match with self
315	continue;
316	}
317
318	rankVector.add(rank);
319	offsetVector.add(target_frame);
320	}
321
322	}
323
324	addQueryResult(first_entry,root_doc_id,rankVector,offsetVector);
325
326	wbr.close();
327
328	// sort query_result_ on 'rank' field
329	// note: compareTo() method impelemented to sort into descending order
330
331	Collections.sort(query_result_);
332
333
334	}
335	catch (Exception e) {
336	logger.error("Failed to execute the following command: " + String.join(" ", cmd_array));
337	e.printStackTrace();
338	}
339
340	}
341
342
343	/** get the result out of the wrapper */
344	public Vector getQueryResult()
345	{
346	return query_result_;
347	}
348	}
349

Note: See TracBrowser for help on using the repository browser.

Download in other formats: