Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

AudioDocument.java@ 29648

Last change on this file since 29648 was 29647, checked in by jmt12, 9 years ago
Document class to support audio files with the fileindexer application
File size: 12.8 KB

Line
1	/**
2	* Adding support for Audio Files in Terrier
3	* @author: John Thompson, jmt12, #9826509
4	*
5	* The contents of this file are subject to the Mozilla Public License
6	* Version 1.1 (the "License"); you may not use this file except in
7	* compliance with the License. You may obtain a copy of the License at
8	* http://www.mozilla.org/MPL/
9	*
10	* Software distributed under the License is distributed on an "AS IS"
11	* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12	* the License for the specific language governing rights and limitations
13	* under the License.
14	*
15	* Copyright (c) 2014 The University of Waikato. All Rights Reserved.
16	*/
17	package org.terrier.indexing;
18
19	import java.io.InputStream;
20	import java.io.StringReader;
21	import java.io.Reader;
22	import java.nio.charset.Charset;
23	import java.nio.file.Files;
24	import java.nio.file.Path;
25	import java.nio.file.Paths;
26	import java.security.MessageDigest;
27	import java.security.NoSuchAlgorithmException;
28	import java.util.Collections;
29	import java.util.Arrays;
30	import java.util.Map;
31	import java.util.Set;
32
33	import org.apache.log4j.Logger;
34	import org.terrier.indexing.StreamGobbler;
35	import org.terrier.indexing.tokenisation.TokenStream;
36	import org.terrier.indexing.tokenisation.Tokeniser;
37	import org.terrier.utility.ApplicationSetup;
38
39	public class AudioDocument
40	implements Document
41	{
42	protected boolean debug = false;
43	/** A reference to the logger for messaging */
44	protected static final Logger logger = Logger.getLogger(FileDocument.class);
45	/** The map of properties (fields) for this document. */
46	protected Map<String,String> properties;
47	/** A reader built from a dummy text string. */
48	protected Reader reader;
49	/** A token stream produced by the configured tokeniser when feed the dummy
50	* reader.
51	*/
52	protected TokenStream tokenizer;
53
54	/ The streaming filetype. /
55	protected final String preview_format = ApplicationSetup.getProperty("AudioDocument.streaming_format", "flv");
56	/ Optional feature extraction /
57	protected String calculate_fft_features = ApplicationSetup.getProperty("AudioDocument.fft_features", "true");
58	protected String calculate_mir_features = ApplicationSetup.getProperty("AudioDocument.mir_features", "true");
59
/** Default constructor. Performs no initialisation at all: the properties,
 *  reader and tokenizer members keep their default (null) values.
 */
protected AudioDocument()
{
}
62
63	/** Constructs an instance of the AudioDocument from the given input stream.
64	* @param docStream the input stream that reads the file.
65	* @param docProperties the initial properties (docno, filename)
66	* @param tok the tokeniser defined for this collection
67	*/
68	public AudioDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
69	{
70	if (this.debug)
71	{
72	logger.info("AudioDocument::AudioDocument()");
73	}
74	// Initialization from arguments
75	this.properties = default_properties;
76
77	// Set properties
78	if (this.debug)
79	{
80	logger.info("AudioDocument - current properties");
81	for (Map.Entry<String, String> entry : this.properties.entrySet())
82	{
83	logger.info(entry.getKey() + "=" + entry.getValue());
84	}
85	}
86
87	if (this.debug)
88	{
89	logger.info("ImageDocument - extracting properties");
90	}
91	// A. Hardcoded properties
92	this.properties.put("parser", "AudioDocument");
93	this.properties.put("abstract", "This is audio so here is some dummy text to prevent indexer failing.");
94	// B. Properties derived from filename
95	// - A simple title for the document
96	String filepath = this.properties.get("filename");
97	String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
98	this.properties.put("title", title);
99	String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
100	// - The name of the copy of the original document
101	String target_filename = "doc." + ext;
102	this.properties.put("source","doc." + ext);
103	// - A unique associated directory. This gets a little tricky as we need
104	// to create the directory at the same time if an effort to promote
105	// synchronous behaviour
106	String unique_id = this.generateHash(filepath);
107	// - we start with the first 4 characters
108	int offset = 0;
109	String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
110	// - we add ".dir" as a suffix to the directory that actually contains
111	// files (so the non-suffixed version contains nested directories)
112	Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
113	// - then we continue adding blocks of 4 characters until we get a
114	// directory that doesn't already exist
115	while (assoc_path.toFile().exists() && offset < unique_id.length())
116	{
117	offset += 4;
118	assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
119	assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
120	}
121	// - still not unique? but run out of unique_id... time to complain
122	if (assoc_path.toFile().exists())
123	{
124	logger.error("ImageDoument - can't determine unique assocfilepath");
125	System.exit(0);
126	}
127	// - create the directories quick... hopefully before someone else does
128	assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
129	this.properties.put("assocfile", assoc_filename);
130
131	// Copy (symlink) the file into place in the shared directory
132	Path source_path = Paths.get(properties.get("filename"));
133	Path target_path = assoc_path.resolve(target_filename);
134	if (target_path.toFile().exists())
135	{
136	if (this.debug)
137	{
138	logger.info("AudioDocument - removing existing (old) associated image");
139	}
140	try
141	{
142	Files.delete(target_path);
143	}
144	catch (Exception e)
145	{
146	logger.error("Exception while deleting old image: ", e);
147	}
148	}
149	if (this.debug)
150	{
151	logger.info("AudioDocument - symlinking image into assoc directory");
152	}
153	try
154	{
155	Files.createSymbolicLink(target_path, source_path);
156	}
157	// not supported? We'll try copying below
158	catch (UnsupportedOperationException ex)
159	{
160	}
161	// All other exceptions can be fatal
162	catch (Exception e)
163	{
164	logger.error("Exception while symlinking image: ", e);
165	}
166	// - copy if the file doesn't exist yet
167	if (!target_path.toFile().exists())
168	{
169	if (this.debug)
170	{
171	logger.info("ImageDocument - symlink failed, copying instead");
172	}
173	try
174	{
175	Files.copy(source_path, target_path);
176	}
177	// Fatality!
178	catch (Exception e)
179	{
180	logger.error("Exception while copying image: ", e);
181	}
182	}
183
184	// Generate streaming audio version
185	if (this.debug)
186	{
187	logger.info("AudioDocument - generate streaming version");
188	}
189	try
190	{
191	String streaming_filename = this.generatePreview(source_path, assoc_path);
192	this.properties.put("streaming", streaming_filename);
193	}
194	catch (Exception e)
195	{
196	logger.error("Exception while generating preview image: ", e);
197	}
198
199	// Extracting Fast Fourier Transform features
200	if (this.calculate_fft_features.equals("true"))
201	{
202	if (this.debug)
203	{
204	logger.info("AudioDocument - generate and record FFT features");
205	}
206	try
207	{
208	String chroma_filename = this.calculateFFTFeatures(source_path, assoc_path, "chroma");
209	this.properties.put("fft-chroma", chroma_filename);
210	String powerlog_filename = this.calculateFFTFeatures(source_path, assoc_path, "power-log");
211	this.properties.put("fft-power-log", powerlog_filename);
212	}
213	catch (Exception e)
214	{
215	logger.error("Exception while generating preview image: ", e);
216	}
217	}
218
219	// Extracting Music IR features
220	if (this.calculate_mir_features.equals("true"))
221	{
222	if (this.debug)
223	{
224	logger.info("AudioDocument - generate and record MIR features");
225	}
226	try
227	{
228	String mir_filename = this.calculateMIRFeatures(source_path, assoc_path);
229	}
230	catch (Exception e)
231	{
232	logger.error("Exception while generating preview image: ", e);
233	}
234	}
235
236	// Create a dummy reader around some dummy text and then tokenize it
237	if (this.debug)
238	{
239	logger.info("AudioDocument - feed dummy text as token stream to indexer");
240	}
241	try
242	{
243	this.reader = new StringReader(this.properties.get("abstract"));
244	this.tokenizer = tok.tokenise(this.reader);
245	}
246	catch (Exception e)
247	{
248	logger.error("Exception while creating dummy text stream: ", e);
249	}
250
251	if (this.debug)
252	{
253	logger.info("AudioDocument - Complete!");
254	}
255	}
256	/ ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) /
257
258	/** Returns true when the end of the document has been reached, and there
259	* are no other terms to be retrieved from it.
260	* @return boolean true if there are no more terms in the document, otherwise
261	* it returns false.
262	*/
263	public boolean endOfDocument()
264	{
265	return !this.tokenizer.hasNext();
266	}
267	/ endOfDocument() /
268
269	/** Use ImageMagick to generate a preview image.
270	* @pre assumes you have ImageMagick installed and available on Path
271	* @pre uses member variables preview_format and preview_width
272	* @return the filename of the preview image (within the assoc directory)
273	*/
274	private String generatePreview(Path source_path, Path assoc_path)
275	throws Exception
276	{
277	String preview_filename = "preview." + this.preview_format;
278	Path preview_path = assoc_path.resolve(preview_filename);
279	String convert_command[] = {
280	"convert",
281	source_path.toString(),
282	"-resize",
283	this.preview_width + "x",
284	preview_path.toString()
285	};
286	logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
287	Process convert_process = Runtime.getRuntime().exec(convert_command);
288	// Gobble up the streams to prevent them hanging the process when buffers
289	// are full
290	StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
291	convert_process_error_gobbler.start();
292	StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
293	convert_process_input_gobbler.start();
294	// Let the conversion finish
295	int convert_status = convert_process.waitFor();
296	if (convert_status != 0 \|\| !preview_path.toFile().exists())
297	{
298	throw new Exception("Convert command failed (exit status: " + convert_status + ")");
299	}
300	return preview_filename;
301	}
302	/ generatePreview(Path, Path) /
303
304	/** Returns the underlying map of all the properties defined by this Document.
305	* @since 1.1.0
306	*/
307	public Map<String,String> getAllProperties()
308	{
309	return this.properties;
310	}
311	/ getAllProperties() /
312
313	/** Returns a list of the fields the current term appears in.
314	* @return HashSet a set of the terms that the current term appears in.
315	*/
316	public Set<String> getFields()
317	{
318	// Returns null because there is no support for fields with file documents.
319	return Collections.emptySet();
320	}
321	/ getFields() /
322
323	/** Gets the next term of the document.
324	* <B>NB:</B>Null string returned from getNextTerm() should
325	* be ignored. They do not signify the lack of any more terms.
326	* endOfDocument() should be used to check that.
327	* @return String the next term of the document. Null returns should be
328	* ignored.
329	*/
330	public String getNextTerm()
331	{
332	return this.tokenizer.next();
333	}
334	/ getNextTerm() /
335
336	/** Allows access to a named property of the Document. Examples might be URL,
337	* filename etc.
338	* @param name Name of the property. It is suggested, but not required that
339	* this name should not be case insensitive.
340	* @since 1.1.0
341	*/
342	public String getProperty(String name)
343	{
344	return this.properties.get(name.toLowerCase());
345	}
346	/ getProperty(String name) /
347
348	/** Returns a Reader object so client code can tokenise the document
349	* or deal with the document itself. Examples might be extracting URLs,
350	* language detection. */
351	public Reader getReader()
352	{
353	return this.reader;
354	}
355	/ getReader() /
356
357	/**
358	*/
359	private String generateHash(String string)
360	{
361	StringBuffer sb = new StringBuffer();
362	try
363	{
364	final MessageDigest message_digest = MessageDigest.getInstance("MD5");
365	message_digest.reset();
366	message_digest.update(string.getBytes(Charset.forName("UTF8")));
367	final byte[] result_bytes = message_digest.digest();
368	for (int i = 0; i < result_bytes.length; ++i)
369	{
370	sb.append(Integer.toHexString((result_bytes[i] & 0xFF) \| 0x100).substring(1,3));
371	}
372	}
373	catch (NoSuchAlgorithmException e)
374	{
375	System.err.println("Exception: " + e);
376	System.exit(0);
377	}
378	return sb.toString();
379	}
380	/ generateHash(String) /
381	}
382

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/AudioDocument.java@ 29648

Download in other formats: