source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/AudioDocument.java@ 29648

Last change on this file since 29648 was 29647, checked in by jmt12, 9 years ago

Document class to support audio files with the fileindexer application

File size: 12.8 KB
Line 
1/**
2 * Adding support for Audio Files in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2014 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.InputStream;
20import java.io.StringReader;
21import java.io.Reader;
22import java.nio.charset.Charset;
23import java.nio.file.Files;
24import java.nio.file.Path;
25import java.nio.file.Paths;
26import java.security.MessageDigest;
27import java.security.NoSuchAlgorithmException;
28import java.util.Collections;
29import java.util.Arrays;
30import java.util.Map;
31import java.util.Set;
32
33import org.apache.log4j.Logger;
34import org.terrier.indexing.StreamGobbler;
35import org.terrier.indexing.tokenisation.TokenStream;
36import org.terrier.indexing.tokenisation.Tokeniser;
37import org.terrier.utility.ApplicationSetup;
38
39public class AudioDocument
40 implements Document
41{
42 protected boolean debug = false;
43 /** A reference to the logger for messaging */
44 protected static final Logger logger = Logger.getLogger(FileDocument.class);
45 /** The map of properties (fields) for this document. */
46 protected Map<String,String> properties;
47 /** A reader built from a dummy text string. */
48 protected Reader reader;
49 /** A token stream produced by the configured tokeniser when feed the dummy
50 * reader.
51 */
52 protected TokenStream tokenizer;
53
54 /** The streaming filetype. **/
55 protected final String preview_format = ApplicationSetup.getProperty("AudioDocument.streaming_format", "flv");
56 /** Optional feature extraction **/
57 protected String calculate_fft_features = ApplicationSetup.getProperty("AudioDocument.fft_features", "true");
58 protected String calculate_mir_features = ApplicationSetup.getProperty("AudioDocument.mir_features", "true");
59
60 /** Default constructor. **/
61 protected AudioDocument() {}
62
63 /** Constructs an instance of the AudioDocument from the given input stream.
64 * @param docStream the input stream that reads the file.
65 * @param docProperties the initial properties (docno, filename)
66 * @param tok the tokeniser defined for this collection
67 */
68 public AudioDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
69 {
70 if (this.debug)
71 {
72 logger.info("AudioDocument::AudioDocument()");
73 }
74 // Initialization from arguments
75 this.properties = default_properties;
76
77 // Set properties
78 if (this.debug)
79 {
80 logger.info("AudioDocument - current properties");
81 for (Map.Entry<String, String> entry : this.properties.entrySet())
82 {
83 logger.info(entry.getKey() + "=" + entry.getValue());
84 }
85 }
86
87 if (this.debug)
88 {
89 logger.info("ImageDocument - extracting properties");
90 }
91 // A. Hardcoded properties
92 this.properties.put("parser", "AudioDocument");
93 this.properties.put("abstract", "This is audio so here is some dummy text to prevent indexer failing.");
94 // B. Properties derived from filename
95 // - A simple title for the document
96 String filepath = this.properties.get("filename");
97 String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
98 this.properties.put("title", title);
99 String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
100 // - The name of the copy of the original document
101 String target_filename = "doc." + ext;
102 this.properties.put("source","doc." + ext);
103 // - A unique associated directory. This gets a little tricky as we need
104 // to create the directory at the same time if an effort to promote
105 // synchronous behaviour
106 String unique_id = this.generateHash(filepath);
107 // - we start with the first 4 characters
108 int offset = 0;
109 String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
110 // - we add ".dir" as a suffix to the directory that actually contains
111 // files (so the non-suffixed version contains nested directories)
112 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
113 // - then we continue adding blocks of 4 characters until we get a
114 // directory that doesn't already exist
115 while (assoc_path.toFile().exists() && offset < unique_id.length())
116 {
117 offset += 4;
118 assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
119 assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
120 }
121 // - still not unique? but run out of unique_id... time to complain
122 if (assoc_path.toFile().exists())
123 {
124 logger.error("ImageDoument - can't determine unique assocfilepath");
125 System.exit(0);
126 }
127 // - create the directories quick... hopefully before someone else does
128 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
129 this.properties.put("assocfile", assoc_filename);
130
131 // Copy (symlink) the file into place in the shared directory
132 Path source_path = Paths.get(properties.get("filename"));
133 Path target_path = assoc_path.resolve(target_filename);
134 if (target_path.toFile().exists())
135 {
136 if (this.debug)
137 {
138 logger.info("AudioDocument - removing existing (old) associated image");
139 }
140 try
141 {
142 Files.delete(target_path);
143 }
144 catch (Exception e)
145 {
146 logger.error("Exception while deleting old image: ", e);
147 }
148 }
149 if (this.debug)
150 {
151 logger.info("AudioDocument - symlinking image into assoc directory");
152 }
153 try
154 {
155 Files.createSymbolicLink(target_path, source_path);
156 }
157 // not supported? We'll try copying below
158 catch (UnsupportedOperationException ex)
159 {
160 }
161 // All other exceptions can be fatal
162 catch (Exception e)
163 {
164 logger.error("Exception while symlinking image: ", e);
165 }
166 // - copy if the file doesn't exist yet
167 if (!target_path.toFile().exists())
168 {
169 if (this.debug)
170 {
171 logger.info("ImageDocument - symlink failed, copying instead");
172 }
173 try
174 {
175 Files.copy(source_path, target_path);
176 }
177 // Fatality!
178 catch (Exception e)
179 {
180 logger.error("Exception while copying image: ", e);
181 }
182 }
183
184 // Generate streaming audio version
185 if (this.debug)
186 {
187 logger.info("AudioDocument - generate streaming version");
188 }
189 try
190 {
191 String streaming_filename = this.generatePreview(source_path, assoc_path);
192 this.properties.put("streaming", streaming_filename);
193 }
194 catch (Exception e)
195 {
196 logger.error("Exception while generating preview image: ", e);
197 }
198
199 // Extracting Fast Fourier Transform features
200 if (this.calculate_fft_features.equals("true"))
201 {
202 if (this.debug)
203 {
204 logger.info("AudioDocument - generate and record FFT features");
205 }
206 try
207 {
208 String chroma_filename = this.calculateFFTFeatures(source_path, assoc_path, "chroma");
209 this.properties.put("fft-chroma", chroma_filename);
210 String powerlog_filename = this.calculateFFTFeatures(source_path, assoc_path, "power-log");
211 this.properties.put("fft-power-log", powerlog_filename);
212 }
213 catch (Exception e)
214 {
215 logger.error("Exception while generating preview image: ", e);
216 }
217 }
218
219 // Extracting Music IR features
220 if (this.calculate_mir_features.equals("true"))
221 {
222 if (this.debug)
223 {
224 logger.info("AudioDocument - generate and record MIR features");
225 }
226 try
227 {
228 String mir_filename = this.calculateMIRFeatures(source_path, assoc_path);
229 }
230 catch (Exception e)
231 {
232 logger.error("Exception while generating preview image: ", e);
233 }
234 }
235
236 // Create a dummy reader around some dummy text and then tokenize it
237 if (this.debug)
238 {
239 logger.info("AudioDocument - feed dummy text as token stream to indexer");
240 }
241 try
242 {
243 this.reader = new StringReader(this.properties.get("abstract"));
244 this.tokenizer = tok.tokenise(this.reader);
245 }
246 catch (Exception e)
247 {
248 logger.error("Exception while creating dummy text stream: ", e);
249 }
250
251 if (this.debug)
252 {
253 logger.info("AudioDocument - Complete!");
254 }
255 }
256 /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
257
258 /** Returns true when the end of the document has been reached, and there
259 * are no other terms to be retrieved from it.
260 * @return boolean true if there are no more terms in the document, otherwise
261 * it returns false.
262 */
263 public boolean endOfDocument()
264 {
265 return !this.tokenizer.hasNext();
266 }
267 /** endOfDocument() **/
268
269 /** Use ImageMagick to generate a preview image.
270 * @pre assumes you have ImageMagick installed and available on Path
271 * @pre uses member variables preview_format and preview_width
272 * @return the filename of the preview image (within the assoc directory)
273 */
274 private String generatePreview(Path source_path, Path assoc_path)
275 throws Exception
276 {
277 String preview_filename = "preview." + this.preview_format;
278 Path preview_path = assoc_path.resolve(preview_filename);
279 String convert_command[] = {
280 "convert",
281 source_path.toString(),
282 "-resize",
283 this.preview_width + "x",
284 preview_path.toString()
285 };
286 logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
287 Process convert_process = Runtime.getRuntime().exec(convert_command);
288 // Gobble up the streams to prevent them hanging the process when buffers
289 // are full
290 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
291 convert_process_error_gobbler.start();
292 StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
293 convert_process_input_gobbler.start();
294 // Let the conversion finish
295 int convert_status = convert_process.waitFor();
296 if (convert_status != 0 || !preview_path.toFile().exists())
297 {
298 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
299 }
300 return preview_filename;
301 }
302 /** generatePreview(Path, Path) **/
303
304 /** Returns the underlying map of all the properties defined by this Document.
305 * @since 1.1.0
306 */
307 public Map<String,String> getAllProperties()
308 {
309 return this.properties;
310 }
311 /** getAllProperties() **/
312
313 /** Returns a list of the fields the current term appears in.
314 * @return HashSet a set of the terms that the current term appears in.
315 */
316 public Set<String> getFields()
317 {
318 // Returns null because there is no support for fields with file documents.
319 return Collections.emptySet();
320 }
321 /** getFields() **/
322
323 /** Gets the next term of the document.
324 * <B>NB:</B>Null string returned from getNextTerm() should
325 * be ignored. They do not signify the lack of any more terms.
326 * endOfDocument() should be used to check that.
327 * @return String the next term of the document. Null returns should be
328 * ignored.
329 */
330 public String getNextTerm()
331 {
332 return this.tokenizer.next();
333 }
334 /** getNextTerm() **/
335
336 /** Allows access to a named property of the Document. Examples might be URL,
337 * filename etc.
338 * @param name Name of the property. It is suggested, but not required that
339 * this name should not be case insensitive.
340 * @since 1.1.0
341 */
342 public String getProperty(String name)
343 {
344 return this.properties.get(name.toLowerCase());
345 }
346 /** getProperty(String name) **/
347
348 /** Returns a Reader object so client code can tokenise the document
349 * or deal with the document itself. Examples might be extracting URLs,
350 * language detection. */
351 public Reader getReader()
352 {
353 return this.reader;
354 }
355 /** getReader() **/
356
357 /**
358 */
359 private String generateHash(String string)
360 {
361 StringBuffer sb = new StringBuffer();
362 try
363 {
364 final MessageDigest message_digest = MessageDigest.getInstance("MD5");
365 message_digest.reset();
366 message_digest.update(string.getBytes(Charset.forName("UTF8")));
367 final byte[] result_bytes = message_digest.digest();
368 for (int i = 0; i < result_bytes.length; ++i)
369 {
370 sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1,3));
371 }
372 }
373 catch (NoSuchAlgorithmException e)
374 {
375 System.err.println("Exception: " + e);
376 System.exit(0);
377 }
378 return sb.toString();
379 }
380 /** generateHash(String) **/
381}
382
Note: See TracBrowser for help on using the repository browser.