source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/VideoDocument.java@ 26186

Last change on this file since 26186 was 26186, checked in by jmt12, 12 years ago

Adding in (optional) support for video and image processing in DSpace and Terrier. These kinda belong here as they depend on the video-and-audio support (like MediaInfo, HandbrakeCLI, and Hive2) to work

File size: 15.3 KB
Line 
1/**
2 * Adding support for Videos in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.BufferedOutputStream;
20import java.io.BufferedReader;
21import java.io.File;
22import java.io.FileOutputStream;
23import java.io.InputStream;
24import java.io.InputStreamReader;
25import java.io.IOException;
26import java.io.PrintWriter;
27import java.io.StringReader;
28import java.io.Reader;
29import java.lang.Thread;
30import java.nio.file.Files;
31import java.nio.file.FileVisitResult;
32import static java.nio.file.FileVisitResult.*;
33import java.nio.file.Path;
34import java.nio.file.Paths;
35import java.nio.file.SimpleFileVisitor;
36import java.nio.file.attribute.BasicFileAttributes;
37import java.util.Collections;
38import java.util.Arrays;
39import java.util.Map;
40import java.util.Set;
41import java.util.regex.Matcher;
42import java.util.regex.Pattern;
43
44import org.apache.log4j.Logger;
45import org.terrier.indexing.tokenisation.TokenStream;
46import org.terrier.indexing.tokenisation.Tokeniser;
47import org.terrier.utility.ApplicationSetup;
48
49public class VideoDocument
50 implements Document
51{
52 /** A reference to the logger for messaging */
53 protected static final Logger logger = Logger.getLogger(FileDocument.class);
54 /** The map of properties (fields) for this document. */
55 protected Map<String,String> properties;
56 /** A reader built from a dummy text string. */
57 protected Reader reader;
58 /** A token stream produced by the configured tokeniser when feed the dummy
59 * reader.
60 */
61 protected TokenStream tokenizer;
62
63 // Handbrake Configuration
64 protected String streaming_hq_size = ApplicationSetup.getProperty("VideoDocument.streaming_hq_size", "720");
65 protected String streaming_hq_video_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_video_bitrate", "496");
66 protected String streaming_hq_audio_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_audio_bitrate", "80");
67
68 /** The preview size (width). **/
69 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
70
71
72 /** Default constructor. **/
73 protected VideoDocument() {}
74
75 /** Constructs an instance of the ImageDocument from the given input stream.
76 * @param docStream the input stream that reads the file.
77 * @param docProperties the initial properties (docno, filename)
78 * @param tok the tokeniser defined for this collection
79 */
80 public VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
81 {
82 logger.info("VideoDocument::VideoDocument()");
83 // 0. Initialization from arguments
84 this.properties = default_properties;
85
86 // Set properties
87 logger.info("VideoDocument - extracting properties");
88 // A. Hardcoded properties
89 this.properties.put("parser", "VideoDocument");
90 this.properties.put("abstract", "This is a video so here is some dummy text to prevent indexer failing.");
91 // B. Properties derived from filename
92 String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
93 this.properties.put("title", title);
94 String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
95 String target_filename = "doc." + ext;
96 this.properties.put("source","doc." + ext);
97 String assoc_filename = "D" + properties.get("docno");
98 this.properties.put("assocfile", assoc_filename);
99
100 // Copy (symlink) the file into place in the shared directory
101 Path raw_video_path = Paths.get(properties.get("filename"));
102 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
103 // - if the assoc path already exists, we need to recursively delete it and
104 // its contents
105 if (Files.exists(assoc_path))
106 {
107 logger.info("VideoDocument - removing existing (old) associated files");
108 try
109 {
110 Files.walkFileTree(assoc_path, new SimpleFileVisitor<Path>()
111 {
112 @Override
113 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
114 throws IOException
115 {
116 ///ystem.out.println("Deleting file: " + file);
117 Files.delete(file);
118 return CONTINUE;
119 }
120 @Override
121 public FileVisitResult postVisitDirectory(Path dir, IOException exc)
122 throws IOException
123 {
124 ///ystem.out.println("Deleting dir: " + dir);
125 if (exc == null)
126 {
127 Files.delete(dir);
128 return CONTINUE;
129 }
130 else
131 {
132 throw exc;
133 }
134 }
135 });
136 }
137 catch (Exception e)
138 {
139 logger.error("Exception while recursively deleting assoc folder:", e);
140 }
141 }
142 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
143 Path target_path = assoc_path.resolve(target_filename);
144 logger.info("VideoDocument - symlinking original video into assoc directory");
145 try
146 {
147 Files.createSymbolicLink(target_path, raw_video_path);
148 }
149 // not supported? We'll try copying below
150 catch (UnsupportedOperationException ex)
151 {
152 }
153 // All other exceptions can be fatal
154 catch (Exception e)
155 {
156 logger.error("Exception while symlinking video: ", e);
157 }
158 // - copy if the file doesn't exist yet
159 if (!target_path.toFile().exists())
160 {
161 logger.info("VideoDocument - symlink failed, copying instead");
162 try
163 {
164 Files.copy(raw_video_path, target_path);
165 }
166 // Fatality!
167 catch (Exception e)
168 {
169 logger.error("Exception while copying video: ", e);
170 }
171 }
172
173 // 1. Extract Metadata using MediaInfo and store as properties
174 logger.info("VideoDocument - extracting video metadata");
175 try
176 {
177 String metadata_command[] = {
178 "mediainfo",
179 "--Output=XML",
180 raw_video_path.toString()
181 };
182 logger.info("VideoDocument - metadata command: " + Arrays.toString(metadata_command));
183 Process metadata_process = Runtime.getRuntime().exec(metadata_command);
184 StreamGobbler metadata_process_error_gobbler = new StreamGobbler(metadata_process.getErrorStream());
185 metadata_process_error_gobbler.start();
186 BufferedReader metadata_br = new BufferedReader(new InputStreamReader(metadata_process.getInputStream()));
187 String line;
188 String type = "Unknown";
189 Pattern type_pattern = Pattern.compile("<track type=\"([a-zA-Z]+)\">");
190 Pattern metadata_pattern = Pattern.compile("<([a-zA-Z_]+)>(.*)</\\1>");
191 while ((line = metadata_br.readLine()) != null)
192 {
193 Matcher type_matcher = type_pattern.matcher(line);
194 if (type_matcher.matches())
195 {
196 type = type_matcher.group(1);
197 }
198 else
199 {
200 Matcher metadata_matcher = metadata_pattern.matcher(line);
201 if (metadata_matcher.matches())
202 {
203 String field = type.toLowerCase() + ":" + metadata_matcher.group(1).toLowerCase();
204 String value = metadata_matcher.group(2);
205 logger.info("VideoDocument - adding metadata: " + field + " => " + value);
206 this.properties.put(field, value);
207 }
208 }
209 }
210 int metadata_status = metadata_process.waitFor();
211 }
212 catch (Exception e)
213 {
214 logger.error("Exception while extracting video metadata:", e);
215 }
216
217 // 2. Convert Video to streamable format using HandbrakeCLI
218 logger.info("VideoDocument - convert video to streamable format");
219 Path converted_video_path = assoc_path.resolve("tsv.mp4");
220 try
221 {
222 String convert_command[] = {
223 "HandBrakeCLI",
224 "-i", raw_video_path.toString(),
225 "-t", "1",
226 "-c", "1",
227 "-o", converted_video_path.toString(),
228 "-f", "mp4",
229 "-O",
230 "-w", this.streaming_hq_size,
231 "--loose-anamorphic",
232 "-e", "x264",
233 "-b", this.streaming_hq_video_bitrate,
234 "-a", "1",
235 "-E", "faac",
236 "-6", "dpl2",
237 "-R", "Auto",
238 "-B", this.streaming_hq_audio_bitrate,
239 "-D", "0.0",
240 "-x", "ref=2:bframes=2:subq=6:mixed-refs=0:weightb=0:8x8dct=0:trellis=0:threads=1"
241 };
242 logger.info("VideoDocument - convert command: " + Arrays.toString(convert_command));
243 // @todo determine the best way to account for configuration options
244 Process convert_process = Runtime.getRuntime().exec(convert_command);
245 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
246 convert_process_error_gobbler.start();
247 StreamGobbler convert_process_out_gobbler = new StreamGobbler(convert_process.getInputStream());
248 convert_process_out_gobbler.start();
249 int convert_status = convert_process.waitFor();
250 if (convert_status != 0 || !Files.exists(converted_video_path))
251 {
252 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
253 }
254 }
255 catch (Exception e)
256 {
257 logger.error("Exception while converting video to streamable format: ", e);
258 }
259
260 // 3. Generate keyframes from streamable video and attach the shot names
261 // as a property
262 logger.info("VideoDocument - extracting keyframes from video");
263 try
264 {
265 Path shots_path = assoc_path.resolve("shots.xml");
266 String keyframe_command[] = {
267 "hive2_ffmpegsvn",
268 "-o", shots_path.toString(),
269 "-k", assoc_path.toString(),
270 "-m", "0.5",
271 "-l", "0.05",
272 converted_video_path.toString()
273 };
274 logger.info("VideoDocument - keyframe command: " + Arrays.toString(keyframe_command));
275 Process keyframe_process = Runtime.getRuntime().exec(keyframe_command);
276 //Path keyframe_err_file = temp_dir_path.resolve("hive2-err.txt");
277 StreamGobbler keyframe_error_gobbler = new StreamGobbler(keyframe_process.getErrorStream());//, keyframe_err_file.toString());
278 keyframe_error_gobbler.start();
279 //Path keyframe_out_file = temp_dir_path.resolve("hive2-out.txt");
280 StreamGobbler keyframe_out_gobbler = new StreamGobbler(keyframe_process.getInputStream()); //, keyframe_out_file.toString());
281 keyframe_out_gobbler.start();
282 int keyframe_status = keyframe_process.waitFor();
283 if (keyframe_status != 0 || !Files.exists(shots_path))
284 {
285 throw new Exception("Keyframe command failed (exit status: " + keyframe_status + ")");
286 }
287 File files[] = assoc_path.toFile().listFiles();
288 Arrays.sort(files);
289 Pattern image_filename_pattern = Pattern.compile("tsv.*\\.jpg");
290 String keyframe_images = "";
291 for (int i = 0; i < files.length; i++)
292 {
293 String image_filename = files[i].toPath().getFileName().toString();
294 logger.info("VideoDocument - considering keyframe image: " + image_filename);
295 Matcher image_filename_matcher = image_filename_pattern.matcher(image_filename);
296 if (image_filename_matcher.matches())
297 {
298 if (keyframe_images.equals(""))
299 {
300 keyframe_images = image_filename;
301 }
302 else
303 {
304 keyframe_images += "," + image_filename;
305 }
306 }
307 }
308 this.properties.put("preview", keyframe_images);
309 }
310 catch (Exception e)
311 {
312 logger.error("Exception while extracting keyframes from video: ", e);
313 }
314
315 // 4. Create a dummy reader around some dummy text and then tokenize it
316 logger.info("VideoDocument - feed dummy text as token stream to indexer");
317 try
318 {
319 this.reader = new StringReader(this.properties.get("abstract"));
320 this.tokenizer = tok.tokenise(this.reader);
321 }
322 catch (Exception e)
323 {
324 logger.error("Exception while creating dummy text stream: ", e);
325 }
326 logger.info("VideoDocument - Complete!");
327 }
328 /** VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
329
330 /** Returns true when the end of the document has been reached, and there
331 * are no other terms to be retrieved from it.
332 * @return boolean true if there are no more terms in the document, otherwise
333 * it returns false.
334 */
335 public boolean endOfDocument()
336 {
337 return !this.tokenizer.hasNext();
338 }
339 /** endOfDocument() **/
340
341 /** Returns the underlying map of all the properties defined by this Document.
342 * @since 1.1.0
343 */
344 public Map<String,String> getAllProperties()
345 {
346 return this.properties;
347 }
348 /** getAllProperties() **/
349
350 /** Returns a list of the fields the current term appears in.
351 * @return HashSet a set of the terms that the current term appears in.
352 */
353 public Set<String> getFields()
354 {
355 // Returns null because there is no support for fields with file documents.
356 return Collections.emptySet();
357 }
358 /** getFields() **/
359
360 /** Gets the next term of the document.
361 * <B>NB:</B>Null string returned from getNextTerm() should
362 * be ignored. They do not signify the lack of any more terms.
363 * endOfDocument() should be used to check that.
364 * @return String the next term of the document. Null returns should be
365 * ignored.
366 */
367 public String getNextTerm()
368 {
369 return this.tokenizer.next();
370 }
371 /** getNextTerm() **/
372
373 /** Allows access to a named property of the Document. Examples might be URL,
374 * filename etc.
375 * @param name Name of the property. It is suggested, but not required that
376 * this name should not be case insensitive.
377 * @since 1.1.0
378 */
379 public String getProperty(String name)
380 {
381 return this.properties.get(name.toLowerCase());
382 }
383 /** getProperty(String name) **/
384
385 /** Returns a Reader object so client code can tokenise the document
386 * or deal with the document itself. Examples might be extracting URLs,
387 * language detection. */
388 public Reader getReader()
389 {
390 return this.reader;
391 }
392 /** getReader() **/
393}
394
/**
 * A thread that drains an input stream to its end, either discarding the
 * content or writing it line by line to a file. It is used on the output
 * streams of external processes.
 */
class StreamGobbler
extends Thread
{
  /** The stream to drain. It is closed once run() has finished with it. */
  InputStream is;
  /** The file the drained lines are written to (only set when writing). */
  String file_path;
  /** True when the drained lines are written to file_path, false when they
   *  are discarded.
   */
  boolean output_to_file;

  /** Creates a gobbler that discards everything read from the stream.
   *  @param is the stream to drain
   */
  StreamGobbler(InputStream is)
  {
    this.is = is;
    this.output_to_file = false;
  }

  /** Creates a gobbler that writes every line read from the stream to a file.
   *  @param is the stream to drain
   *  @param file_path the path of the file to (over)write
   */
  StreamGobbler(InputStream is, String file_path)
  {
    this.is = is;
    this.file_path = file_path;
    this.output_to_file = true;
  }

  /** Drains the stream until end of file. Both the reader and the writer are
   *  managed by try-with-resources, so the input stream is closed, and
   *  whatever was written so far reaches the file, even if a read fails part
   *  way through. An I/O failure is reported on standard error.
   */
  public void run()
  {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(is)))
    {
      if (output_to_file)
      {
        try (PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(file_path))))
        {
          String line;
          while ((line = br.readLine()) != null)
          {
            pw.println(line);
          }
        }
      }
      else
      {
        while (br.readLine() != null)
        {
          // Do nothing - equivalent to > /dev/null
        }
      }
    }
    catch (IOException ioe)
    {
      ioe.printStackTrace();
    }
  }
}
446
Note: See TracBrowser for help on using the repository browser.