source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/VideoDocument.java@ 26190

Last change on this file since 26190 was 26190, checked in by jmt12, 12 years ago

Moving the StreamGobbler - used in both plugins to prevent a full STDERR buffer killing the import - into its own class... my computer doesn't have an issue with exactly the same class occurring twice, but Medusa's one seems stricter in this regard

File size: 14.2 KB
Line 
1/**
2 * Adding support for Videos in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.BufferedReader;
20import java.io.File;
21import java.io.InputStream;
22import java.io.InputStreamReader;
23import java.io.IOException;
24import java.io.StringReader;
25import java.io.Reader;
26import java.lang.Thread;
27import java.nio.file.Files;
28import java.nio.file.FileVisitResult;
29import static java.nio.file.FileVisitResult.*;
30import java.nio.file.Path;
31import java.nio.file.Paths;
32import java.nio.file.SimpleFileVisitor;
33import java.nio.file.attribute.BasicFileAttributes;
34import java.util.Collections;
35import java.util.Arrays;
36import java.util.Map;
37import java.util.Set;
38import java.util.regex.Matcher;
39import java.util.regex.Pattern;
40
41import org.apache.log4j.Logger;
42import org.terrier.indexing.StreamGobbler;
43import org.terrier.indexing.tokenisation.TokenStream;
44import org.terrier.indexing.tokenisation.Tokeniser;
45import org.terrier.utility.ApplicationSetup;
46
47public class VideoDocument
48 implements Document
49{
50 /** A reference to the logger for messaging */
51 protected static final Logger logger = Logger.getLogger(FileDocument.class);
52 /** The map of properties (fields) for this document. */
53 protected Map<String,String> properties;
54 /** A reader built from a dummy text string. */
55 protected Reader reader;
56 /** A token stream produced by the configured tokeniser when feed the dummy
57 * reader.
58 */
59 protected TokenStream tokenizer;
60
61 // Handbrake Configuration
62 protected String streaming_hq_size = ApplicationSetup.getProperty("VideoDocument.streaming_hq_size", "720");
63 protected String streaming_hq_video_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_video_bitrate", "496");
64 protected String streaming_hq_audio_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_audio_bitrate", "80");
65
66 /** The preview size (width). **/
67 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
68
69
70 /** Default constructor. **/
71 protected VideoDocument() {}
72
73 /** Constructs an instance of the ImageDocument from the given input stream.
74 * @param docStream the input stream that reads the file.
75 * @param docProperties the initial properties (docno, filename)
76 * @param tok the tokeniser defined for this collection
77 */
78 public VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
79 {
80 logger.info("VideoDocument::VideoDocument()");
81 // 0. Initialization from arguments
82 this.properties = default_properties;
83
84 // Set properties
85 logger.info("VideoDocument - extracting properties");
86 // A. Hardcoded properties
87 this.properties.put("parser", "VideoDocument");
88 this.properties.put("abstract", "This is a video so here is some dummy text to prevent indexer failing.");
89 // B. Properties derived from filename
90 String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
91 this.properties.put("title", title);
92 String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
93 String target_filename = "doc." + ext;
94 this.properties.put("source","doc." + ext);
95 String assoc_filename = "D" + properties.get("docno");
96 this.properties.put("assocfile", assoc_filename);
97
98 // Copy (symlink) the file into place in the shared directory
99 Path raw_video_path = Paths.get(properties.get("filename"));
100 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
101 // - if the assoc path already exists, we need to recursively delete it and
102 // its contents
103 if (Files.exists(assoc_path))
104 {
105 logger.info("VideoDocument - removing existing (old) associated files");
106 try
107 {
108 Files.walkFileTree(assoc_path, new SimpleFileVisitor<Path>()
109 {
110 @Override
111 public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
112 throws IOException
113 {
114 ///ystem.out.println("Deleting file: " + file);
115 Files.delete(file);
116 return CONTINUE;
117 }
118 @Override
119 public FileVisitResult postVisitDirectory(Path dir, IOException exc)
120 throws IOException
121 {
122 ///ystem.out.println("Deleting dir: " + dir);
123 if (exc == null)
124 {
125 Files.delete(dir);
126 return CONTINUE;
127 }
128 else
129 {
130 throw exc;
131 }
132 }
133 });
134 }
135 catch (Exception e)
136 {
137 logger.error("Exception while recursively deleting assoc folder:", e);
138 }
139 }
140 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
141 Path target_path = assoc_path.resolve(target_filename);
142 logger.info("VideoDocument - symlinking original video into assoc directory");
143 try
144 {
145 Files.createSymbolicLink(target_path, raw_video_path);
146 }
147 // not supported? We'll try copying below
148 catch (UnsupportedOperationException ex)
149 {
150 }
151 // All other exceptions can be fatal
152 catch (Exception e)
153 {
154 logger.error("Exception while symlinking video: ", e);
155 }
156 // - copy if the file doesn't exist yet
157 if (!target_path.toFile().exists())
158 {
159 logger.info("VideoDocument - symlink failed, copying instead");
160 try
161 {
162 Files.copy(raw_video_path, target_path);
163 }
164 // Fatality!
165 catch (Exception e)
166 {
167 logger.error("Exception while copying video: ", e);
168 }
169 }
170
171 // 1. Extract Metadata using MediaInfo and store as properties
172 logger.info("VideoDocument - extracting video metadata");
173 try
174 {
175 String metadata_command[] = {
176 "mediainfo",
177 "--Output=XML",
178 raw_video_path.toString()
179 };
180 logger.info("VideoDocument - metadata command: " + Arrays.toString(metadata_command));
181 Process metadata_process = Runtime.getRuntime().exec(metadata_command);
182 StreamGobbler metadata_process_error_gobbler = new StreamGobbler(metadata_process.getErrorStream());
183 metadata_process_error_gobbler.start();
184 BufferedReader metadata_br = new BufferedReader(new InputStreamReader(metadata_process.getInputStream()));
185 String line;
186 String type = "Unknown";
187 Pattern type_pattern = Pattern.compile("<track type=\"([a-zA-Z]+)\">");
188 Pattern metadata_pattern = Pattern.compile("<([a-zA-Z_]+)>(.*)</\\1>");
189 while ((line = metadata_br.readLine()) != null)
190 {
191 Matcher type_matcher = type_pattern.matcher(line);
192 if (type_matcher.matches())
193 {
194 type = type_matcher.group(1);
195 }
196 else
197 {
198 Matcher metadata_matcher = metadata_pattern.matcher(line);
199 if (metadata_matcher.matches())
200 {
201 String field = type.toLowerCase() + ":" + metadata_matcher.group(1).toLowerCase();
202 String value = metadata_matcher.group(2);
203 logger.info("VideoDocument - adding metadata: " + field + " => " + value);
204 this.properties.put(field, value);
205 }
206 }
207 }
208 int metadata_status = metadata_process.waitFor();
209 }
210 catch (Exception e)
211 {
212 logger.error("Exception while extracting video metadata:", e);
213 }
214
215 // 2. Convert Video to streamable format using HandbrakeCLI
216 logger.info("VideoDocument - convert video to streamable format");
217 Path converted_video_path = assoc_path.resolve("tsv.mp4");
218 try
219 {
220 String convert_command[] = {
221 "HandBrakeCLI",
222 "-i", raw_video_path.toString(),
223 "-t", "1",
224 "-c", "1",
225 "-o", converted_video_path.toString(),
226 "-f", "mp4",
227 "-O",
228 "-w", this.streaming_hq_size,
229 "--loose-anamorphic",
230 "-e", "x264",
231 "-b", this.streaming_hq_video_bitrate,
232 "-a", "1",
233 "-E", "faac",
234 "-6", "dpl2",
235 "-R", "Auto",
236 "-B", this.streaming_hq_audio_bitrate,
237 "-D", "0.0",
238 "-x", "ref=2:bframes=2:subq=6:mixed-refs=0:weightb=0:8x8dct=0:trellis=0:threads=1"
239 };
240 logger.info("VideoDocument - convert command: " + Arrays.toString(convert_command));
241 // @todo determine the best way to account for configuration options
242 Process convert_process = Runtime.getRuntime().exec(convert_command);
243 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
244 convert_process_error_gobbler.start();
245 StreamGobbler convert_process_out_gobbler = new StreamGobbler(convert_process.getInputStream());
246 convert_process_out_gobbler.start();
247 int convert_status = convert_process.waitFor();
248 if (convert_status != 0 || !Files.exists(converted_video_path))
249 {
250 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
251 }
252 }
253 catch (Exception e)
254 {
255 logger.error("Exception while converting video to streamable format: ", e);
256 }
257
258 // 3. Generate keyframes from streamable video and attach the shot names
259 // as a property
260 logger.info("VideoDocument - extracting keyframes from video");
261 try
262 {
263 Path shots_path = assoc_path.resolve("shots.xml");
264 String keyframe_command[] = {
265 "hive2_ffmpegsvn",
266 "-o", shots_path.toString(),
267 "-k", assoc_path.toString(),
268 "-m", "0.5",
269 "-l", "0.05",
270 converted_video_path.toString()
271 };
272 logger.info("VideoDocument - keyframe command: " + Arrays.toString(keyframe_command));
273 Process keyframe_process = Runtime.getRuntime().exec(keyframe_command);
274 //Path keyframe_err_file = temp_dir_path.resolve("hive2-err.txt");
275 StreamGobbler keyframe_error_gobbler = new StreamGobbler(keyframe_process.getErrorStream());//, keyframe_err_file.toString());
276 keyframe_error_gobbler.start();
277 //Path keyframe_out_file = temp_dir_path.resolve("hive2-out.txt");
278 StreamGobbler keyframe_out_gobbler = new StreamGobbler(keyframe_process.getInputStream()); //, keyframe_out_file.toString());
279 keyframe_out_gobbler.start();
280 int keyframe_status = keyframe_process.waitFor();
281 if (keyframe_status != 0 || !Files.exists(shots_path))
282 {
283 throw new Exception("Keyframe command failed (exit status: " + keyframe_status + ")");
284 }
285 File files[] = assoc_path.toFile().listFiles();
286 Arrays.sort(files);
287 Pattern image_filename_pattern = Pattern.compile("tsv.*\\.jpg");
288 String keyframe_images = "";
289 for (int i = 0; i < files.length; i++)
290 {
291 String image_filename = files[i].toPath().getFileName().toString();
292 logger.info("VideoDocument - considering keyframe image: " + image_filename);
293 Matcher image_filename_matcher = image_filename_pattern.matcher(image_filename);
294 if (image_filename_matcher.matches())
295 {
296 if (keyframe_images.equals(""))
297 {
298 keyframe_images = image_filename;
299 }
300 else
301 {
302 keyframe_images += "," + image_filename;
303 }
304 }
305 }
306 this.properties.put("preview", keyframe_images);
307 }
308 catch (Exception e)
309 {
310 logger.error("Exception while extracting keyframes from video: ", e);
311 }
312
313 // 4. Create a dummy reader around some dummy text and then tokenize it
314 logger.info("VideoDocument - feed dummy text as token stream to indexer");
315 try
316 {
317 this.reader = new StringReader(this.properties.get("abstract"));
318 this.tokenizer = tok.tokenise(this.reader);
319 }
320 catch (Exception e)
321 {
322 logger.error("Exception while creating dummy text stream: ", e);
323 }
324 logger.info("VideoDocument - Complete!");
325 }
326 /** VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
327
328 /** Returns true when the end of the document has been reached, and there
329 * are no other terms to be retrieved from it.
330 * @return boolean true if there are no more terms in the document, otherwise
331 * it returns false.
332 */
333 public boolean endOfDocument()
334 {
335 return !this.tokenizer.hasNext();
336 }
337 /** endOfDocument() **/
338
339 /** Returns the underlying map of all the properties defined by this Document.
340 * @since 1.1.0
341 */
342 public Map<String,String> getAllProperties()
343 {
344 return this.properties;
345 }
346 /** getAllProperties() **/
347
348 /** Returns a list of the fields the current term appears in.
349 * @return HashSet a set of the terms that the current term appears in.
350 */
351 public Set<String> getFields()
352 {
353 // Returns null because there is no support for fields with file documents.
354 return Collections.emptySet();
355 }
356 /** getFields() **/
357
358 /** Gets the next term of the document.
359 * <B>NB:</B>Null string returned from getNextTerm() should
360 * be ignored. They do not signify the lack of any more terms.
361 * endOfDocument() should be used to check that.
362 * @return String the next term of the document. Null returns should be
363 * ignored.
364 */
365 public String getNextTerm()
366 {
367 return this.tokenizer.next();
368 }
369 /** getNextTerm() **/
370
371 /** Allows access to a named property of the Document. Examples might be URL,
372 * filename etc.
373 * @param name Name of the property. It is suggested, but not required that
374 * this name should not be case insensitive.
375 * @since 1.1.0
376 */
377 public String getProperty(String name)
378 {
379 return this.properties.get(name.toLowerCase());
380 }
381 /** getProperty(String name) **/
382
383 /** Returns a Reader object so client code can tokenise the document
384 * or deal with the document itself. Examples might be extracting URLs,
385 * language detection. */
386 public Reader getReader()
387 {
388 return this.reader;
389 }
390 /** getReader() **/
391}
Note: See TracBrowser for help on using the repository browser.