source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/VideoDocument.java@ 26241

Last change on this file since 26241 was 26241, checked in by jmt12, 12 years ago

Modifications to progress messages to improve extracting information from the logs in an automated fashion

File size: 16.2 KB
Line 
1/**
2 * Adding support for Videos in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.BufferedReader;
20import java.io.File;
21import java.io.InputStream;
22import java.io.InputStreamReader;
23import java.io.IOException;
24import java.io.StringReader;
25import java.io.Reader;
26import java.nio.charset.Charset;
27import java.nio.file.Files;
28import java.nio.file.Path;
29import java.nio.file.Paths;
30import java.nio.file.SimpleFileVisitor;
31import java.nio.file.attribute.BasicFileAttributes;
32import java.security.MessageDigest;
33import java.security.NoSuchAlgorithmException;
34import java.util.Collections;
35import java.util.Arrays;
36import java.util.Map;
37import java.util.Set;
38import java.util.regex.Matcher;
39import java.util.regex.Pattern;
40
41import org.apache.log4j.Logger;
42import org.terrier.indexing.StreamGobbler;
43import org.terrier.indexing.tokenisation.TokenStream;
44import org.terrier.indexing.tokenisation.Tokeniser;
45import org.terrier.utility.ApplicationSetup;
46
47public class VideoDocument
48 implements Document
49{
50 /** A reference to the logger for messaging */
51 protected static final Logger logger = Logger.getLogger(FileDocument.class);
52 /** The map of properties (fields) for this document. */
53 protected Map<String,String> properties;
54 /** A reader built from a dummy text string. */
55 protected Reader reader;
56 /** A token stream produced by the configured tokeniser when feed the dummy
57 * reader.
58 */
59 protected TokenStream tokenizer;
60
61 // Handbrake Configuration
62 protected String streaming_hq_size = ApplicationSetup.getProperty("VideoDocument.streaming_hq_size", "720");
63 protected String streaming_hq_video_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_video_bitrate", "496");
64 protected String streaming_hq_audio_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_audio_bitrate", "80");
65
66 /** The preview size (width). **/
67 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
68
69 protected int max_keyframe_images_length = 1024;
70
71 /** Default constructor. **/
72 protected VideoDocument() {}
73
74 /** Constructs an instance of the ImageDocument from the given input stream.
75 * @param docStream the input stream that reads the file.
76 * @param docProperties the initial properties (docno, filename)
77 * @param tok the tokeniser defined for this collection
78 */
79 public VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
80 {
81 logger.info("VideoDocument::VideoDocument()");
82 // 0. Initialization from arguments
83 this.properties = default_properties;
84 String filepath = this.properties.get("filename");
85 System.out.println("[F:" + this.epochTime() + "] Starting ingest of " + filepath);
86
87 // Set properties
88 logger.info("VideoDocument - extracting properties");
89 // A. Hardcoded properties
90 this.properties.put("parser", "VideoDocument");
91 this.properties.put("abstract", "This is a video so here is some dummy text to prevent indexer failing.");
92 // B. Properties derived from filename
93 String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
94 this.properties.put("title", title);
95 String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
96 String target_filename = "doc." + ext;
97 this.properties.put("source","doc." + ext);
98
99 // - A unique associated directory. This gets a little tricky as we need
100 // to create the directory at the same time if an effort to promote
101 // synchronous behaviour
102 System.out.println("[F:" + this.epochTime() + ":PR]");
103 String unique_id = this.generateHash(filepath);
104 // - we start with the first 4 characters
105 int offset = 0;
106 String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
107 // - we add ".dir" as a suffix to the directory that actually contains
108 // files (so the non-suffixed version contains nested directories)
109 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
110 // - then we continue adding blocks of 4 characters until we get a
111 // directory that doesn't already exist
112 System.out.println("[F:" + this.epochTime() + ":IO]");
113 while (assoc_path.toFile().exists() && offset < unique_id.length())
114 {
115 offset += 4;
116 assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
117 assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
118 }
119 // - still not unique? but run out of unique_id... time to complain
120 if (assoc_path.toFile().exists())
121 {
122 logger.error("ImageDoument - can't determine unique assocfilepath");
123 System.exit(0);
124 }
125 // - create the directories quick... hopefully before someone else does
126 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
127 this.properties.put("assocfile", assoc_filename);
128
129 // Copy (symlink) the file into place in the shared directory
130 Path raw_video_path = Paths.get(properties.get("filename"));
131 Path target_path = assoc_path.resolve(target_filename);
132 logger.info("VideoDocument - symlinking original video into assoc directory [IO]");
133 logger.info("[DEBUG] Raw Video Path: " + raw_video_path);
134 logger.info("[DEBUG] Target Path: " + target_path);
135 try
136 {
137 Files.createSymbolicLink(target_path, raw_video_path);
138 }
139 // not supported? We'll try copying below
140 catch (UnsupportedOperationException ex)
141 {
142 logger.warn("Symlinking not supported");
143 }
144 // All other exceptions can be fatal
145 catch (Exception e)
146 {
147 logger.error("Exception while symlinking video: ", e);
148 }
149 // - copy if the file doesn't exist yet
150 if (Files.notExists(target_path))
151 {
152 logger.info("VideoDocument - symlink failed, copying instead [IO]");
153 try
154 {
155 Files.copy(raw_video_path, target_path);
156 }
157 // Fatality!
158 catch (Exception e)
159 {
160 logger.error("Exception while copying video: ", e);
161 }
162 }
163
164 // 1. Extract Metadata using MediaInfo and store as properties
165 System.out.println("[F:" + this.epochTime() + ":PR]");
166 logger.info("VideoDocument - extracting video metadata [PR]");
167 try
168 {
169 String metadata_command[] = {
170 "mediainfo",
171 "--Output=XML",
172 raw_video_path.toString()
173 };
174 logger.info("VideoDocument - metadata command: " + Arrays.toString(metadata_command));
175 Process metadata_process = Runtime.getRuntime().exec(metadata_command);
176 StreamGobbler metadata_process_error_gobbler = new StreamGobbler(metadata_process.getErrorStream());
177 metadata_process_error_gobbler.start();
178 BufferedReader metadata_br = new BufferedReader(new InputStreamReader(metadata_process.getInputStream()));
179 String line;
180 String type = "Unknown";
181 Pattern type_pattern = Pattern.compile("<track type=\"([a-zA-Z]+)\">");
182 Pattern metadata_pattern = Pattern.compile("<([a-zA-Z_]+)>(.*)</\\1>");
183 while ((line = metadata_br.readLine()) != null)
184 {
185 Matcher type_matcher = type_pattern.matcher(line);
186 if (type_matcher.matches())
187 {
188 type = type_matcher.group(1);
189 }
190 else
191 {
192 Matcher metadata_matcher = metadata_pattern.matcher(line);
193 if (metadata_matcher.matches())
194 {
195 String field = type.toLowerCase() + ":" + metadata_matcher.group(1).toLowerCase();
196 String value = metadata_matcher.group(2);
197 logger.info("VideoDocument - adding metadata: " + field + " => " + value);
198 this.properties.put(field, value);
199 }
200 }
201 }
202 int metadata_status = metadata_process.waitFor();
203 }
204 catch (Exception e)
205 {
206 logger.error("Exception while extracting video metadata:", e);
207 }
208
209 // 2. Convert Video to streamable format using HandbrakeCLI
210 logger.info("VideoDocument - convert video to streamable format [PR]");
211 Path converted_video_path = assoc_path.resolve("tsv.mp4");
212 try
213 {
214 String convert_command[] = {
215 "HandBrakeCLI",
216 "-i", raw_video_path.toString(),
217 "-t", "1",
218 "-c", "1",
219 "-o", converted_video_path.toString(),
220 "-f", "mp4",
221 "-O",
222 "-w", this.streaming_hq_size,
223 "--loose-anamorphic",
224 "-e", "x264",
225 "-b", this.streaming_hq_video_bitrate,
226 "-a", "1",
227 "-E", "faac",
228 "-6", "dpl2",
229 "-R", "Auto",
230 "-B", this.streaming_hq_audio_bitrate,
231 "-D", "0.0",
232 "-x", "ref=2:bframes=2:subq=6:mixed-refs=0:weightb=0:8x8dct=0:trellis=0:threads=1"
233 };
234 logger.info("VideoDocument - convert command: " + Arrays.toString(convert_command));
235 // @todo determine the best way to account for configuration options
236 Process convert_process = Runtime.getRuntime().exec(convert_command);
237 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
238 convert_process_error_gobbler.start();
239 StreamGobbler convert_process_out_gobbler = new StreamGobbler(convert_process.getInputStream());
240 convert_process_out_gobbler.start();
241 int convert_status = convert_process.waitFor();
242 if (convert_status != 0 || !Files.exists(converted_video_path))
243 {
244 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
245 }
246 }
247 catch (Exception e)
248 {
249 logger.error("Exception while converting video to streamable format: ", e);
250 }
251
252 // 3. Generate keyframes from streamable video and attach the shot names
253 // as a property
254 logger.info("VideoDocument - extracting keyframes from video [PR]");
255 try
256 {
257 Path shots_path = assoc_path.resolve("shots.xml");
258 String keyframe_command[] = {
259 "hive2_ffmpegsvn",
260 "-o", shots_path.toString(),
261 "-k", assoc_path.toString(),
262 "-m", "0.5",
263 "-l", "0.05",
264 converted_video_path.toString()
265 };
266 logger.info("VideoDocument - keyframe command: " + Arrays.toString(keyframe_command));
267 Process keyframe_process = Runtime.getRuntime().exec(keyframe_command);
268 //Path keyframe_err_file = temp_dir_path.resolve("hive2-err.txt");
269 StreamGobbler keyframe_error_gobbler = new StreamGobbler(keyframe_process.getErrorStream());//, keyframe_err_file.toString());
270 keyframe_error_gobbler.start();
271 //Path keyframe_out_file = temp_dir_path.resolve("hive2-out.txt");
272 StreamGobbler keyframe_out_gobbler = new StreamGobbler(keyframe_process.getInputStream()); //, keyframe_out_file.toString());
273 keyframe_out_gobbler.start();
274 int keyframe_status = keyframe_process.waitFor();
275 if (keyframe_status != 0 || !Files.exists(shots_path))
276 {
277 throw new Exception("Keyframe command failed (exit status: " + keyframe_status + ")");
278 }
279
280 System.out.println("[F:" + this.epochTime() + ":IO]");
281 logger.info("VideoDocument - associating keyframes to video [IO]");
282 File files[] = assoc_path.toFile().listFiles();
283 Arrays.sort(files);
284 Pattern image_filename_pattern = Pattern.compile("tsv.*\\.jpg");
285 String keyframe_images = "";
286 for (int i = 0; i < files.length; i++)
287 {
288 String image_filename = files[i].toPath().getFileName().toString();
289 logger.info("VideoDocument - considering keyframe image: " + image_filename);
290 Matcher image_filename_matcher = image_filename_pattern.matcher(image_filename);
291 if (image_filename_matcher.matches())
292 {
293 if (keyframe_images.equals(""))
294 {
295 keyframe_images = image_filename;
296 }
297 else
298 {
299 // Consider the maximum size of the preview images field
300 if ((keyframe_images.length() + image_filename.length() + 1) < this.max_keyframe_images_length)
301 {
302 keyframe_images += "," + image_filename;
303 }
304 // Break out of loop
305 else
306 {
307 i = files.length;
308 }
309 }
310 }
311 }
312 this.properties.put("preview", keyframe_images);
313 }
314 catch (Exception e)
315 {
316 logger.error("Exception while extracting keyframes from video: ", e);
317 }
318
319 // 4. Create a dummy reader around some dummy text and then tokenize it
320 System.out.println("[F:" + this.epochTime() + ":PR]");
321 logger.info("VideoDocument - feed dummy text as token stream to indexer [PR]");
322 try
323 {
324 this.reader = new StringReader(this.properties.get("abstract"));
325 this.tokenizer = tok.tokenise(this.reader);
326 }
327 catch (Exception e)
328 {
329 logger.error("Exception while creating dummy text stream: ", e);
330 }
331 logger.info("VideoDocument - Complete!");
332 System.out.println("[F:" + this.epochTime() + "] Complete");
333 }
334 /** VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
335
336 /** Returns true when the end of the document has been reached, and there
337 * are no other terms to be retrieved from it.
338 * @return boolean true if there are no more terms in the document, otherwise
339 * it returns false.
340 */
341 public boolean endOfDocument()
342 {
343 return !this.tokenizer.hasNext();
344 }
345 /** endOfDocument() **/
346
347 /** @function epochTime
348 * Returns the current time in seconds since 1970JAN01
349 */
350 public long epochTime()
351 {
352 return System.currentTimeMillis()/1000;
353 }
354 /** epochTime() **/
355
356 /** Returns the underlying map of all the properties defined by this Document.
357 * @since 1.1.0
358 */
359 public Map<String,String> getAllProperties()
360 {
361 return this.properties;
362 }
363 /** getAllProperties() **/
364
365 /** Returns a list of the fields the current term appears in.
366 * @return HashSet a set of the terms that the current term appears in.
367 */
368 public Set<String> getFields()
369 {
370 // Returns null because there is no support for fields with file documents.
371 return Collections.emptySet();
372 }
373 /** getFields() **/
374
375 /** Gets the next term of the document.
376 * <B>NB:</B>Null string returned from getNextTerm() should
377 * be ignored. They do not signify the lack of any more terms.
378 * endOfDocument() should be used to check that.
379 * @return String the next term of the document. Null returns should be
380 * ignored.
381 */
382 public String getNextTerm()
383 {
384 return this.tokenizer.next();
385 }
386 /** getNextTerm() **/
387
388 /** Allows access to a named property of the Document. Examples might be URL,
389 * filename etc.
390 * @param name Name of the property. It is suggested, but not required that
391 * this name should not be case insensitive.
392 * @since 1.1.0
393 */
394 public String getProperty(String name)
395 {
396 return this.properties.get(name.toLowerCase());
397 }
398 /** getProperty(String name) **/
399
400 /** Returns a Reader object so client code can tokenise the document
401 * or deal with the document itself. Examples might be extracting URLs,
402 * language detection. */
403 public Reader getReader()
404 {
405 return this.reader;
406 }
407 /** getReader() **/
408
409 /**
410 */
411 private String generateHash(String string)
412 {
413 StringBuffer sb = new StringBuffer();
414 try
415 {
416 final MessageDigest message_digest = MessageDigest.getInstance("MD5");
417 message_digest.reset();
418 message_digest.update(string.getBytes(Charset.forName("UTF8")));
419 final byte[] result_bytes = message_digest.digest();
420 for (int i = 0; i < result_bytes.length; ++i)
421 {
422 sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1,3));
423 }
424 }
425 catch (NoSuchAlgorithmException e)
426 {
427 System.err.println("Exception: " + e);
428 System.exit(0);
429 }
430 return sb.toString();
431 }
432 /** generateHash(String) **/
433}
Note: See TracBrowser for help on using the repository browser.