source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/VideoDocument.java@ 26236

Last change on this file since 26236 was 26236, checked in by jmt12, 12 years ago

Limit the size of the keyframe preview field so that we can have a hardcoded length for the field in the terrier.properties configuration without breaking things. I've started with maxlength=1024, which should be around 80 thumbnails.

File size: 15.4 KB
Line 
1/**
2 * Adding support for Videos in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.BufferedReader;
20import java.io.File;
21import java.io.InputStream;
22import java.io.InputStreamReader;
23import java.io.IOException;
24import java.io.StringReader;
25import java.io.Reader;
26import java.nio.charset.Charset;
27import java.nio.file.Files;
28import java.nio.file.Path;
29import java.nio.file.Paths;
30import java.nio.file.SimpleFileVisitor;
31import java.nio.file.attribute.BasicFileAttributes;
32import java.security.MessageDigest;
33import java.security.NoSuchAlgorithmException;
34import java.util.Collections;
35import java.util.Arrays;
36import java.util.Map;
37import java.util.Set;
38import java.util.regex.Matcher;
39import java.util.regex.Pattern;
40
41import org.apache.log4j.Logger;
42import org.terrier.indexing.StreamGobbler;
43import org.terrier.indexing.tokenisation.TokenStream;
44import org.terrier.indexing.tokenisation.Tokeniser;
45import org.terrier.utility.ApplicationSetup;
46
47public class VideoDocument
48 implements Document
49{
50 /** A reference to the logger for messaging */
51 protected static final Logger logger = Logger.getLogger(FileDocument.class);
52 /** The map of properties (fields) for this document. */
53 protected Map<String,String> properties;
54 /** A reader built from a dummy text string. */
55 protected Reader reader;
56 /** A token stream produced by the configured tokeniser when feed the dummy
57 * reader.
58 */
59 protected TokenStream tokenizer;
60
61 // Handbrake Configuration
62 protected String streaming_hq_size = ApplicationSetup.getProperty("VideoDocument.streaming_hq_size", "720");
63 protected String streaming_hq_video_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_video_bitrate", "496");
64 protected String streaming_hq_audio_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_audio_bitrate", "80");
65
66 /** The preview size (width). **/
67 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
68
69 protected int max_keyframe_images_length = 1024;
70
71 /** Default constructor. **/
72 protected VideoDocument() {}
73
74 /** Constructs an instance of the ImageDocument from the given input stream.
75 * @param docStream the input stream that reads the file.
76 * @param docProperties the initial properties (docno, filename)
77 * @param tok the tokeniser defined for this collection
78 */
79 public VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
80 {
81 logger.info("VideoDocument::VideoDocument()");
82 // 0. Initialization from arguments
83 this.properties = default_properties;
84
85 // Set properties
86 logger.info("VideoDocument - extracting properties");
87 // A. Hardcoded properties
88 this.properties.put("parser", "VideoDocument");
89 this.properties.put("abstract", "This is a video so here is some dummy text to prevent indexer failing.");
90 // B. Properties derived from filename
91 String filepath = this.properties.get("filename");
92 String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
93 this.properties.put("title", title);
94 String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
95 String target_filename = "doc." + ext;
96 this.properties.put("source","doc." + ext);
97 // - A unique associated directory. This gets a little tricky as we need
98 // to create the directory at the same time if an effort to promote
99 // synchronous behaviour
100 String unique_id = this.generateHash(filepath);
101 // - we start with the first 4 characters
102 int offset = 0;
103 String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
104 // - we add ".dir" as a suffix to the directory that actually contains
105 // files (so the non-suffixed version contains nested directories)
106 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
107 // - then we continue adding blocks of 4 characters until we get a
108 // directory that doesn't already exist
109 while (assoc_path.toFile().exists() && offset < unique_id.length())
110 {
111 offset += 4;
112 assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
113 assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
114 }
115 // - still not unique? but run out of unique_id... time to complain
116 if (assoc_path.toFile().exists())
117 {
118 logger.error("ImageDoument - can't determine unique assocfilepath");
119 System.exit(0);
120 }
121 // - create the directories quick... hopefully before someone else does
122 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
123 this.properties.put("assocfile", assoc_filename);
124
125 // Copy (symlink) the file into place in the shared directory
126 Path raw_video_path = Paths.get(properties.get("filename"));
127 Path target_path = assoc_path.resolve(target_filename);
128 logger.info("VideoDocument - symlinking original video into assoc directory [IO]");
129 try
130 {
131 Files.createSymbolicLink(target_path, raw_video_path);
132 }
133 // not supported? We'll try copying below
134 catch (UnsupportedOperationException ex)
135 {
136 }
137 // All other exceptions can be fatal
138 catch (Exception e)
139 {
140 logger.error("Exception while symlinking video: ", e);
141 }
142 // - copy if the file doesn't exist yet
143 if (!target_path.toFile().exists())
144 {
145 logger.info("VideoDocument - symlink failed, copying instead [IO]");
146 try
147 {
148 Files.copy(raw_video_path, target_path);
149 }
150 // Fatality!
151 catch (Exception e)
152 {
153 logger.error("Exception while copying video: ", e);
154 }
155 }
156
157 // 1. Extract Metadata using MediaInfo and store as properties
158 logger.info("VideoDocument - extracting video metadata [PR]");
159 try
160 {
161 String metadata_command[] = {
162 "mediainfo",
163 "--Output=XML",
164 raw_video_path.toString()
165 };
166 logger.info("VideoDocument - metadata command: " + Arrays.toString(metadata_command));
167 Process metadata_process = Runtime.getRuntime().exec(metadata_command);
168 StreamGobbler metadata_process_error_gobbler = new StreamGobbler(metadata_process.getErrorStream());
169 metadata_process_error_gobbler.start();
170 BufferedReader metadata_br = new BufferedReader(new InputStreamReader(metadata_process.getInputStream()));
171 String line;
172 String type = "Unknown";
173 Pattern type_pattern = Pattern.compile("<track type=\"([a-zA-Z]+)\">");
174 Pattern metadata_pattern = Pattern.compile("<([a-zA-Z_]+)>(.*)</\\1>");
175 while ((line = metadata_br.readLine()) != null)
176 {
177 Matcher type_matcher = type_pattern.matcher(line);
178 if (type_matcher.matches())
179 {
180 type = type_matcher.group(1);
181 }
182 else
183 {
184 Matcher metadata_matcher = metadata_pattern.matcher(line);
185 if (metadata_matcher.matches())
186 {
187 String field = type.toLowerCase() + ":" + metadata_matcher.group(1).toLowerCase();
188 String value = metadata_matcher.group(2);
189 logger.info("VideoDocument - adding metadata: " + field + " => " + value);
190 this.properties.put(field, value);
191 }
192 }
193 }
194 int metadata_status = metadata_process.waitFor();
195 }
196 catch (Exception e)
197 {
198 logger.error("Exception while extracting video metadata:", e);
199 }
200
201 // 2. Convert Video to streamable format using HandbrakeCLI
202 logger.info("VideoDocument - convert video to streamable format [PR]");
203 Path converted_video_path = assoc_path.resolve("tsv.mp4");
204 try
205 {
206 String convert_command[] = {
207 "HandBrakeCLI",
208 "-i", raw_video_path.toString(),
209 "-t", "1",
210 "-c", "1",
211 "-o", converted_video_path.toString(),
212 "-f", "mp4",
213 "-O",
214 "-w", this.streaming_hq_size,
215 "--loose-anamorphic",
216 "-e", "x264",
217 "-b", this.streaming_hq_video_bitrate,
218 "-a", "1",
219 "-E", "faac",
220 "-6", "dpl2",
221 "-R", "Auto",
222 "-B", this.streaming_hq_audio_bitrate,
223 "-D", "0.0",
224 "-x", "ref=2:bframes=2:subq=6:mixed-refs=0:weightb=0:8x8dct=0:trellis=0:threads=1"
225 };
226 logger.info("VideoDocument - convert command: " + Arrays.toString(convert_command));
227 // @todo determine the best way to account for configuration options
228 Process convert_process = Runtime.getRuntime().exec(convert_command);
229 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
230 convert_process_error_gobbler.start();
231 StreamGobbler convert_process_out_gobbler = new StreamGobbler(convert_process.getInputStream());
232 convert_process_out_gobbler.start();
233 int convert_status = convert_process.waitFor();
234 if (convert_status != 0 || !Files.exists(converted_video_path))
235 {
236 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
237 }
238 }
239 catch (Exception e)
240 {
241 logger.error("Exception while converting video to streamable format: ", e);
242 }
243
244 // 3. Generate keyframes from streamable video and attach the shot names
245 // as a property
246 logger.info("VideoDocument - extracting keyframes from video [PR]");
247 try
248 {
249 Path shots_path = assoc_path.resolve("shots.xml");
250 String keyframe_command[] = {
251 "hive2_ffmpegsvn",
252 "-o", shots_path.toString(),
253 "-k", assoc_path.toString(),
254 "-m", "0.5",
255 "-l", "0.05",
256 converted_video_path.toString()
257 };
258 logger.info("VideoDocument - keyframe command: " + Arrays.toString(keyframe_command));
259 Process keyframe_process = Runtime.getRuntime().exec(keyframe_command);
260 //Path keyframe_err_file = temp_dir_path.resolve("hive2-err.txt");
261 StreamGobbler keyframe_error_gobbler = new StreamGobbler(keyframe_process.getErrorStream());//, keyframe_err_file.toString());
262 keyframe_error_gobbler.start();
263 //Path keyframe_out_file = temp_dir_path.resolve("hive2-out.txt");
264 StreamGobbler keyframe_out_gobbler = new StreamGobbler(keyframe_process.getInputStream()); //, keyframe_out_file.toString());
265 keyframe_out_gobbler.start();
266 int keyframe_status = keyframe_process.waitFor();
267 if (keyframe_status != 0 || !Files.exists(shots_path))
268 {
269 throw new Exception("Keyframe command failed (exit status: " + keyframe_status + ")");
270 }
271
272 logger.info("VideoDocument - associating keyframes to video [IO]");
273 File files[] = assoc_path.toFile().listFiles();
274 Arrays.sort(files);
275 Pattern image_filename_pattern = Pattern.compile("tsv.*\\.jpg");
276 String keyframe_images = "";
277 for (int i = 0; i < files.length; i++)
278 {
279 String image_filename = files[i].toPath().getFileName().toString();
280 logger.info("VideoDocument - considering keyframe image: " + image_filename);
281 Matcher image_filename_matcher = image_filename_pattern.matcher(image_filename);
282 if (image_filename_matcher.matches())
283 {
284 if (keyframe_images.equals(""))
285 {
286 keyframe_images = image_filename;
287 }
288 else
289 {
290 // Consider the maximum size of the preview images field
291 if ((keyframe_images.length() + image_filename.length() + 1) < this.max_keyframe_images_length)
292 {
293 keyframe_images += "," + image_filename;
294 }
295 // Break out of loop
296 else
297 {
298 i = files.length;
299 }
300 }
301 }
302 }
303 this.properties.put("preview", keyframe_images);
304 }
305 catch (Exception e)
306 {
307 logger.error("Exception while extracting keyframes from video: ", e);
308 }
309
310 // 4. Create a dummy reader around some dummy text and then tokenize it
311 logger.info("VideoDocument - feed dummy text as token stream to indexer");
312 try
313 {
314 this.reader = new StringReader(this.properties.get("abstract"));
315 this.tokenizer = tok.tokenise(this.reader);
316 }
317 catch (Exception e)
318 {
319 logger.error("Exception while creating dummy text stream: ", e);
320 }
321 logger.info("VideoDocument - Complete!");
322 }
323 /** VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
324
325 /** Returns true when the end of the document has been reached, and there
326 * are no other terms to be retrieved from it.
327 * @return boolean true if there are no more terms in the document, otherwise
328 * it returns false.
329 */
330 public boolean endOfDocument()
331 {
332 return !this.tokenizer.hasNext();
333 }
334 /** endOfDocument() **/
335
336 /** Returns the underlying map of all the properties defined by this Document.
337 * @since 1.1.0
338 */
339 public Map<String,String> getAllProperties()
340 {
341 return this.properties;
342 }
343 /** getAllProperties() **/
344
345 /** Returns a list of the fields the current term appears in.
346 * @return HashSet a set of the terms that the current term appears in.
347 */
348 public Set<String> getFields()
349 {
350 // Returns null because there is no support for fields with file documents.
351 return Collections.emptySet();
352 }
353 /** getFields() **/
354
355 /** Gets the next term of the document.
356 * <B>NB:</B>Null string returned from getNextTerm() should
357 * be ignored. They do not signify the lack of any more terms.
358 * endOfDocument() should be used to check that.
359 * @return String the next term of the document. Null returns should be
360 * ignored.
361 */
362 public String getNextTerm()
363 {
364 return this.tokenizer.next();
365 }
366 /** getNextTerm() **/
367
368 /** Allows access to a named property of the Document. Examples might be URL,
369 * filename etc.
370 * @param name Name of the property. It is suggested, but not required that
371 * this name should not be case insensitive.
372 * @since 1.1.0
373 */
374 public String getProperty(String name)
375 {
376 return this.properties.get(name.toLowerCase());
377 }
378 /** getProperty(String name) **/
379
380 /** Returns a Reader object so client code can tokenise the document
381 * or deal with the document itself. Examples might be extracting URLs,
382 * language detection. */
383 public Reader getReader()
384 {
385 return this.reader;
386 }
387 /** getReader() **/
388
389 /**
390 */
391 private String generateHash(String string)
392 {
393 StringBuffer sb = new StringBuffer();
394 try
395 {
396 final MessageDigest message_digest = MessageDigest.getInstance("MD5");
397 message_digest.reset();
398 message_digest.update(string.getBytes(Charset.forName("UTF8")));
399 final byte[] result_bytes = message_digest.digest();
400 for (int i = 0; i < result_bytes.length; ++i)
401 {
402 sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1,3));
403 }
404 }
405 catch (NoSuchAlgorithmException e)
406 {
407 System.err.println("Exception: " + e);
408 System.exit(0);
409 }
410 return sb.toString();
411 }
412 /** generateHash(String) **/
413}
Note: See TracBrowser for help on using the repository browser.