source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/VideoDocument.java@ 26235

Last change on this file since 26235 was 26235, checked in by jmt12, 12 years ago

Extending debug comments with an indicator of whether this is IO time or processor (PR) time.

File size: 15.0 KB
Line 
1/**
2 * Adding support for Videos in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.BufferedReader;
20import java.io.File;
21import java.io.InputStream;
22import java.io.InputStreamReader;
23import java.io.IOException;
24import java.io.StringReader;
25import java.io.Reader;
26import java.nio.charset.Charset;
27import java.nio.file.Files;
28import java.nio.file.Path;
29import java.nio.file.Paths;
30import java.nio.file.SimpleFileVisitor;
31import java.nio.file.attribute.BasicFileAttributes;
32import java.security.MessageDigest;
33import java.security.NoSuchAlgorithmException;
34import java.util.Collections;
35import java.util.Arrays;
36import java.util.Map;
37import java.util.Set;
38import java.util.regex.Matcher;
39import java.util.regex.Pattern;
40
41import org.apache.log4j.Logger;
42import org.terrier.indexing.StreamGobbler;
43import org.terrier.indexing.tokenisation.TokenStream;
44import org.terrier.indexing.tokenisation.Tokeniser;
45import org.terrier.utility.ApplicationSetup;
46
47public class VideoDocument
48 implements Document
49{
50 /** A reference to the logger for messaging */
51 protected static final Logger logger = Logger.getLogger(FileDocument.class);
52 /** The map of properties (fields) for this document. */
53 protected Map<String,String> properties;
54 /** A reader built from a dummy text string. */
55 protected Reader reader;
56 /** A token stream produced by the configured tokeniser when feed the dummy
57 * reader.
58 */
59 protected TokenStream tokenizer;
60
61 // Handbrake Configuration
62 protected String streaming_hq_size = ApplicationSetup.getProperty("VideoDocument.streaming_hq_size", "720");
63 protected String streaming_hq_video_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_video_bitrate", "496");
64 protected String streaming_hq_audio_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_audio_bitrate", "80");
65
66 /** The preview size (width). **/
67 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
68
69
70 /** Default constructor. **/
71 protected VideoDocument() {}
72
73 /** Constructs an instance of the ImageDocument from the given input stream.
74 * @param docStream the input stream that reads the file.
75 * @param docProperties the initial properties (docno, filename)
76 * @param tok the tokeniser defined for this collection
77 */
78 public VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
79 {
80 logger.info("VideoDocument::VideoDocument()");
81 // 0. Initialization from arguments
82 this.properties = default_properties;
83
84 // Set properties
85 logger.info("VideoDocument - extracting properties");
86 // A. Hardcoded properties
87 this.properties.put("parser", "VideoDocument");
88 this.properties.put("abstract", "This is a video so here is some dummy text to prevent indexer failing.");
89 // B. Properties derived from filename
90 String filepath = this.properties.get("filename");
91 String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
92 this.properties.put("title", title);
93 String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
94 String target_filename = "doc." + ext;
95 this.properties.put("source","doc." + ext);
96 // - A unique associated directory. This gets a little tricky as we need
97 // to create the directory at the same time if an effort to promote
98 // synchronous behaviour
99 String unique_id = this.generateHash(filepath);
100 // - we start with the first 4 characters
101 int offset = 0;
102 String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
103 // - we add ".dir" as a suffix to the directory that actually contains
104 // files (so the non-suffixed version contains nested directories)
105 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
106 // - then we continue adding blocks of 4 characters until we get a
107 // directory that doesn't already exist
108 while (assoc_path.toFile().exists() && offset < unique_id.length())
109 {
110 offset += 4;
111 assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
112 assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
113 }
114 // - still not unique? but run out of unique_id... time to complain
115 if (assoc_path.toFile().exists())
116 {
117 logger.error("ImageDoument - can't determine unique assocfilepath");
118 System.exit(0);
119 }
120 // - create the directories quick... hopefully before someone else does
121 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
122 this.properties.put("assocfile", assoc_filename);
123
124 // Copy (symlink) the file into place in the shared directory
125 Path raw_video_path = Paths.get(properties.get("filename"));
126 Path target_path = assoc_path.resolve(target_filename);
127 logger.info("VideoDocument - symlinking original video into assoc directory [IO]");
128 try
129 {
130 Files.createSymbolicLink(target_path, raw_video_path);
131 }
132 // not supported? We'll try copying below
133 catch (UnsupportedOperationException ex)
134 {
135 }
136 // All other exceptions can be fatal
137 catch (Exception e)
138 {
139 logger.error("Exception while symlinking video: ", e);
140 }
141 // - copy if the file doesn't exist yet
142 if (!target_path.toFile().exists())
143 {
144 logger.info("VideoDocument - symlink failed, copying instead [IO]");
145 try
146 {
147 Files.copy(raw_video_path, target_path);
148 }
149 // Fatality!
150 catch (Exception e)
151 {
152 logger.error("Exception while copying video: ", e);
153 }
154 }
155
156 // 1. Extract Metadata using MediaInfo and store as properties
157 logger.info("VideoDocument - extracting video metadata [PR]");
158 try
159 {
160 String metadata_command[] = {
161 "mediainfo",
162 "--Output=XML",
163 raw_video_path.toString()
164 };
165 logger.info("VideoDocument - metadata command: " + Arrays.toString(metadata_command));
166 Process metadata_process = Runtime.getRuntime().exec(metadata_command);
167 StreamGobbler metadata_process_error_gobbler = new StreamGobbler(metadata_process.getErrorStream());
168 metadata_process_error_gobbler.start();
169 BufferedReader metadata_br = new BufferedReader(new InputStreamReader(metadata_process.getInputStream()));
170 String line;
171 String type = "Unknown";
172 Pattern type_pattern = Pattern.compile("<track type=\"([a-zA-Z]+)\">");
173 Pattern metadata_pattern = Pattern.compile("<([a-zA-Z_]+)>(.*)</\\1>");
174 while ((line = metadata_br.readLine()) != null)
175 {
176 Matcher type_matcher = type_pattern.matcher(line);
177 if (type_matcher.matches())
178 {
179 type = type_matcher.group(1);
180 }
181 else
182 {
183 Matcher metadata_matcher = metadata_pattern.matcher(line);
184 if (metadata_matcher.matches())
185 {
186 String field = type.toLowerCase() + ":" + metadata_matcher.group(1).toLowerCase();
187 String value = metadata_matcher.group(2);
188 logger.info("VideoDocument - adding metadata: " + field + " => " + value);
189 this.properties.put(field, value);
190 }
191 }
192 }
193 int metadata_status = metadata_process.waitFor();
194 }
195 catch (Exception e)
196 {
197 logger.error("Exception while extracting video metadata:", e);
198 }
199
200 // 2. Convert Video to streamable format using HandbrakeCLI
201 logger.info("VideoDocument - convert video to streamable format [PR]");
202 Path converted_video_path = assoc_path.resolve("tsv.mp4");
203 try
204 {
205 String convert_command[] = {
206 "HandBrakeCLI",
207 "-i", raw_video_path.toString(),
208 "-t", "1",
209 "-c", "1",
210 "-o", converted_video_path.toString(),
211 "-f", "mp4",
212 "-O",
213 "-w", this.streaming_hq_size,
214 "--loose-anamorphic",
215 "-e", "x264",
216 "-b", this.streaming_hq_video_bitrate,
217 "-a", "1",
218 "-E", "faac",
219 "-6", "dpl2",
220 "-R", "Auto",
221 "-B", this.streaming_hq_audio_bitrate,
222 "-D", "0.0",
223 "-x", "ref=2:bframes=2:subq=6:mixed-refs=0:weightb=0:8x8dct=0:trellis=0:threads=1"
224 };
225 logger.info("VideoDocument - convert command: " + Arrays.toString(convert_command));
226 // @todo determine the best way to account for configuration options
227 Process convert_process = Runtime.getRuntime().exec(convert_command);
228 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
229 convert_process_error_gobbler.start();
230 StreamGobbler convert_process_out_gobbler = new StreamGobbler(convert_process.getInputStream());
231 convert_process_out_gobbler.start();
232 int convert_status = convert_process.waitFor();
233 if (convert_status != 0 || !Files.exists(converted_video_path))
234 {
235 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
236 }
237 }
238 catch (Exception e)
239 {
240 logger.error("Exception while converting video to streamable format: ", e);
241 }
242
243 // 3. Generate keyframes from streamable video and attach the shot names
244 // as a property
245 logger.info("VideoDocument - extracting keyframes from video [PR]");
246 try
247 {
248 Path shots_path = assoc_path.resolve("shots.xml");
249 String keyframe_command[] = {
250 "hive2_ffmpegsvn",
251 "-o", shots_path.toString(),
252 "-k", assoc_path.toString(),
253 "-m", "0.5",
254 "-l", "0.05",
255 converted_video_path.toString()
256 };
257 logger.info("VideoDocument - keyframe command: " + Arrays.toString(keyframe_command));
258 Process keyframe_process = Runtime.getRuntime().exec(keyframe_command);
259 //Path keyframe_err_file = temp_dir_path.resolve("hive2-err.txt");
260 StreamGobbler keyframe_error_gobbler = new StreamGobbler(keyframe_process.getErrorStream());//, keyframe_err_file.toString());
261 keyframe_error_gobbler.start();
262 //Path keyframe_out_file = temp_dir_path.resolve("hive2-out.txt");
263 StreamGobbler keyframe_out_gobbler = new StreamGobbler(keyframe_process.getInputStream()); //, keyframe_out_file.toString());
264 keyframe_out_gobbler.start();
265 int keyframe_status = keyframe_process.waitFor();
266 if (keyframe_status != 0 || !Files.exists(shots_path))
267 {
268 throw new Exception("Keyframe command failed (exit status: " + keyframe_status + ")");
269 }
270
271 logger.info("VideoDocument - associating keyframes to video [IO]");
272 File files[] = assoc_path.toFile().listFiles();
273 Arrays.sort(files);
274 Pattern image_filename_pattern = Pattern.compile("tsv.*\\.jpg");
275 String keyframe_images = "";
276 for (int i = 0; i < files.length; i++)
277 {
278 String image_filename = files[i].toPath().getFileName().toString();
279 logger.info("VideoDocument - considering keyframe image: " + image_filename);
280 Matcher image_filename_matcher = image_filename_pattern.matcher(image_filename);
281 if (image_filename_matcher.matches())
282 {
283 if (keyframe_images.equals(""))
284 {
285 keyframe_images = image_filename;
286 }
287 else
288 {
289 keyframe_images += "," + image_filename;
290 }
291 }
292 }
293 this.properties.put("preview", keyframe_images);
294 }
295 catch (Exception e)
296 {
297 logger.error("Exception while extracting keyframes from video: ", e);
298 }
299
300 // 4. Create a dummy reader around some dummy text and then tokenize it
301 logger.info("VideoDocument - feed dummy text as token stream to indexer");
302 try
303 {
304 this.reader = new StringReader(this.properties.get("abstract"));
305 this.tokenizer = tok.tokenise(this.reader);
306 }
307 catch (Exception e)
308 {
309 logger.error("Exception while creating dummy text stream: ", e);
310 }
311 logger.info("VideoDocument - Complete!");
312 }
313 /** VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
314
315 /** Returns true when the end of the document has been reached, and there
316 * are no other terms to be retrieved from it.
317 * @return boolean true if there are no more terms in the document, otherwise
318 * it returns false.
319 */
320 public boolean endOfDocument()
321 {
322 return !this.tokenizer.hasNext();
323 }
324 /** endOfDocument() **/
325
326 /** Returns the underlying map of all the properties defined by this Document.
327 * @since 1.1.0
328 */
329 public Map<String,String> getAllProperties()
330 {
331 return this.properties;
332 }
333 /** getAllProperties() **/
334
335 /** Returns a list of the fields the current term appears in.
336 * @return HashSet a set of the terms that the current term appears in.
337 */
338 public Set<String> getFields()
339 {
340 // Returns null because there is no support for fields with file documents.
341 return Collections.emptySet();
342 }
343 /** getFields() **/
344
345 /** Gets the next term of the document.
346 * <B>NB:</B>Null string returned from getNextTerm() should
347 * be ignored. They do not signify the lack of any more terms.
348 * endOfDocument() should be used to check that.
349 * @return String the next term of the document. Null returns should be
350 * ignored.
351 */
352 public String getNextTerm()
353 {
354 return this.tokenizer.next();
355 }
356 /** getNextTerm() **/
357
358 /** Allows access to a named property of the Document. Examples might be URL,
359 * filename etc.
360 * @param name Name of the property. It is suggested, but not required that
361 * this name should not be case insensitive.
362 * @since 1.1.0
363 */
364 public String getProperty(String name)
365 {
366 return this.properties.get(name.toLowerCase());
367 }
368 /** getProperty(String name) **/
369
370 /** Returns a Reader object so client code can tokenise the document
371 * or deal with the document itself. Examples might be extracting URLs,
372 * language detection. */
373 public Reader getReader()
374 {
375 return this.reader;
376 }
377 /** getReader() **/
378
379 /**
380 */
381 private String generateHash(String string)
382 {
383 StringBuffer sb = new StringBuffer();
384 try
385 {
386 final MessageDigest message_digest = MessageDigest.getInstance("MD5");
387 message_digest.reset();
388 message_digest.update(string.getBytes(Charset.forName("UTF8")));
389 final byte[] result_bytes = message_digest.digest();
390 for (int i = 0; i < result_bytes.length; ++i)
391 {
392 sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1,3));
393 }
394 }
395 catch (NoSuchAlgorithmException e)
396 {
397 System.err.println("Exception: " + e);
398 System.exit(0);
399 }
400 return sb.toString();
401 }
402 /** generateHash(String) **/
403}
Note: See TracBrowser for help on using the repository browser.