Context Navigation

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/VideoDocument.java@ 26235

Last change on this file since 26235 was 26235, checked in by jmt12, 12 years ago
Extending debug comments with an indicator of whether this is IO time or processor (PR) time.
File size: 15.0 KB

Line
1	/**
2	* Adding support for Videos in Terrier
3	* @author: John Thompson, jmt12, #9826509
4	*
5	* The contents of this file are subject to the Mozilla Public License
6	* Version 1.1 (the "License"); you may not use this file except in
7	* compliance with the License. You may obtain a copy of the License at
8	* http://www.mozilla.org/MPL/
9	*
10	* Software distributed under the License is distributed on an "AS IS"
11	* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12	* the License for the specific language governing rights and limitations
13	* under the License.
14	*
15	* Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16	*/
17	package org.terrier.indexing;
18
19	import java.io.BufferedReader;
20	import java.io.File;
21	import java.io.InputStream;
22	import java.io.InputStreamReader;
23	import java.io.IOException;
24	import java.io.StringReader;
25	import java.io.Reader;
26	import java.nio.charset.Charset;
27	import java.nio.file.Files;
28	import java.nio.file.Path;
29	import java.nio.file.Paths;
30	import java.nio.file.SimpleFileVisitor;
31	import java.nio.file.attribute.BasicFileAttributes;
32	import java.security.MessageDigest;
33	import java.security.NoSuchAlgorithmException;
34	import java.util.Collections;
35	import java.util.Arrays;
36	import java.util.Map;
37	import java.util.Set;
38	import java.util.regex.Matcher;
39	import java.util.regex.Pattern;
40
41	import org.apache.log4j.Logger;
42	import org.terrier.indexing.StreamGobbler;
43	import org.terrier.indexing.tokenisation.TokenStream;
44	import org.terrier.indexing.tokenisation.Tokeniser;
45	import org.terrier.utility.ApplicationSetup;
46
47	public class VideoDocument
48	implements Document
49	{
50	/** A reference to the logger for messaging */
51	protected static final Logger logger = Logger.getLogger(FileDocument.class);
52	/** The map of properties (fields) for this document. */
53	protected Map<String,String> properties;
54	/** A reader built from a dummy text string. */
55	protected Reader reader;
56	/** A token stream produced by the configured tokeniser when feed the dummy
57	* reader.
58	*/
59	protected TokenStream tokenizer;
60
61	// Handbrake Configuration
62	protected String streaming_hq_size = ApplicationSetup.getProperty("VideoDocument.streaming_hq_size", "720");
63	protected String streaming_hq_video_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_video_bitrate", "496");
64	protected String streaming_hq_audio_bitrate = ApplicationSetup.getProperty("VideoDocument.streaming_hq_audio_bitrate", "80");
65
66	/ The preview size (width). /
67	protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
68
69
70	/ Default constructor. /
71	protected VideoDocument() {}
72
73	/** Constructs an instance of the ImageDocument from the given input stream.
74	* @param docStream the input stream that reads the file.
75	* @param docProperties the initial properties (docno, filename)
76	* @param tok the tokeniser defined for this collection
77	*/
78	public VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
79	{
80	logger.info("VideoDocument::VideoDocument()");
81	// 0. Initialization from arguments
82	this.properties = default_properties;
83
84	// Set properties
85	logger.info("VideoDocument - extracting properties");
86	// A. Hardcoded properties
87	this.properties.put("parser", "VideoDocument");
88	this.properties.put("abstract", "This is a video so here is some dummy text to prevent indexer failing.");
89	// B. Properties derived from filename
90	String filepath = this.properties.get("filename");
91	String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
92	this.properties.put("title", title);
93	String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
94	String target_filename = "doc." + ext;
95	this.properties.put("source","doc." + ext);
96	// - A unique associated directory. This gets a little tricky as we need
97	// to create the directory at the same time if an effort to promote
98	// synchronous behaviour
99	String unique_id = this.generateHash(filepath);
100	// - we start with the first 4 characters
101	int offset = 0;
102	String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
103	// - we add ".dir" as a suffix to the directory that actually contains
104	// files (so the non-suffixed version contains nested directories)
105	Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
106	// - then we continue adding blocks of 4 characters until we get a
107	// directory that doesn't already exist
108	while (assoc_path.toFile().exists() && offset < unique_id.length())
109	{
110	offset += 4;
111	assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
112	assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
113	}
114	// - still not unique? but run out of unique_id... time to complain
115	if (assoc_path.toFile().exists())
116	{
117	logger.error("ImageDoument - can't determine unique assocfilepath");
118	System.exit(0);
119	}
120	// - create the directories quick... hopefully before someone else does
121	assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
122	this.properties.put("assocfile", assoc_filename);
123
124	// Copy (symlink) the file into place in the shared directory
125	Path raw_video_path = Paths.get(properties.get("filename"));
126	Path target_path = assoc_path.resolve(target_filename);
127	logger.info("VideoDocument - symlinking original video into assoc directory [IO]");
128	try
129	{
130	Files.createSymbolicLink(target_path, raw_video_path);
131	}
132	// not supported? We'll try copying below
133	catch (UnsupportedOperationException ex)
134	{
135	}
136	// All other exceptions can be fatal
137	catch (Exception e)
138	{
139	logger.error("Exception while symlinking video: ", e);
140	}
141	// - copy if the file doesn't exist yet
142	if (!target_path.toFile().exists())
143	{
144	logger.info("VideoDocument - symlink failed, copying instead [IO]");
145	try
146	{
147	Files.copy(raw_video_path, target_path);
148	}
149	// Fatality!
150	catch (Exception e)
151	{
152	logger.error("Exception while copying video: ", e);
153	}
154	}
155
156	// 1. Extract Metadata using MediaInfo and store as properties
157	logger.info("VideoDocument - extracting video metadata [PR]");
158	try
159	{
160	String metadata_command[] = {
161	"mediainfo",
162	"--Output=XML",
163	raw_video_path.toString()
164	};
165	logger.info("VideoDocument - metadata command: " + Arrays.toString(metadata_command));
166	Process metadata_process = Runtime.getRuntime().exec(metadata_command);
167	StreamGobbler metadata_process_error_gobbler = new StreamGobbler(metadata_process.getErrorStream());
168	metadata_process_error_gobbler.start();
169	BufferedReader metadata_br = new BufferedReader(new InputStreamReader(metadata_process.getInputStream()));
170	String line;
171	String type = "Unknown";
172	Pattern type_pattern = Pattern.compile("<track type=\"([a-zA-Z]+)\">");
173	Pattern metadata_pattern = Pattern.compile("<([a-zA-Z_]+)>(.*)</\\1>");
174	while ((line = metadata_br.readLine()) != null)
175	{
176	Matcher type_matcher = type_pattern.matcher(line);
177	if (type_matcher.matches())
178	{
179	type = type_matcher.group(1);
180	}
181	else
182	{
183	Matcher metadata_matcher = metadata_pattern.matcher(line);
184	if (metadata_matcher.matches())
185	{
186	String field = type.toLowerCase() + ":" + metadata_matcher.group(1).toLowerCase();
187	String value = metadata_matcher.group(2);
188	logger.info("VideoDocument - adding metadata: " + field + " => " + value);
189	this.properties.put(field, value);
190	}
191	}
192	}
193	int metadata_status = metadata_process.waitFor();
194	}
195	catch (Exception e)
196	{
197	logger.error("Exception while extracting video metadata:", e);
198	}
199
200	// 2. Convert Video to streamable format using HandbrakeCLI
201	logger.info("VideoDocument - convert video to streamable format [PR]");
202	Path converted_video_path = assoc_path.resolve("tsv.mp4");
203	try
204	{
205	String convert_command[] = {
206	"HandBrakeCLI",
207	"-i", raw_video_path.toString(),
208	"-t", "1",
209	"-c", "1",
210	"-o", converted_video_path.toString(),
211	"-f", "mp4",
212	"-O",
213	"-w", this.streaming_hq_size,
214	"--loose-anamorphic",
215	"-e", "x264",
216	"-b", this.streaming_hq_video_bitrate,
217	"-a", "1",
218	"-E", "faac",
219	"-6", "dpl2",
220	"-R", "Auto",
221	"-B", this.streaming_hq_audio_bitrate,
222	"-D", "0.0",
223	"-x", "ref=2:bframes=2:subq=6:mixed-refs=0:weightb=0:8x8dct=0:trellis=0:threads=1"
224	};
225	logger.info("VideoDocument - convert command: " + Arrays.toString(convert_command));
226	// @todo determine the best way to account for configuration options
227	Process convert_process = Runtime.getRuntime().exec(convert_command);
228	StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
229	convert_process_error_gobbler.start();
230	StreamGobbler convert_process_out_gobbler = new StreamGobbler(convert_process.getInputStream());
231	convert_process_out_gobbler.start();
232	int convert_status = convert_process.waitFor();
233	if (convert_status != 0 \|\| !Files.exists(converted_video_path))
234	{
235	throw new Exception("Convert command failed (exit status: " + convert_status + ")");
236	}
237	}
238	catch (Exception e)
239	{
240	logger.error("Exception while converting video to streamable format: ", e);
241	}
242
243	// 3. Generate keyframes from streamable video and attach the shot names
244	// as a property
245	logger.info("VideoDocument - extracting keyframes from video [PR]");
246	try
247	{
248	Path shots_path = assoc_path.resolve("shots.xml");
249	String keyframe_command[] = {
250	"hive2_ffmpegsvn",
251	"-o", shots_path.toString(),
252	"-k", assoc_path.toString(),
253	"-m", "0.5",
254	"-l", "0.05",
255	converted_video_path.toString()
256	};
257	logger.info("VideoDocument - keyframe command: " + Arrays.toString(keyframe_command));
258	Process keyframe_process = Runtime.getRuntime().exec(keyframe_command);
259	//Path keyframe_err_file = temp_dir_path.resolve("hive2-err.txt");
260	StreamGobbler keyframe_error_gobbler = new StreamGobbler(keyframe_process.getErrorStream());//, keyframe_err_file.toString());
261	keyframe_error_gobbler.start();
262	//Path keyframe_out_file = temp_dir_path.resolve("hive2-out.txt");
263	StreamGobbler keyframe_out_gobbler = new StreamGobbler(keyframe_process.getInputStream()); //, keyframe_out_file.toString());
264	keyframe_out_gobbler.start();
265	int keyframe_status = keyframe_process.waitFor();
266	if (keyframe_status != 0 \|\| !Files.exists(shots_path))
267	{
268	throw new Exception("Keyframe command failed (exit status: " + keyframe_status + ")");
269	}
270
271	logger.info("VideoDocument - associating keyframes to video [IO]");
272	File files[] = assoc_path.toFile().listFiles();
273	Arrays.sort(files);
274	Pattern image_filename_pattern = Pattern.compile("tsv.*\\.jpg");
275	String keyframe_images = "";
276	for (int i = 0; i < files.length; i++)
277	{
278	String image_filename = files[i].toPath().getFileName().toString();
279	logger.info("VideoDocument - considering keyframe image: " + image_filename);
280	Matcher image_filename_matcher = image_filename_pattern.matcher(image_filename);
281	if (image_filename_matcher.matches())
282	{
283	if (keyframe_images.equals(""))
284	{
285	keyframe_images = image_filename;
286	}
287	else
288	{
289	keyframe_images += "," + image_filename;
290	}
291	}
292	}
293	this.properties.put("preview", keyframe_images);
294	}
295	catch (Exception e)
296	{
297	logger.error("Exception while extracting keyframes from video: ", e);
298	}
299
300	// 4. Create a dummy reader around some dummy text and then tokenize it
301	logger.info("VideoDocument - feed dummy text as token stream to indexer");
302	try
303	{
304	this.reader = new StringReader(this.properties.get("abstract"));
305	this.tokenizer = tok.tokenise(this.reader);
306	}
307	catch (Exception e)
308	{
309	logger.error("Exception while creating dummy text stream: ", e);
310	}
311	logger.info("VideoDocument - Complete!");
312	}
313	/ VideoDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) /
314
315	/** Returns true when the end of the document has been reached, and there
316	* are no other terms to be retrieved from it.
317	* @return boolean true if there are no more terms in the document, otherwise
318	* it returns false.
319	*/
320	public boolean endOfDocument()
321	{
322	return !this.tokenizer.hasNext();
323	}
324	/ endOfDocument() /
325
326	/** Returns the underlying map of all the properties defined by this Document.
327	* @since 1.1.0
328	*/
329	public Map<String,String> getAllProperties()
330	{
331	return this.properties;
332	}
333	/ getAllProperties() /
334
335	/** Returns a list of the fields the current term appears in.
336	* @return HashSet a set of the terms that the current term appears in.
337	*/
338	public Set<String> getFields()
339	{
340	// Returns null because there is no support for fields with file documents.
341	return Collections.emptySet();
342	}
343	/ getFields() /
344
345	/** Gets the next term of the document.
346	* <B>NB:</B>Null string returned from getNextTerm() should
347	* be ignored. They do not signify the lack of any more terms.
348	* endOfDocument() should be used to check that.
349	* @return String the next term of the document. Null returns should be
350	* ignored.
351	*/
352	public String getNextTerm()
353	{
354	return this.tokenizer.next();
355	}
356	/ getNextTerm() /
357
358	/** Allows access to a named property of the Document. Examples might be URL,
359	* filename etc.
360	* @param name Name of the property. It is suggested, but not required that
361	* this name should not be case insensitive.
362	* @since 1.1.0
363	*/
364	public String getProperty(String name)
365	{
366	return this.properties.get(name.toLowerCase());
367	}
368	/ getProperty(String name) /
369
370	/** Returns a Reader object so client code can tokenise the document
371	* or deal with the document itself. Examples might be extracting URLs,
372	* language detection. */
373	public Reader getReader()
374	{
375	return this.reader;
376	}
377	/ getReader() /
378
379	/**
380	*/
381	private String generateHash(String string)
382	{
383	StringBuffer sb = new StringBuffer();
384	try
385	{
386	final MessageDigest message_digest = MessageDigest.getInstance("MD5");
387	message_digest.reset();
388	message_digest.update(string.getBytes(Charset.forName("UTF8")));
389	final byte[] result_bytes = message_digest.digest();
390	for (int i = 0; i < result_bytes.length; ++i)
391	{
392	sb.append(Integer.toHexString((result_bytes[i] & 0xFF) \| 0x100).substring(1,3));
393	}
394	}
395	catch (NoSuchAlgorithmException e)
396	{
397	System.err.println("Exception: " + e);
398	System.exit(0);
399	}
400	return sb.toString();
401	}
402	/ generateHash(String) /
403	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: