source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java

Last change on this file was 29648, checked in by jmt12, 9 years ago

Extending the Image document class with SIFT processing so as to trigger greater CPU load. Makes use of stream gobbler... gobble-gobble

File size: 12.5 KB
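
The stream gobbler mentioned in the change note above is imported from org.terrier.indexing but not shown in this file. As used below it is constructed from an InputStream and started like a Thread, so a minimal sketch (an assumption about its implementation, not the actual class) that simply drains a child process's output to stop it blocking on a full pipe buffer might look like:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/** Hypothetical sketch of the StreamGobbler used below: a thread that
 * consumes and discards everything written to the given stream. */
class StreamGobbler extends Thread
{
  private final InputStream stream;

  public StreamGobbler(InputStream stream)
  {
    this.stream = stream;
  }

  public void run()
  {
    try
    {
      BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
      while (reader.readLine() != null)
      {
        // discard the output; the point is only to keep the pipe drained
      }
    }
    catch (IOException e)
    {
      // ignore - the child process has most likely just exited
    }
  }
}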
/**
 * Adding support for Images in Terrier
 * @author: John Thompson, jmt12, #9826509
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
 */
package org.terrier.indexing;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.terrier.indexing.StreamGobbler;
import org.terrier.indexing.tokenisation.TokenStream;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
public class ImageDocument
  implements Document
{
  /** A reference to the logger for messaging */
  protected static final Logger logger = Logger.getLogger(ImageDocument.class);
  /** The map of properties (fields) for this document. */
  protected Map<String,String> properties;
  /** A reader built from a dummy text string. */
  protected Reader reader;
  /** A token stream produced by the configured tokeniser when fed the dummy
   * reader.
   */
  protected TokenStream tokenizer;

  /** The preview filetype. **/
  protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
  /** The preview size (width). **/
  protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");

  /** Default constructor. **/
  protected ImageDocument() {}

  /** Constructs an instance of the ImageDocument from the given input stream.
   * @param istream the input stream that reads the file.
   * @param default_properties the initial properties (docno, filename)
   * @param tok the tokeniser defined for this collection
   */
  public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
  {
    logger.info("ImageDocument::ImageDocument()");
    // Initialization from arguments
    this.properties = default_properties;

    // Set properties
    logger.info("ImageDocument - current properties");
    for (Map.Entry<String, String> entry : this.properties.entrySet())
    {
      logger.info(entry.getKey() + "=" + entry.getValue());
    }

    logger.info("ImageDocument - extracting properties");
    // A. Hardcoded properties
    this.properties.put("parser", "ImageDocument");
    this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
    // B. Properties derived from filename
    // - A simple title for the document
    String filepath = this.properties.get("filename");
    String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
    this.properties.put("title", title);
    String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
    // - The name of the copy of the original document
    String target_filename = "doc." + ext;
    this.properties.put("source", "doc." + ext);
    // - A unique associated directory. This gets a little tricky as we need
    //   to create the directory at the same time in an effort to promote
    //   synchronous behaviour
    String unique_id = this.generateHash(filepath);
    // - we start with the first 4 characters
    int offset = 0;
    String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
    // - we add ".dir" as a suffix to the directory that actually contains
    //   files (so the non-suffixed version contains nested directories)
    Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
    // - then we continue adding blocks of 4 characters until we get a
    //   directory that doesn't already exist (stopping while a complete block
    //   of 4 characters remains in the hash)
    while (assoc_path.toFile().exists() && offset + 8 <= unique_id.length())
    {
      offset += 4;
      assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
      assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
    }
    // - still not unique? but run out of unique_id... time to complain
    if (assoc_path.toFile().exists())
    {
      logger.error("ImageDocument - can't determine unique assocfilepath");
      System.exit(0);
    }
    // - create the directories quick... hopefully before someone else does
    assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
    this.properties.put("assocfile", assoc_filename);

    // Copy (symlink) the file into place in the shared directory
    Path source_path = Paths.get(properties.get("filename"));
    Path target_path = assoc_path.resolve(target_filename);
    if (target_path.toFile().exists())
    {
      logger.info("ImageDocument - removing existing (old) associated image");
      try
      {
        Files.delete(target_path);
      }
      catch (Exception e)
      {
        logger.error("Exception while deleting old image: ", e);
      }
    }
    logger.info("ImageDocument - symlinking image into assoc directory");
    try
    {
      Files.createSymbolicLink(target_path, source_path);
    }
    // not supported? We'll try copying below
    catch (UnsupportedOperationException ex)
    {
    }
    // All other exceptions can be fatal
    catch (Exception e)
    {
      logger.error("Exception while symlinking image: ", e);
    }
    // - copy if the file doesn't exist yet
    if (!target_path.toFile().exists())
    {
      logger.info("ImageDocument - symlink failed, copying instead");
      try
      {
        Files.copy(source_path, target_path);
      }
      // Fatality!
      catch (Exception e)
      {
        logger.error("Exception while copying image: ", e);
      }
    }

    // Generate preview image
    logger.info("ImageDocument - generate preview image");
    try
    {
      String preview_filename = this.generatePreview(source_path, assoc_path);
      this.properties.put("preview", preview_filename);
    }
    catch (Exception e)
    {
      logger.error("Exception while generating preview image: ", e);
    }

    // Create a dummy reader around some dummy text and then tokenize it
    logger.info("ImageDocument - feed dummy text as token stream to indexer");
    try
    {
      this.reader = new StringReader(this.properties.get("abstract"));
      this.tokenizer = tok.tokenise(this.reader);
    }
    catch (Exception e)
    {
      logger.error("Exception while creating dummy text stream: ", e);
    }

    // Use OpenSIFT to generate a featureset (in Oxford format) for this image
    logger.info("ImageDocument - generate and record SIFT features");
    try
    {
      String sift_command[] = {
        "siftfeat",
        "-x",
        source_path.toString()
      };
      logger.info("ImageDocument - sift command: " + Arrays.toString(sift_command));
      Process sift_process = Runtime.getRuntime().exec(sift_command);
      // we'd usually send STDERR to /dev/null, but a streamgobbler is easier
      // in Java
      StreamGobbler sift_process_error_gobbler = new StreamGobbler(sift_process.getErrorStream());
      sift_process_error_gobbler.start();
      // the SIFT features, in Oxford format, will arrive from STDOUT
      BufferedReader sift_br = new BufferedReader(new InputStreamReader(sift_process.getInputStream()));
      String line;
      StringBuffer oxford_features = new StringBuffer();
      while ((line = sift_br.readLine()) != null)
      {
        // readLine() strips the newline, so restore it to keep the Oxford
        // format line-oriented
        oxford_features.append(line);
        oxford_features.append('\n');
      }
      // waitFor() blocks until the process completes (and yields its exit
      // status), which should be shortly after it emits the last line of
      // SIFT feature data
      int sift_status = sift_process.waitFor();
      this.properties.put("sift", oxford_features.toString());
    }
    catch (Exception e)
    {
      logger.error("Exception while generating SIFT features: ", e);
    }

    logger.info("ImageDocument - Complete!");
  }
  /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/

  /** Returns true when the end of the document has been reached, and there
   * are no other terms to be retrieved from it.
   * @return boolean true if there are no more terms in the document, otherwise
   * it returns false.
   */
  public boolean endOfDocument()
  {
    return !this.tokenizer.hasNext();
  }
  /** endOfDocument() **/

  /** Use ImageMagick to generate a preview image.
   * @pre assumes you have ImageMagick installed and available on the PATH
   * @pre uses member variables preview_format and preview_width
   * @return the filename of the preview image (within the assoc directory)
   */
  private String generatePreview(Path source_path, Path assoc_path)
    throws Exception
  {
    String preview_filename = "preview." + this.preview_format;
    Path preview_path = assoc_path.resolve(preview_filename);
    String convert_command[] = {
      "convert",
      source_path.toString(),
      "-resize",
      this.preview_width + "x",
      preview_path.toString()
    };
    logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
    Process convert_process = Runtime.getRuntime().exec(convert_command);
    // Gobble up the streams to prevent them hanging the process when buffers
    // are full
    StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
    convert_process_error_gobbler.start();
    StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
    convert_process_input_gobbler.start();
    // Let the conversion finish
    int convert_status = convert_process.waitFor();
    if (convert_status != 0 || !preview_path.toFile().exists())
    {
      throw new Exception("Convert command failed (exit status: " + convert_status + ")");
    }
    return preview_filename;
  }
  /** generatePreview(Path, Path) **/

  /** Returns the underlying map of all the properties defined by this Document.
   * @since 1.1.0
   */
  public Map<String,String> getAllProperties()
  {
    return this.properties;
  }
  /** getAllProperties() **/

  /** Returns a list of the fields the current term appears in.
   * @return HashSet a set of the fields that the current term appears in.
   */
  public Set<String> getFields()
  {
    // Returns an empty set because there is no support for fields with file
    // documents.
    return Collections.emptySet();
  }
  /** getFields() **/

  /** Gets the next term of the document.
   * <B>NB:</B> Null strings returned from getNextTerm() should
   * be ignored. They do not signify the lack of any more terms.
   * endOfDocument() should be used to check that.
   * @return String the next term of the document. Null returns should be
   * ignored.
   */
  public String getNextTerm()
  {
    return this.tokenizer.next();
  }
  /** getNextTerm() **/

  /** Allows access to a named property of the Document. Examples might be URL,
   * filename etc.
   * @param name Name of the property. It is suggested, but not required, that
   * this name should not be case sensitive.
   * @since 1.1.0
   */
  public String getProperty(String name)
  {
    return this.properties.get(name.toLowerCase());
  }
  /** getProperty(String name) **/

  /** Returns a Reader object so client code can tokenise the document
   * or deal with the document itself. Examples might be extracting URLs,
   * language detection. */
  public Reader getReader()
  {
    return this.reader;
  }
  /** getReader() **/

  /** Generates an MD5 hash of the given string, returned as a hex string.
   */
  private String generateHash(String string)
  {
    StringBuffer sb = new StringBuffer();
    try
    {
      final MessageDigest message_digest = MessageDigest.getInstance("MD5");
      message_digest.reset();
      message_digest.update(string.getBytes(Charset.forName("UTF8")));
      final byte[] result_bytes = message_digest.digest();
      for (int i = 0; i < result_bytes.length; ++i)
      {
        // format each byte as two zero-padded hex digits
        sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1, 3));
      }
    }
    catch (NoSuchAlgorithmException e)
    {
      System.err.println("Exception: " + e);
      System.exit(0);
    }
    return sb.toString();
  }
  /** generateHash(String) **/
}
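The preview format and width read through ApplicationSetup above are ordinary Terrier configuration properties, so they can be overridden rather than relying on the built-in defaults. A hypothetical fragment for a Terrier properties file (the property names come from the code; the values here are only examples) might be:

# ImageDocument settings (defaults are "jpg" and "200")
ImageDocument.preview_format=png
ImageDocument.preview_width=320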