source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java

Last change on this file was 29648, checked in by jmt12, 9 years ago

Extending the Image document class with SIFT processing so as to trigger greater CPU load. Makes use of stream gobbler... gobble-gobble

File size: 12.5 KB
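
The stream gobbler mentioned in the change note above is imported from org.terrier.indexing but not shown in this file. As used below it is constructed from an InputStream and started like a Thread, so a minimal sketch (an assumption about its implementation, not the actual class) that simply drains a child process's output to stop it blocking on a full pipe buffer might look like:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

/** Hypothetical sketch of the StreamGobbler used below: a thread that
 * consumes and discards everything written to the given stream. */
class StreamGobbler extends Thread
{
  private final InputStream stream;

  public StreamGobbler(InputStream stream)
  {
    this.stream = stream;
  }

  public void run()
  {
    try
    {
      BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
      while (reader.readLine() != null)
      {
        // discard the output; the point is only to keep the pipe drained
      }
    }
    catch (IOException e)
    {
      // ignore - the child process has most likely just exited
    }
  }
}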
/**
 * Adding support for Images in Terrier
 * @author: John Thompson, jmt12, #9826509
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
 */
package org.terrier.indexing;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Collections;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.terrier.indexing.StreamGobbler;
import org.terrier.indexing.tokenisation.TokenStream;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
public class ImageDocument
  implements Document
{
  /** A reference to the logger for messaging */
  protected static final Logger logger = Logger.getLogger(ImageDocument.class);
  /** The map of properties (fields) for this document. */
  protected Map<String,String> properties;
  /** A reader built from a dummy text string. */
  protected Reader reader;
  /** A token stream produced by the configured tokeniser when fed the dummy
   * reader.
   */
  protected TokenStream tokenizer;

  /** The preview filetype. **/
  protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
  /** The preview size (width). **/
  protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");

  /** Default constructor. **/
  protected ImageDocument() {}

  /** Constructs an instance of the ImageDocument from the given input stream.
   * @param istream the input stream that reads the file.
   * @param default_properties the initial properties (docno, filename)
   * @param tok the tokeniser defined for this collection
   */
  public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
  {
    logger.info("ImageDocument::ImageDocument()");
    // Initialization from arguments
    this.properties = default_properties;

    // Set properties
    logger.info("ImageDocument - current properties");
    for (Map.Entry<String, String> entry : this.properties.entrySet())
    {
      logger.info(entry.getKey() + "=" + entry.getValue());
    }

    logger.info("ImageDocument - extracting properties");
    // A. Hardcoded properties
    this.properties.put("parser", "ImageDocument");
    this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
    // B. Properties derived from filename
    // - A simple title for the document
    String filepath = this.properties.get("filename");
    String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
    this.properties.put("title", title);
    String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
    // - The name of the copy of the original document
    String target_filename = "doc." + ext;
    this.properties.put("source", "doc." + ext);
    // - A unique associated directory. This gets a little tricky as we need
    //   to create the directory at the same time in an effort to promote
    //   synchronous behaviour
    String unique_id = this.generateHash(filepath);
    // - we start with the first 4 characters
    int offset = 0;
    String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
    // - we add ".dir" as a suffix to the directory that actually contains
    //   files (so the non-suffixed version contains nested directories)
    Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
    // - then we continue adding blocks of 4 characters until we get a
    //   directory that doesn't already exist (stopping while a complete block
    //   of 4 characters remains in the hash)
    while (assoc_path.toFile().exists() && offset + 8 <= unique_id.length())
    {
      offset += 4;
      assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
      assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
    }
    // - still not unique? but run out of unique_id... time to complain
    if (assoc_path.toFile().exists())
    {
      logger.error("ImageDocument - can't determine unique assocfilepath");
      System.exit(0);
    }
    // - create the directories quick... hopefully before someone else does
    assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
    this.properties.put("assocfile", assoc_filename);

    // Copy (symlink) the file into place in the shared directory
    Path source_path = Paths.get(properties.get("filename"));
    Path target_path = assoc_path.resolve(target_filename);
    if (target_path.toFile().exists())
    {
      logger.info("ImageDocument - removing existing (old) associated image");
      try
      {
        Files.delete(target_path);
      }
      catch (Exception e)
      {
        logger.error("Exception while deleting old image: ", e);
      }
    }
    logger.info("ImageDocument - symlinking image into assoc directory");
    try
    {
      Files.createSymbolicLink(target_path, source_path);
    }
    // not supported? We'll try copying below
    catch (UnsupportedOperationException ex)
    {
    }
    // All other exceptions can be fatal
    catch (Exception e)
    {
      logger.error("Exception while symlinking image: ", e);
    }
    // - copy if the file doesn't exist yet
    if (!target_path.toFile().exists())
    {
      logger.info("ImageDocument - symlink failed, copying instead");
      try
      {
        Files.copy(source_path, target_path);
      }
      // Fatality!
      catch (Exception e)
      {
        logger.error("Exception while copying image: ", e);
      }
    }

    // Generate preview image
    logger.info("ImageDocument - generate preview image");
    try
    {
      String preview_filename = this.generatePreview(source_path, assoc_path);
      this.properties.put("preview", preview_filename);
    }
    catch (Exception e)
    {
      logger.error("Exception while generating preview image: ", e);
    }

    // Create a dummy reader around some dummy text and then tokenize it
    logger.info("ImageDocument - feed dummy text as token stream to indexer");
    try
    {
      this.reader = new StringReader(this.properties.get("abstract"));
      this.tokenizer = tok.tokenise(this.reader);
    }
    catch (Exception e)
    {
      logger.error("Exception while creating dummy text stream: ", e);
    }

    // Use OpenSIFT to generate a featureset (in Oxford format) for this image
    logger.info("ImageDocument - generate and record SIFT features");
    try
    {
      String sift_command[] = {
        "siftfeat",
        "-x",
        source_path.toString()
      };
      logger.info("ImageDocument - sift command: " + Arrays.toString(sift_command));
      Process sift_process = Runtime.getRuntime().exec(sift_command);
      // we'd usually send STDERR to /dev/null, but a streamgobbler is easier
      // in Java
      StreamGobbler sift_process_error_gobbler = new StreamGobbler(sift_process.getErrorStream());
      sift_process_error_gobbler.start();
      // the SIFT features, in Oxford format, will arrive from STDOUT
      BufferedReader sift_br = new BufferedReader(new InputStreamReader(sift_process.getInputStream()));
      String line;
      StringBuffer oxford_features = new StringBuffer();
      while ((line = sift_br.readLine()) != null)
      {
        // readLine() strips the newline, so restore it to keep the Oxford
        // format line-oriented
        oxford_features.append(line);
        oxford_features.append('\n');
      }
      // waitFor() blocks until the process completes (and yields its exit
      // status), which should be shortly after it emits the last line of
      // SIFT feature data
      int sift_status = sift_process.waitFor();
      this.properties.put("sift", oxford_features.toString());
    }
    catch (Exception e)
    {
      logger.error("Exception while generating SIFT features: ", e);
    }

    logger.info("ImageDocument - Complete!");
  }
  /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/

  /** Returns true when the end of the document has been reached, and there
   * are no other terms to be retrieved from it.
   * @return boolean true if there are no more terms in the document, otherwise
   * it returns false.
   */
  public boolean endOfDocument()
  {
    return !this.tokenizer.hasNext();
  }
  /** endOfDocument() **/

  /** Use ImageMagick to generate a preview image.
   * @pre assumes you have ImageMagick installed and available on the PATH
   * @pre uses member variables preview_format and preview_width
   * @return the filename of the preview image (within the assoc directory)
   */
  private String generatePreview(Path source_path, Path assoc_path)
    throws Exception
  {
    String preview_filename = "preview." + this.preview_format;
    Path preview_path = assoc_path.resolve(preview_filename);
    String convert_command[] = {
      "convert",
      source_path.toString(),
      "-resize",
      this.preview_width + "x",
      preview_path.toString()
    };
    logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
    Process convert_process = Runtime.getRuntime().exec(convert_command);
    // Gobble up the streams to prevent them hanging the process when buffers
    // are full
    StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
    convert_process_error_gobbler.start();
    StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
    convert_process_input_gobbler.start();
    // Let the conversion finish
    int convert_status = convert_process.waitFor();
    if (convert_status != 0 || !preview_path.toFile().exists())
    {
      throw new Exception("Convert command failed (exit status: " + convert_status + ")");
    }
    return preview_filename;
  }
  /** generatePreview(Path, Path) **/

  /** Returns the underlying map of all the properties defined by this Document.
   * @since 1.1.0
   */
  public Map<String,String> getAllProperties()
  {
    return this.properties;
  }
  /** getAllProperties() **/

  /** Returns a list of the fields the current term appears in.
   * @return HashSet a set of the fields that the current term appears in.
   */
  public Set<String> getFields()
  {
    // Returns an empty set because there is no support for fields with file
    // documents.
    return Collections.emptySet();
  }
  /** getFields() **/

  /** Gets the next term of the document.
   * <B>NB:</B> Null strings returned from getNextTerm() should
   * be ignored. They do not signify the lack of any more terms.
   * endOfDocument() should be used to check that.
   * @return String the next term of the document. Null returns should be
   * ignored.
   */
  public String getNextTerm()
  {
    return this.tokenizer.next();
  }
  /** getNextTerm() **/

  /** Allows access to a named property of the Document. Examples might be URL,
   * filename etc.
   * @param name Name of the property. It is suggested, but not required, that
   * this name should not be case sensitive.
   * @since 1.1.0
   */
  public String getProperty(String name)
  {
    return this.properties.get(name.toLowerCase());
  }
  /** getProperty(String name) **/

  /** Returns a Reader object so client code can tokenise the document
   * or deal with the document itself. Examples might be extracting URLs,
   * language detection. */
  public Reader getReader()
  {
    return this.reader;
  }
  /** getReader() **/

  /** Generates an MD5 hash of the given string, returned as a hex string.
   */
  private String generateHash(String string)
  {
    StringBuffer sb = new StringBuffer();
    try
    {
      final MessageDigest message_digest = MessageDigest.getInstance("MD5");
      message_digest.reset();
      message_digest.update(string.getBytes(Charset.forName("UTF8")));
      final byte[] result_bytes = message_digest.digest();
      for (int i = 0; i < result_bytes.length; ++i)
      {
        // format each byte as two zero-padded hex digits
        sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1, 3));
      }
    }
    catch (NoSuchAlgorithmException e)
    {
      System.err.println("Exception: " + e);
      System.exit(0);
    }
    return sb.toString();
  }
  /** generateHash(String) **/
}
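The preview format and width read through ApplicationSetup above are ordinary Terrier configuration properties, so they can be overridden rather than relying on the built-in defaults. A hypothetical fragment for a Terrier properties file (the property names come from the code; the values here are only examples) might be:

# ImageDocument settings (defaults are "jpg" and "200")
ImageDocument.preview_format=png
ImageDocument.preview_width=320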