source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26214

Last change on this file since 26214 was 26214, checked in by jmt12, 12 years ago

New hash based generation for associated files directory - so docno is no longer essential

File size: 11.2 KB
Line 
1/**
2 * Adding support for Images in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.terrier.indexing.StreamGobbler;
import org.terrier.indexing.tokenisation.TokenStream;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
38
39public class ImageDocument
40 implements Document
41{
42 /** A reference to the logger for messaging */
43 protected static final Logger logger = Logger.getLogger(FileDocument.class);
44 /** The map of properties (fields) for this document. */
45 protected Map<String,String> properties;
46 /** A reader built from a dummy text string. */
47 protected Reader reader;
48 /** A token stream produced by the configured tokeniser when feed the dummy
49 * reader.
50 */
51 protected TokenStream tokenizer;
52
53 /** The preview filetype. **/
54 protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
55 /** The preview size (width). **/
56 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
57
58 /** Default constructor. **/
59 protected ImageDocument() {}
60
61 /** Constructs an instance of the ImageDocument from the given input stream.
62 * @param docStream the input stream that reads the file.
63 * @param docProperties the initial properties (docno, filename)
64 * @param tok the tokeniser defined for this collection
65 */
66 public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
67 {
68 logger.info("ImageDocument::ImageDocument()");
69 // Initialization from arguments
70 this.properties = default_properties;
71
72 // Set properties
73 logger.info("ImageDocument - current properties");
74 for (Map.Entry<String, String> entry : this.properties.entrySet())
75 {
76 logger.info(entry.getKey() + "=" + entry.getValue());
77 }
78
79 logger.info("ImageDocument - extracting properties");
80 // A. Hardcoded properties
81 this.properties.put("parser", "ImageDocument");
82 this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
83 // B. Properties derived from filename
84 // - A simple title for the document
85 String filepath = this.properties.get("filename");
86 String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
87 this.properties.put("title", title);
88 String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
89 // - The name of the copy of the original document
90 String target_filename = "doc." + ext;
91 this.properties.put("source","doc." + ext);
92 // - A unique associated directory. This gets a little tricky as we need
93 // to create the directory at the same time if an effort to promote
94 // synchronous behaviour
95 String unique_id = this.generateHash(filepath);
96 // - we start with the first 4 characters
97 int offset = 0;
98 String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
99 // - we add ".dir" as a suffix to the directory that actually contains
100 // files (so the non-suffixed version contains nested directories)
101 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
102 // - then we continue adding blocks of 4 characters until we get a
103 // directory that doesn't already exist
104 while (assoc_path.toFile().exists() && offset < unique_id.length())
105 {
106 offset += 4;
107 assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
108 assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
109 }
110 // - still not unique? but run out of unique_id... time to complain
111 if (assoc_path.toFile().exists())
112 {
113 logger.error("ImageDoument - can't determine unique assocfilepath");
114 System.exit(0);
115 }
116 // - create the directories quick... hopefully before someone else does
117 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
118 this.properties.put("assocfile", assoc_filename);
119
120 // Copy (symlink) the file into place in the shared directory
121 Path source_path = Paths.get(properties.get("filename"));
122 Path target_path = assoc_path.resolve(target_filename);
123 if (target_path.toFile().exists())
124 {
125 logger.info("ImageDocument - removing existing (old) associated image");
126 try
127 {
128 Files.delete(target_path);
129 }
130 catch (Exception e)
131 {
132 logger.error("Exception while deleting old image: ", e);
133 }
134 }
135 logger.info("ImageDocument - symlinking image into assoc directory");
136 try
137 {
138 Files.createSymbolicLink(target_path, source_path);
139 }
140 // not supported? We'll try copying below
141 catch (UnsupportedOperationException ex)
142 {
143 }
144 // All other exceptions can be fatal
145 catch (Exception e)
146 {
147 logger.error("Exception while symlinking image: ", e);
148 }
149 // - copy if the file doesn't exist yet
150 if (!target_path.toFile().exists())
151 {
152 logger.info("ImageDocument - symlink filaed, copying instead");
153 try
154 {
155 Files.copy(source_path, target_path);
156 }
157 // Fatality!
158 catch (Exception e)
159 {
160 logger.error("Exception while copying image: ", e);
161 }
162 }
163
164 // Generate preview image
165 logger.info("ImageDocument - generate preview image");
166 try
167 {
168 String preview_filename = this.generatePreview(source_path, assoc_path);
169 this.properties.put("preview",preview_filename);
170 }
171 catch (Exception e)
172 {
173 logger.error("Exception while generating preview image: ", e);
174 }
175
176 // Create a dummy reader around some dummy text and then tokenize it
177 logger.info("ImageDocument - feed dummy text as token stream to indexer");
178 try
179 {
180 this.reader = new StringReader(this.properties.get("abstract"));
181 this.tokenizer = tok.tokenise(this.reader);
182 }
183 catch (Exception e)
184 {
185 logger.error("Exception while creating dummy text stream: ", e);
186 }
187 logger.info("ImageDocument - Complete!");
188 }
189 /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
190
191 /** Returns true when the end of the document has been reached, and there
192 * are no other terms to be retrieved from it.
193 * @return boolean true if there are no more terms in the document, otherwise
194 * it returns false.
195 */
196 public boolean endOfDocument()
197 {
198 return !this.tokenizer.hasNext();
199 }
200 /** endOfDocument() **/
201
202 /** Use ImageMagick to generate a preview image.
203 * @pre assumes you have ImageMagick installed and available on Path
204 * @pre uses member variables preview_format and preview_width
205 * @return the filename of the preview image (within the assoc directory)
206 */
207 private String generatePreview(Path source_path, Path assoc_path)
208 throws Exception
209 {
210 String preview_filename = "preview." + this.preview_format;
211 Path preview_path = assoc_path.resolve(preview_filename);
212 String convert_command[] = {
213 "convert",
214 source_path.toString(),
215 "-resize",
216 this.preview_width + "x",
217 preview_path.toString()
218 };
219 logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
220 Process convert_process = Runtime.getRuntime().exec(convert_command);
221 // Gobble up the streams to prevent them hanging the process when buffers
222 // are full
223 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
224 convert_process_error_gobbler.start();
225 StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
226 convert_process_input_gobbler.start();
227 // Let the conversion finish
228 int convert_status = convert_process.waitFor();
229 if (convert_status != 0 || !preview_path.toFile().exists())
230 {
231 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
232 }
233 return preview_filename;
234 }
235 /** generatePreview(Path, Path) **/
236
237 /** Returns the underlying map of all the properties defined by this Document.
238 * @since 1.1.0
239 */
240 public Map<String,String> getAllProperties()
241 {
242 return this.properties;
243 }
244 /** getAllProperties() **/
245
246 /** Returns a list of the fields the current term appears in.
247 * @return HashSet a set of the terms that the current term appears in.
248 */
249 public Set<String> getFields()
250 {
251 // Returns null because there is no support for fields with file documents.
252 return Collections.emptySet();
253 }
254 /** getFields() **/
255
256 /** Gets the next term of the document.
257 * <B>NB:</B>Null string returned from getNextTerm() should
258 * be ignored. They do not signify the lack of any more terms.
259 * endOfDocument() should be used to check that.
260 * @return String the next term of the document. Null returns should be
261 * ignored.
262 */
263 public String getNextTerm()
264 {
265 return this.tokenizer.next();
266 }
267 /** getNextTerm() **/
268
269 /** Allows access to a named property of the Document. Examples might be URL,
270 * filename etc.
271 * @param name Name of the property. It is suggested, but not required that
272 * this name should not be case insensitive.
273 * @since 1.1.0
274 */
275 public String getProperty(String name)
276 {
277 return this.properties.get(name.toLowerCase());
278 }
279 /** getProperty(String name) **/
280
281 /** Returns a Reader object so client code can tokenise the document
282 * or deal with the document itself. Examples might be extracting URLs,
283 * language detection. */
284 public Reader getReader()
285 {
286 return this.reader;
287 }
288 /** getReader() **/
289
290 /**
291 */
292 private String generateHash(String string)
293 {
294 StringBuffer sb = new StringBuffer();
295 try
296 {
297 final MessageDigest message_digest = MessageDigest.getInstance("MD5");
298 message_digest.reset();
299 message_digest.update(string.getBytes(Charset.forName("UTF8")));
300 final byte[] result_bytes = message_digest.digest();
301 for (int i = 0; i < result_bytes.length; ++i)
302 {
303 sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1,3));
304 }
305 }
306 catch (NoSuchAlgorithmException e)
307 {
308 System.err.println("Exception: " + e);
309 System.exit(0);
310 }
311 return sb.toString();
312 }
313 /** generateHash(String) **/
314}
315
Note: See TracBrowser for help on using the repository browser.