source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26186

Last change on this file since 26186 was 26186, checked in by jmt12, 12 years ago

Adding in (optional) support for video and image processing in DSpace and Terrier. These kinda belong here as they depend on the video-and-audio support (like MediaInfo, HandbrakeCLI, and Hive2) to work

File size: 10.1 KB
Line 
1/**
2 * Adding support for Images in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.BufferedOutputStream;
20import java.io.BufferedReader;
21import java.io.FileOutputStream;
22import java.io.InputStream;
23import java.io.InputStreamReader;
24import java.io.IOException;
25import java.io.PrintWriter;
26import java.io.StringReader;
27import java.io.Reader;
28import java.lang.Thread;
29import java.nio.file.Files;
30import java.nio.file.Path;
31import java.nio.file.Paths;
32import java.util.Collections;
33import java.util.Arrays;
34import java.util.Map;
35import java.util.Set;
36
37import org.apache.log4j.Logger;
38import org.terrier.indexing.tokenisation.TokenStream;
39import org.terrier.indexing.tokenisation.Tokeniser;
40import org.terrier.utility.ApplicationSetup;
41
42public class ImageDocument
43 implements Document
44{
45 /** A reference to the logger for messaging */
46 protected static final Logger logger = Logger.getLogger(FileDocument.class);
47 /** The map of properties (fields) for this document. */
48 protected Map<String,String> properties;
49 /** A reader built from a dummy text string. */
50 protected Reader reader;
51 /** A token stream produced by the configured tokeniser when feed the dummy
52 * reader.
53 */
54 protected TokenStream tokenizer;
55
56 /** The preview filetype. **/
57 protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
58 /** The preview size (width). **/
59 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
60
61 /** Default constructor. **/
62 protected ImageDocument() {}
63
64 /** Constructs an instance of the ImageDocument from the given input stream.
65 * @param docStream the input stream that reads the file.
66 * @param docProperties the initial properties (docno, filename)
67 * @param tok the tokeniser defined for this collection
68 */
69 public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
70 {
71 logger.info("ImageDocument::ImageDocument()");
72 // Initialization from arguments
73 this.properties = default_properties;
74
75 // Set properties
76 logger.info("ImageDocument - extracting properties");
77 // A. Hardcoded properties
78 this.properties.put("parser", "ImageDocument");
79 this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
80 // B. Properties derived from filename
81 String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
82 this.properties.put("title", title);
83 String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
84 String target_filename = "doc." + ext;
85 this.properties.put("source","doc." + ext);
86 String assoc_filename = "D" + properties.get("docno");
87 this.properties.put("assocfile", assoc_filename);
88
89 // Copy (symlink) the file into place in the shared directory
90 Path source_path = Paths.get(properties.get("filename"));
91 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
92 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
93 Path target_path = assoc_path.resolve(target_filename);
94 if (target_path.toFile().exists())
95 {
96 logger.info("ImageDocument - removing existing (old) associated image");
97 try
98 {
99 Files.delete(target_path);
100 }
101 catch (Exception e)
102 {
103 logger.error("Exception while deleting old image: ", e);
104 }
105 }
106 logger.info("ImageDocument - symlinking image into assoc directory");
107 try
108 {
109 Files.createSymbolicLink(target_path, source_path);
110 }
111 // not supported? We'll try copying below
112 catch (UnsupportedOperationException ex)
113 {
114 }
115 // All other exceptions can be fatal
116 catch (Exception e)
117 {
118 logger.error("Exception while symlinking image: ", e);
119 }
120 // - copy if the file doesn't exist yet
121 if (!target_path.toFile().exists())
122 {
123 logger.info("ImageDocument - symlink filaed, copying instead");
124 try
125 {
126 Files.copy(source_path, target_path);
127 }
128 // Fatality!
129 catch (Exception e)
130 {
131 logger.error("Exception while copying image: ", e);
132 }
133 }
134
135 // Generate preview image
136 logger.info("ImageDocument - generate preview image");
137 try
138 {
139 String preview_filename = this.generatePreview(source_path, assoc_path);
140 this.properties.put("preview",preview_filename);
141 }
142 catch (Exception e)
143 {
144 logger.error("Exception while generating preview image: ", e);
145 }
146
147 // Create a dummy reader around some dummy text and then tokenize it
148 logger.info("ImageDocument - feed dummy text as token stream to indexer");
149 try
150 {
151 this.reader = new StringReader(this.properties.get("abstract"));
152 this.tokenizer = tok.tokenise(this.reader);
153 }
154 catch (Exception e)
155 {
156 logger.error("Exception while creating dummy text stream: ", e);
157 }
158 logger.info("ImageDocument - Complete!");
159 }
160 /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
161
162 /** Returns true when the end of the document has been reached, and there
163 * are no other terms to be retrieved from it.
164 * @return boolean true if there are no more terms in the document, otherwise
165 * it returns false.
166 */
167 public boolean endOfDocument()
168 {
169 return !this.tokenizer.hasNext();
170 }
171 /** endOfDocument() **/
172
173 /** Use ImageMagick to generate a preview image.
174 * @pre assumes you have ImageMagick installed and available on Path
175 * @pre uses member variables preview_format and preview_width
176 * @return the filename of the preview image (within the assoc directory)
177 */
178 private String generatePreview(Path source_path, Path assoc_path)
179 throws Exception
180 {
181 String preview_filename = "preview." + this.preview_format;
182 Path preview_path = assoc_path.resolve(preview_filename);
183 String convert_command[] = {
184 "convert",
185 source_path.toString(),
186 "-resize",
187 this.preview_width + "x",
188 preview_path.toString()
189 };
190 logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
191 Process convert_process = Runtime.getRuntime().exec(convert_command);
192 // Gobble up the streams to prevent them hanging the process when buffers
193 // are full
194 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
195 convert_process_error_gobbler.start();
196 StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
197 convert_process_input_gobbler.start();
198 // Let the conversion finish
199 int convert_status = convert_process.waitFor();
200 if (convert_status != 0 || !preview_path.toFile().exists())
201 {
202 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
203 }
204 return preview_filename;
205 }
206 /** generatePreview(Path, Path) **/
207
208 /** Returns the underlying map of all the properties defined by this Document.
209 * @since 1.1.0
210 */
211 public Map<String,String> getAllProperties()
212 {
213 return this.properties;
214 }
215 /** getAllProperties() **/
216
217 /** Returns a list of the fields the current term appears in.
218 * @return HashSet a set of the terms that the current term appears in.
219 */
220 public Set<String> getFields()
221 {
222 // Returns null because there is no support for fields with file documents.
223 return Collections.emptySet();
224 }
225 /** getFields() **/
226
227 /** Gets the next term of the document.
228 * <B>NB:</B>Null string returned from getNextTerm() should
229 * be ignored. They do not signify the lack of any more terms.
230 * endOfDocument() should be used to check that.
231 * @return String the next term of the document. Null returns should be
232 * ignored.
233 */
234 public String getNextTerm()
235 {
236 return this.tokenizer.next();
237 }
238 /** getNextTerm() **/
239
240 /** Allows access to a named property of the Document. Examples might be URL,
241 * filename etc.
242 * @param name Name of the property. It is suggested, but not required that
243 * this name should not be case insensitive.
244 * @since 1.1.0
245 */
246 public String getProperty(String name)
247 {
248 return this.properties.get(name.toLowerCase());
249 }
250 /** getProperty(String name) **/
251
252 /** Returns a Reader object so client code can tokenise the document
253 * or deal with the document itself. Examples might be extracting URLs,
254 * language detection. */
255 public Reader getReader()
256 {
257 return this.reader;
258 }
259 /** getReader() **/
260}
261
/**
 * A thread that reads an input stream through to its end so that whatever is
 * producing the stream never stalls on a full buffer.
 *
 * With the one-argument constructor the data is thrown away; with the
 * two-argument constructor it is written, line by line, to the named file.
 * In both cases the input stream is closed when the thread finishes.
 */
class StreamGobbler
extends Thread
{
  /** The stream to drain. */
  InputStream is;
  /** Where to write the drained lines; only set when output_to_file is true. */
  String file_path;
  /** True to write what is read to file_path, false to discard it. */
  boolean output_to_file;

  /** Creates a gobbler that discards everything it reads.
   *  @param is the stream to drain
   */
  StreamGobbler(InputStream is)
  {
    this.is = is;
    this.output_to_file = false;
  }

  /** Creates a gobbler that writes everything it reads to a file.
   *  @param is the stream to drain
   *  @param file_path the path of the file to (over)write
   */
  StreamGobbler(InputStream is, String file_path)
  {
    this.is = is;
    this.file_path = file_path;
    this.output_to_file = true;
  }

  /** Drains the stream to its end. I/O errors are reported on standard error
   *  and end the thread; they are not rethrown.
   */
  @Override
  public void run()
  {
    try
    {
      if (output_to_file)
      {
        this.copyLinesToFile();
      }
      else
      {
        this.discardAll();
      }
    }
    catch (IOException ioe)
    {
      ioe.printStackTrace();
    }
  }

  /** Copies the stream to file_path one line at a time, terminating each line
   *  with the platform line separator. Reader and writer are always closed.
   */
  private void copyLinesToFile()
    throws IOException
  {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(is));
         PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(file_path))))
    {
      String line;
      while ((line = br.readLine()) != null)
      {
        pw.println(line);
      }
      pw.flush();
    }
  }

  /** Reads and throws away everything on the stream - equivalent to
   *  > /dev/null. Raw bytes are read into a fixed buffer so that output
   *  without line breaks cannot accumulate in memory.
   */
  private void discardAll()
    throws IOException
  {
    try (InputStream in = is)
    {
      byte[] scratch = new byte[4096];
      while (in.read(scratch) != -1)
      {
        // Do nothing with the data
      }
    }
  }
}
Note: See TracBrowser for help on using the repository browser.