source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26191

Last change on this file since 26191 was 26190, checked in by jmt12, 12 years ago

Moving the StreamGobbler - used in both plugins to prevent a full STDERR buffer killing the import - into its own class... my computer doesn't have an issue with exactly the same class occurring twice, but Medusa's one seems stricter in this regard

File size: 8.9 KB
Line 
1/**
2 * Adding support for Images in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.InputStream;
20import java.io.StringReader;
21import java.io.Reader;
22import java.nio.file.Files;
23import java.nio.file.Path;
24import java.nio.file.Paths;
25import java.util.Collections;
26import java.util.Arrays;
27import java.util.Map;
28import java.util.Set;
29
30import org.apache.log4j.Logger;
31import org.terrier.indexing.StreamGobbler;
32import org.terrier.indexing.tokenisation.TokenStream;
33import org.terrier.indexing.tokenisation.Tokeniser;
34import org.terrier.utility.ApplicationSetup;
35
36public class ImageDocument
37 implements Document
38{
39 /** A reference to the logger for messaging */
40 protected static final Logger logger = Logger.getLogger(FileDocument.class);
41 /** The map of properties (fields) for this document. */
42 protected Map<String,String> properties;
43 /** A reader built from a dummy text string. */
44 protected Reader reader;
45 /** A token stream produced by the configured tokeniser when feed the dummy
46 * reader.
47 */
48 protected TokenStream tokenizer;
49
50 /** The preview filetype. **/
51 protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
52 /** The preview size (width). **/
53 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
54
55 /** Default constructor. **/
56 protected ImageDocument() {}
57
58 /** Constructs an instance of the ImageDocument from the given input stream.
59 * @param docStream the input stream that reads the file.
60 * @param docProperties the initial properties (docno, filename)
61 * @param tok the tokeniser defined for this collection
62 */
63 public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
64 {
65 logger.info("ImageDocument::ImageDocument()");
66 // Initialization from arguments
67 this.properties = default_properties;
68
69 // Set properties
70 logger.info("ImageDocument - extracting properties");
71 // A. Hardcoded properties
72 this.properties.put("parser", "ImageDocument");
73 this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
74 // B. Properties derived from filename
75 String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
76 this.properties.put("title", title);
77 String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
78 String target_filename = "doc." + ext;
79 this.properties.put("source","doc." + ext);
80 String assoc_filename = "D" + properties.get("docno");
81 this.properties.put("assocfile", assoc_filename);
82
83 // Copy (symlink) the file into place in the shared directory
84 Path source_path = Paths.get(properties.get("filename"));
85 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
86 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
87 Path target_path = assoc_path.resolve(target_filename);
88 if (target_path.toFile().exists())
89 {
90 logger.info("ImageDocument - removing existing (old) associated image");
91 try
92 {
93 Files.delete(target_path);
94 }
95 catch (Exception e)
96 {
97 logger.error("Exception while deleting old image: ", e);
98 }
99 }
100 logger.info("ImageDocument - symlinking image into assoc directory");
101 try
102 {
103 Files.createSymbolicLink(target_path, source_path);
104 }
105 // not supported? We'll try copying below
106 catch (UnsupportedOperationException ex)
107 {
108 }
109 // All other exceptions can be fatal
110 catch (Exception e)
111 {
112 logger.error("Exception while symlinking image: ", e);
113 }
114 // - copy if the file doesn't exist yet
115 if (!target_path.toFile().exists())
116 {
117 logger.info("ImageDocument - symlink filaed, copying instead");
118 try
119 {
120 Files.copy(source_path, target_path);
121 }
122 // Fatality!
123 catch (Exception e)
124 {
125 logger.error("Exception while copying image: ", e);
126 }
127 }
128
129 // Generate preview image
130 logger.info("ImageDocument - generate preview image");
131 try
132 {
133 String preview_filename = this.generatePreview(source_path, assoc_path);
134 this.properties.put("preview",preview_filename);
135 }
136 catch (Exception e)
137 {
138 logger.error("Exception while generating preview image: ", e);
139 }
140
141 // Create a dummy reader around some dummy text and then tokenize it
142 logger.info("ImageDocument - feed dummy text as token stream to indexer");
143 try
144 {
145 this.reader = new StringReader(this.properties.get("abstract"));
146 this.tokenizer = tok.tokenise(this.reader);
147 }
148 catch (Exception e)
149 {
150 logger.error("Exception while creating dummy text stream: ", e);
151 }
152 logger.info("ImageDocument - Complete!");
153 }
154 /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
155
156 /** Returns true when the end of the document has been reached, and there
157 * are no other terms to be retrieved from it.
158 * @return boolean true if there are no more terms in the document, otherwise
159 * it returns false.
160 */
161 public boolean endOfDocument()
162 {
163 return !this.tokenizer.hasNext();
164 }
165 /** endOfDocument() **/
166
167 /** Use ImageMagick to generate a preview image.
168 * @pre assumes you have ImageMagick installed and available on Path
169 * @pre uses member variables preview_format and preview_width
170 * @return the filename of the preview image (within the assoc directory)
171 */
172 private String generatePreview(Path source_path, Path assoc_path)
173 throws Exception
174 {
175 String preview_filename = "preview." + this.preview_format;
176 Path preview_path = assoc_path.resolve(preview_filename);
177 String convert_command[] = {
178 "convert",
179 source_path.toString(),
180 "-resize",
181 this.preview_width + "x",
182 preview_path.toString()
183 };
184 logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
185 Process convert_process = Runtime.getRuntime().exec(convert_command);
186 // Gobble up the streams to prevent them hanging the process when buffers
187 // are full
188 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
189 convert_process_error_gobbler.start();
190 StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
191 convert_process_input_gobbler.start();
192 // Let the conversion finish
193 int convert_status = convert_process.waitFor();
194 if (convert_status != 0 || !preview_path.toFile().exists())
195 {
196 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
197 }
198 return preview_filename;
199 }
200 /** generatePreview(Path, Path) **/
201
202 /** Returns the underlying map of all the properties defined by this Document.
203 * @since 1.1.0
204 */
205 public Map<String,String> getAllProperties()
206 {
207 return this.properties;
208 }
209 /** getAllProperties() **/
210
211 /** Returns a list of the fields the current term appears in.
212 * @return HashSet a set of the terms that the current term appears in.
213 */
214 public Set<String> getFields()
215 {
216 // Returns null because there is no support for fields with file documents.
217 return Collections.emptySet();
218 }
219 /** getFields() **/
220
221 /** Gets the next term of the document.
222 * <B>NB:</B>Null string returned from getNextTerm() should
223 * be ignored. They do not signify the lack of any more terms.
224 * endOfDocument() should be used to check that.
225 * @return String the next term of the document. Null returns should be
226 * ignored.
227 */
228 public String getNextTerm()
229 {
230 return this.tokenizer.next();
231 }
232 /** getNextTerm() **/
233
234 /** Allows access to a named property of the Document. Examples might be URL,
235 * filename etc.
236 * @param name Name of the property. It is suggested, but not required that
237 * this name should not be case insensitive.
238 * @since 1.1.0
239 */
240 public String getProperty(String name)
241 {
242 return this.properties.get(name.toLowerCase());
243 }
244 /** getProperty(String name) **/
245
246 /** Returns a Reader object so client code can tokenise the document
247 * or deal with the document itself. Examples might be extracting URLs,
248 * language detection. */
249 public Reader getReader()
250 {
251 return this.reader;
252 }
253 /** getReader() **/
254}
255
Note: See TracBrowser for help on using the repository browser.