source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26208

Last change on this file since 26208 was 26208, checked in by jmt12, 12 years ago

Printing out document properties at start of 'plugin' pass

File size: 9.3 KB
Line 
1/**
2 * Adding support for Images in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.InputStream;
20import java.io.StringReader;
21import java.io.Reader;
22import java.nio.file.Files;
23import java.nio.file.Path;
24import java.nio.file.Paths;
25import java.util.Collections;
26import java.util.Arrays;
27import java.util.Map;
28import java.util.Set;
29
30import org.apache.log4j.Logger;
31import org.terrier.indexing.StreamGobbler;
32import org.terrier.indexing.tokenisation.TokenStream;
33import org.terrier.indexing.tokenisation.Tokeniser;
34import org.terrier.utility.ApplicationSetup;
35
36public class ImageDocument
37 implements Document
38{
39 /** A reference to the logger for messaging */
40 protected static final Logger logger = Logger.getLogger(FileDocument.class);
41 /** The map of properties (fields) for this document. */
42 protected Map<String,String> properties;
43 /** A reader built from a dummy text string. */
44 protected Reader reader;
45 /** A token stream produced by the configured tokeniser when feed the dummy
46 * reader.
47 */
48 protected TokenStream tokenizer;
49
50 /** The preview filetype. **/
51 protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
52 /** The preview size (width). **/
53 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
54
55 /** Default constructor. **/
56 protected ImageDocument() {}
57
58 /** Constructs an instance of the ImageDocument from the given input stream.
59 * @param docStream the input stream that reads the file.
60 * @param docProperties the initial properties (docno, filename)
61 * @param tok the tokeniser defined for this collection
62 */
63 public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
64 {
65 logger.info("ImageDocument::ImageDocument()");
66 // Initialization from arguments
67 this.properties = default_properties;
68
69 // Set properties
70 logger.info("ImageDocument - current properties");
71 for (Map.Entry<String, String> entry : this.properties.entrySet())
72 {
73 logger.info(entry.getKey() + "=" + entry.getValue());
74 }
75
76 logger.info("ImageDocument - extracting properties");
77 // A. Hardcoded properties
78 this.properties.put("parser", "ImageDocument");
79 this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
80 // B. Properties derived from filename
81 String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
82 this.properties.put("title", title);
83 String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
84 String target_filename = "doc." + ext;
85 this.properties.put("source","doc." + ext);
86 String assoc_filename = "D" + this.properties.get("docno");
87 if (assoc_filename.equals("Dnull"))
88 {
89 System.err.println("Error! Bogus assoc dir: " + this.properties.get("docno"));
90 System.exit(0);
91 }
92
93 this.properties.put("assocfile", assoc_filename);
94
95 // Copy (symlink) the file into place in the shared directory
96 Path source_path = Paths.get(properties.get("filename"));
97 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
98 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
99 Path target_path = assoc_path.resolve(target_filename);
100 if (target_path.toFile().exists())
101 {
102 logger.info("ImageDocument - removing existing (old) associated image");
103 try
104 {
105 Files.delete(target_path);
106 }
107 catch (Exception e)
108 {
109 logger.error("Exception while deleting old image: ", e);
110 }
111 }
112 logger.info("ImageDocument - symlinking image into assoc directory");
113 try
114 {
115 Files.createSymbolicLink(target_path, source_path);
116 }
117 // not supported? We'll try copying below
118 catch (UnsupportedOperationException ex)
119 {
120 }
121 // All other exceptions can be fatal
122 catch (Exception e)
123 {
124 logger.error("Exception while symlinking image: ", e);
125 }
126 // - copy if the file doesn't exist yet
127 if (!target_path.toFile().exists())
128 {
129 logger.info("ImageDocument - symlink filaed, copying instead");
130 try
131 {
132 Files.copy(source_path, target_path);
133 }
134 // Fatality!
135 catch (Exception e)
136 {
137 logger.error("Exception while copying image: ", e);
138 }
139 }
140
141 // Generate preview image
142 logger.info("ImageDocument - generate preview image");
143 try
144 {
145 String preview_filename = this.generatePreview(source_path, assoc_path);
146 this.properties.put("preview",preview_filename);
147 }
148 catch (Exception e)
149 {
150 logger.error("Exception while generating preview image: ", e);
151 }
152
153 // Create a dummy reader around some dummy text and then tokenize it
154 logger.info("ImageDocument - feed dummy text as token stream to indexer");
155 try
156 {
157 this.reader = new StringReader(this.properties.get("abstract"));
158 this.tokenizer = tok.tokenise(this.reader);
159 }
160 catch (Exception e)
161 {
162 logger.error("Exception while creating dummy text stream: ", e);
163 }
164 logger.info("ImageDocument - Complete!");
165 }
166 /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
167
168 /** Returns true when the end of the document has been reached, and there
169 * are no other terms to be retrieved from it.
170 * @return boolean true if there are no more terms in the document, otherwise
171 * it returns false.
172 */
173 public boolean endOfDocument()
174 {
175 return !this.tokenizer.hasNext();
176 }
177 /** endOfDocument() **/
178
179 /** Use ImageMagick to generate a preview image.
180 * @pre assumes you have ImageMagick installed and available on Path
181 * @pre uses member variables preview_format and preview_width
182 * @return the filename of the preview image (within the assoc directory)
183 */
184 private String generatePreview(Path source_path, Path assoc_path)
185 throws Exception
186 {
187 String preview_filename = "preview." + this.preview_format;
188 Path preview_path = assoc_path.resolve(preview_filename);
189 String convert_command[] = {
190 "convert",
191 source_path.toString(),
192 "-resize",
193 this.preview_width + "x",
194 preview_path.toString()
195 };
196 logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
197 Process convert_process = Runtime.getRuntime().exec(convert_command);
198 // Gobble up the streams to prevent them hanging the process when buffers
199 // are full
200 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
201 convert_process_error_gobbler.start();
202 StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
203 convert_process_input_gobbler.start();
204 // Let the conversion finish
205 int convert_status = convert_process.waitFor();
206 if (convert_status != 0 || !preview_path.toFile().exists())
207 {
208 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
209 }
210 return preview_filename;
211 }
212 /** generatePreview(Path, Path) **/
213
214 /** Returns the underlying map of all the properties defined by this Document.
215 * @since 1.1.0
216 */
217 public Map<String,String> getAllProperties()
218 {
219 return this.properties;
220 }
221 /** getAllProperties() **/
222
223 /** Returns a list of the fields the current term appears in.
224 * @return HashSet a set of the terms that the current term appears in.
225 */
226 public Set<String> getFields()
227 {
228 // Returns null because there is no support for fields with file documents.
229 return Collections.emptySet();
230 }
231 /** getFields() **/
232
233 /** Gets the next term of the document.
234 * <B>NB:</B>Null string returned from getNextTerm() should
235 * be ignored. They do not signify the lack of any more terms.
236 * endOfDocument() should be used to check that.
237 * @return String the next term of the document. Null returns should be
238 * ignored.
239 */
240 public String getNextTerm()
241 {
242 return this.tokenizer.next();
243 }
244 /** getNextTerm() **/
245
246 /** Allows access to a named property of the Document. Examples might be URL,
247 * filename etc.
248 * @param name Name of the property. It is suggested, but not required that
249 * this name should not be case insensitive.
250 * @since 1.1.0
251 */
252 public String getProperty(String name)
253 {
254 return this.properties.get(name.toLowerCase());
255 }
256 /** getProperty(String name) **/
257
258 /** Returns a Reader object so client code can tokenise the document
259 * or deal with the document itself. Examples might be extracting URLs,
260 * language detection. */
261 public Reader getReader()
262 {
263 return this.reader;
264 }
265 /** getReader() **/
266}
267
Note: See TracBrowser for help on using the repository browser.