source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26207

Last change on this file since 26207 was 26207, checked in by jmt12, 12 years ago

Sanity check trying to track down bogus docno value

File size: 9.1 KB
RevLine 
[26186]1/**
2 * Adding support for Images in Terrier
3 * @author: John Thompson, jmt12, #9826509
4 *
5 * The contents of this file are subject to the Mozilla Public License
6 * Version 1.1 (the "License"); you may not use this file except in
7 * compliance with the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS"
11 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 * the License for the specific language governing rights and limitations
13 * under the License.
14 *
15 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.InputStream;
20import java.io.StringReader;
21import java.io.Reader;
22import java.nio.file.Files;
23import java.nio.file.Path;
24import java.nio.file.Paths;
25import java.util.Collections;
26import java.util.Arrays;
27import java.util.Map;
28import java.util.Set;
29
30import org.apache.log4j.Logger;
[26190]31import org.terrier.indexing.StreamGobbler;
[26186]32import org.terrier.indexing.tokenisation.TokenStream;
33import org.terrier.indexing.tokenisation.Tokeniser;
34import org.terrier.utility.ApplicationSetup;
35
36public class ImageDocument
37 implements Document
38{
39 /** A reference to the logger for messaging */
40 protected static final Logger logger = Logger.getLogger(FileDocument.class);
41 /** The map of properties (fields) for this document. */
42 protected Map<String,String> properties;
43 /** A reader built from a dummy text string. */
44 protected Reader reader;
45 /** A token stream produced by the configured tokeniser when feed the dummy
46 * reader.
47 */
48 protected TokenStream tokenizer;
49
50 /** The preview filetype. **/
51 protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
52 /** The preview size (width). **/
53 protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
54
55 /** Default constructor. **/
56 protected ImageDocument() {}
57
58 /** Constructs an instance of the ImageDocument from the given input stream.
59 * @param docStream the input stream that reads the file.
60 * @param docProperties the initial properties (docno, filename)
61 * @param tok the tokeniser defined for this collection
62 */
63 public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
64 {
65 logger.info("ImageDocument::ImageDocument()");
66 // Initialization from arguments
67 this.properties = default_properties;
68
69 // Set properties
70 logger.info("ImageDocument - extracting properties");
71 // A. Hardcoded properties
72 this.properties.put("parser", "ImageDocument");
73 this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
74 // B. Properties derived from filename
75 String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
76 this.properties.put("title", title);
77 String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
78 String target_filename = "doc." + ext;
79 this.properties.put("source","doc." + ext);
[26207]80 String assoc_filename = "D" + this.properties.get("docno");
81 if (assoc_filename.equals("Dnull"))
82 {
83 System.err.println("Error! Bogus assoc dir: " + this.properties.get("docno"));
84 System.exit(0);
85 }
86
[26186]87 this.properties.put("assocfile", assoc_filename);
88
89 // Copy (symlink) the file into place in the shared directory
90 Path source_path = Paths.get(properties.get("filename"));
91 Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
92 assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
93 Path target_path = assoc_path.resolve(target_filename);
94 if (target_path.toFile().exists())
95 {
96 logger.info("ImageDocument - removing existing (old) associated image");
97 try
98 {
99 Files.delete(target_path);
100 }
101 catch (Exception e)
102 {
103 logger.error("Exception while deleting old image: ", e);
104 }
105 }
106 logger.info("ImageDocument - symlinking image into assoc directory");
107 try
108 {
109 Files.createSymbolicLink(target_path, source_path);
110 }
111 // not supported? We'll try copying below
112 catch (UnsupportedOperationException ex)
113 {
114 }
115 // All other exceptions can be fatal
116 catch (Exception e)
117 {
118 logger.error("Exception while symlinking image: ", e);
119 }
120 // - copy if the file doesn't exist yet
121 if (!target_path.toFile().exists())
122 {
123 logger.info("ImageDocument - symlink filaed, copying instead");
124 try
125 {
126 Files.copy(source_path, target_path);
127 }
128 // Fatality!
129 catch (Exception e)
130 {
131 logger.error("Exception while copying image: ", e);
132 }
133 }
134
135 // Generate preview image
136 logger.info("ImageDocument - generate preview image");
137 try
138 {
139 String preview_filename = this.generatePreview(source_path, assoc_path);
140 this.properties.put("preview",preview_filename);
141 }
142 catch (Exception e)
143 {
144 logger.error("Exception while generating preview image: ", e);
145 }
146
147 // Create a dummy reader around some dummy text and then tokenize it
148 logger.info("ImageDocument - feed dummy text as token stream to indexer");
149 try
150 {
151 this.reader = new StringReader(this.properties.get("abstract"));
152 this.tokenizer = tok.tokenise(this.reader);
153 }
154 catch (Exception e)
155 {
156 logger.error("Exception while creating dummy text stream: ", e);
157 }
158 logger.info("ImageDocument - Complete!");
159 }
160 /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
161
162 /** Returns true when the end of the document has been reached, and there
163 * are no other terms to be retrieved from it.
164 * @return boolean true if there are no more terms in the document, otherwise
165 * it returns false.
166 */
167 public boolean endOfDocument()
168 {
169 return !this.tokenizer.hasNext();
170 }
171 /** endOfDocument() **/
172
173 /** Use ImageMagick to generate a preview image.
174 * @pre assumes you have ImageMagick installed and available on Path
175 * @pre uses member variables preview_format and preview_width
176 * @return the filename of the preview image (within the assoc directory)
177 */
178 private String generatePreview(Path source_path, Path assoc_path)
179 throws Exception
180 {
181 String preview_filename = "preview." + this.preview_format;
182 Path preview_path = assoc_path.resolve(preview_filename);
183 String convert_command[] = {
184 "convert",
185 source_path.toString(),
186 "-resize",
187 this.preview_width + "x",
188 preview_path.toString()
189 };
190 logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
191 Process convert_process = Runtime.getRuntime().exec(convert_command);
192 // Gobble up the streams to prevent them hanging the process when buffers
193 // are full
194 StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
195 convert_process_error_gobbler.start();
196 StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
197 convert_process_input_gobbler.start();
198 // Let the conversion finish
199 int convert_status = convert_process.waitFor();
200 if (convert_status != 0 || !preview_path.toFile().exists())
201 {
202 throw new Exception("Convert command failed (exit status: " + convert_status + ")");
203 }
204 return preview_filename;
205 }
206 /** generatePreview(Path, Path) **/
207
208 /** Returns the underlying map of all the properties defined by this Document.
209 * @since 1.1.0
210 */
211 public Map<String,String> getAllProperties()
212 {
213 return this.properties;
214 }
215 /** getAllProperties() **/
216
217 /** Returns a list of the fields the current term appears in.
218 * @return HashSet a set of the terms that the current term appears in.
219 */
220 public Set<String> getFields()
221 {
222 // Returns null because there is no support for fields with file documents.
223 return Collections.emptySet();
224 }
225 /** getFields() **/
226
227 /** Gets the next term of the document.
228 * <B>NB:</B>Null string returned from getNextTerm() should
229 * be ignored. They do not signify the lack of any more terms.
230 * endOfDocument() should be used to check that.
231 * @return String the next term of the document. Null returns should be
232 * ignored.
233 */
234 public String getNextTerm()
235 {
236 return this.tokenizer.next();
237 }
238 /** getNextTerm() **/
239
240 /** Allows access to a named property of the Document. Examples might be URL,
241 * filename etc.
242 * @param name Name of the property. It is suggested, but not required that
243 * this name should not be case insensitive.
244 * @since 1.1.0
245 */
246 public String getProperty(String name)
247 {
248 return this.properties.get(name.toLowerCase());
249 }
250 /** getProperty(String name) **/
251
252 /** Returns a Reader object so client code can tokenise the document
253 * or deal with the document itself. Examples might be extracting URLs,
254 * language detection. */
255 public Reader getReader()
256 {
257 return this.reader;
258 }
259 /** getReader() **/
260}
261
Note: See TracBrowser for help on using the repository browser.