Context Navigation

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26191

Last change on this file since 26191 was 26190, checked in by jmt12, 12 years ago
Moving the StreamGobbler - used in both plugins to prevent a full STDERR buffer killing the import - into its own class... my computer doesn't have an issue with exactly the same class occurring twice, but Medusa's one seems stricter in this regard
File size: 8.9 KB

Line
1	/**
2	* Adding support for Images in Terrier
3	* @author: John Thompson, jmt12, #9826509
4	*
5	* The contents of this file are subject to the Mozilla Public License
6	* Version 1.1 (the "License"); you may not use this file except in
7	* compliance with the License. You may obtain a copy of the License at
8	* http://www.mozilla.org/MPL/
9	*
10	* Software distributed under the License is distributed on an "AS IS"
11	* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12	* the License for the specific language governing rights and limitations
13	* under the License.
14	*
15	* Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16	*/
17	package org.terrier.indexing;
18
19	import java.io.InputStream;
20	import java.io.StringReader;
21	import java.io.Reader;
22	import java.nio.file.Files;
23	import java.nio.file.Path;
24	import java.nio.file.Paths;
25	import java.util.Collections;
26	import java.util.Arrays;
27	import java.util.Map;
28	import java.util.Set;
29
30	import org.apache.log4j.Logger;
31	import org.terrier.indexing.StreamGobbler;
32	import org.terrier.indexing.tokenisation.TokenStream;
33	import org.terrier.indexing.tokenisation.Tokeniser;
34	import org.terrier.utility.ApplicationSetup;
35
36	public class ImageDocument
37	implements Document
38	{
39	/** A reference to the logger for messaging */
40	protected static final Logger logger = Logger.getLogger(FileDocument.class);
41	/** The map of properties (fields) for this document. */
42	protected Map<String,String> properties;
43	/** A reader built from a dummy text string. */
44	protected Reader reader;
45	/** A token stream produced by the configured tokeniser when feed the dummy
46	* reader.
47	*/
48	protected TokenStream tokenizer;
49
50	/ The preview filetype. /
51	protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
52	/ The preview size (width). /
53	protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
54
55	/ Default constructor. /
56	protected ImageDocument() {}
57
58	/** Constructs an instance of the ImageDocument from the given input stream.
59	* @param docStream the input stream that reads the file.
60	* @param docProperties the initial properties (docno, filename)
61	* @param tok the tokeniser defined for this collection
62	*/
63	public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
64	{
65	logger.info("ImageDocument::ImageDocument()");
66	// Initialization from arguments
67	this.properties = default_properties;
68
69	// Set properties
70	logger.info("ImageDocument - extracting properties");
71	// A. Hardcoded properties
72	this.properties.put("parser", "ImageDocument");
73	this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
74	// B. Properties derived from filename
75	String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
76	this.properties.put("title", title);
77	String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
78	String target_filename = "doc." + ext;
79	this.properties.put("source","doc." + ext);
80	String assoc_filename = "D" + properties.get("docno");
81	this.properties.put("assocfile", assoc_filename);
82
83	// Copy (symlink) the file into place in the shared directory
84	Path source_path = Paths.get(properties.get("filename"));
85	Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
86	assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
87	Path target_path = assoc_path.resolve(target_filename);
88	if (target_path.toFile().exists())
89	{
90	logger.info("ImageDocument - removing existing (old) associated image");
91	try
92	{
93	Files.delete(target_path);
94	}
95	catch (Exception e)
96	{
97	logger.error("Exception while deleting old image: ", e);
98	}
99	}
100	logger.info("ImageDocument - symlinking image into assoc directory");
101	try
102	{
103	Files.createSymbolicLink(target_path, source_path);
104	}
105	// not supported? We'll try copying below
106	catch (UnsupportedOperationException ex)
107	{
108	}
109	// All other exceptions can be fatal
110	catch (Exception e)
111	{
112	logger.error("Exception while symlinking image: ", e);
113	}
114	// - copy if the file doesn't exist yet
115	if (!target_path.toFile().exists())
116	{
117	logger.info("ImageDocument - symlink filaed, copying instead");
118	try
119	{
120	Files.copy(source_path, target_path);
121	}
122	// Fatality!
123	catch (Exception e)
124	{
125	logger.error("Exception while copying image: ", e);
126	}
127	}
128
129	// Generate preview image
130	logger.info("ImageDocument - generate preview image");
131	try
132	{
133	String preview_filename = this.generatePreview(source_path, assoc_path);
134	this.properties.put("preview",preview_filename);
135	}
136	catch (Exception e)
137	{
138	logger.error("Exception while generating preview image: ", e);
139	}
140
141	// Create a dummy reader around some dummy text and then tokenize it
142	logger.info("ImageDocument - feed dummy text as token stream to indexer");
143	try
144	{
145	this.reader = new StringReader(this.properties.get("abstract"));
146	this.tokenizer = tok.tokenise(this.reader);
147	}
148	catch (Exception e)
149	{
150	logger.error("Exception while creating dummy text stream: ", e);
151	}
152	logger.info("ImageDocument - Complete!");
153	}
154	/ ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) /
155
156	/** Returns true when the end of the document has been reached, and there
157	* are no other terms to be retrieved from it.
158	* @return boolean true if there are no more terms in the document, otherwise
159	* it returns false.
160	*/
161	public boolean endOfDocument()
162	{
163	return !this.tokenizer.hasNext();
164	}
165	/ endOfDocument() /
166
167	/** Use ImageMagick to generate a preview image.
168	* @pre assumes you have ImageMagick installed and available on Path
169	* @pre uses member variables preview_format and preview_width
170	* @return the filename of the preview image (within the assoc directory)
171	*/
172	private String generatePreview(Path source_path, Path assoc_path)
173	throws Exception
174	{
175	String preview_filename = "preview." + this.preview_format;
176	Path preview_path = assoc_path.resolve(preview_filename);
177	String convert_command[] = {
178	"convert",
179	source_path.toString(),
180	"-resize",
181	this.preview_width + "x",
182	preview_path.toString()
183	};
184	logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
185	Process convert_process = Runtime.getRuntime().exec(convert_command);
186	// Gobble up the streams to prevent them hanging the process when buffers
187	// are full
188	StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
189	convert_process_error_gobbler.start();
190	StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
191	convert_process_input_gobbler.start();
192	// Let the conversion finish
193	int convert_status = convert_process.waitFor();
194	if (convert_status != 0 \|\| !preview_path.toFile().exists())
195	{
196	throw new Exception("Convert command failed (exit status: " + convert_status + ")");
197	}
198	return preview_filename;
199	}
200	/ generatePreview(Path, Path) /
201
202	/** Returns the underlying map of all the properties defined by this Document.
203	* @since 1.1.0
204	*/
205	public Map<String,String> getAllProperties()
206	{
207	return this.properties;
208	}
209	/ getAllProperties() /
210
211	/** Returns a list of the fields the current term appears in.
212	* @return HashSet a set of the terms that the current term appears in.
213	*/
214	public Set<String> getFields()
215	{
216	// Returns null because there is no support for fields with file documents.
217	return Collections.emptySet();
218	}
219	/ getFields() /
220
221	/** Gets the next term of the document.
222	* <B>NB:</B>Null string returned from getNextTerm() should
223	* be ignored. They do not signify the lack of any more terms.
224	* endOfDocument() should be used to check that.
225	* @return String the next term of the document. Null returns should be
226	* ignored.
227	*/
228	public String getNextTerm()
229	{
230	return this.tokenizer.next();
231	}
232	/ getNextTerm() /
233
234	/** Allows access to a named property of the Document. Examples might be URL,
235	* filename etc.
236	* @param name Name of the property. It is suggested, but not required that
237	* this name should not be case insensitive.
238	* @since 1.1.0
239	*/
240	public String getProperty(String name)
241	{
242	return this.properties.get(name.toLowerCase());
243	}
244	/ getProperty(String name) /
245
246	/** Returns a Reader object so client code can tokenise the document
247	* or deal with the document itself. Examples might be extracting URLs,
248	* language detection. */
249	public Reader getReader()
250	{
251	return this.reader;
252	}
253	/ getReader() /
254	}
255

Note: See TracBrowser for help on using the repository browser.

Download in other formats: