Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26186

Last change on this file since 26186 was 26186, checked in by jmt12, 12 years ago
Adding in (optional) support for video and image processing in DSpace and Terrier. These kinda belong here as they depend on the video-and-audio support (like MediaInfo, HandbrakeCLI, and Hive2) to work
File size: 10.1 KB

Line
1	/**
2	* Adding support for Images in Terrier
3	* @author: John Thompson, jmt12, #9826509
4	*
5	* The contents of this file are subject to the Mozilla Public License
6	* Version 1.1 (the "License"); you may not use this file except in
7	* compliance with the License. You may obtain a copy of the License at
8	* http://www.mozilla.org/MPL/
9	*
10	* Software distributed under the License is distributed on an "AS IS"
11	* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12	* the License for the specific language governing rights and limitations
13	* under the License.
14	*
15	* Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16	*/
17	package org.terrier.indexing;
18
19	import java.io.BufferedOutputStream;
20	import java.io.BufferedReader;
21	import java.io.FileOutputStream;
22	import java.io.InputStream;
23	import java.io.InputStreamReader;
24	import java.io.IOException;
25	import java.io.PrintWriter;
26	import java.io.StringReader;
27	import java.io.Reader;
28	import java.lang.Thread;
29	import java.nio.file.Files;
30	import java.nio.file.Path;
31	import java.nio.file.Paths;
32	import java.util.Collections;
33	import java.util.Arrays;
34	import java.util.Map;
35	import java.util.Set;
36
37	import org.apache.log4j.Logger;
38	import org.terrier.indexing.tokenisation.TokenStream;
39	import org.terrier.indexing.tokenisation.Tokeniser;
40	import org.terrier.utility.ApplicationSetup;
41
42	public class ImageDocument
43	implements Document
44	{
45	/** A reference to the logger for messaging */
46	protected static final Logger logger = Logger.getLogger(FileDocument.class);
47	/** The map of properties (fields) for this document. */
48	protected Map<String,String> properties;
49	/** A reader built from a dummy text string. */
50	protected Reader reader;
51	/** A token stream produced by the configured tokeniser when feed the dummy
52	* reader.
53	*/
54	protected TokenStream tokenizer;
55
56	/ The preview filetype. /
57	protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
58	/ The preview size (width). /
59	protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
60
61	/ Default constructor. /
62	protected ImageDocument() {}
63
64	/** Constructs an instance of the ImageDocument from the given input stream.
65	* @param docStream the input stream that reads the file.
66	* @param docProperties the initial properties (docno, filename)
67	* @param tok the tokeniser defined for this collection
68	*/
69	public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
70	{
71	logger.info("ImageDocument::ImageDocument()");
72	// Initialization from arguments
73	this.properties = default_properties;
74
75	// Set properties
76	logger.info("ImageDocument - extracting properties");
77	// A. Hardcoded properties
78	this.properties.put("parser", "ImageDocument");
79	this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
80	// B. Properties derived from filename
81	String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
82	this.properties.put("title", title);
83	String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
84	String target_filename = "doc." + ext;
85	this.properties.put("source","doc." + ext);
86	String assoc_filename = "D" + properties.get("docno");
87	this.properties.put("assocfile", assoc_filename);
88
89	// Copy (symlink) the file into place in the shared directory
90	Path source_path = Paths.get(properties.get("filename"));
91	Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
92	assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
93	Path target_path = assoc_path.resolve(target_filename);
94	if (target_path.toFile().exists())
95	{
96	logger.info("ImageDocument - removing existing (old) associated image");
97	try
98	{
99	Files.delete(target_path);
100	}
101	catch (Exception e)
102	{
103	logger.error("Exception while deleting old image: ", e);
104	}
105	}
106	logger.info("ImageDocument - symlinking image into assoc directory");
107	try
108	{
109	Files.createSymbolicLink(target_path, source_path);
110	}
111	// not supported? We'll try copying below
112	catch (UnsupportedOperationException ex)
113	{
114	}
115	// All other exceptions can be fatal
116	catch (Exception e)
117	{
118	logger.error("Exception while symlinking image: ", e);
119	}
120	// - copy if the file doesn't exist yet
121	if (!target_path.toFile().exists())
122	{
123	logger.info("ImageDocument - symlink filaed, copying instead");
124	try
125	{
126	Files.copy(source_path, target_path);
127	}
128	// Fatality!
129	catch (Exception e)
130	{
131	logger.error("Exception while copying image: ", e);
132	}
133	}
134
135	// Generate preview image
136	logger.info("ImageDocument - generate preview image");
137	try
138	{
139	String preview_filename = this.generatePreview(source_path, assoc_path);
140	this.properties.put("preview",preview_filename);
141	}
142	catch (Exception e)
143	{
144	logger.error("Exception while generating preview image: ", e);
145	}
146
147	// Create a dummy reader around some dummy text and then tokenize it
148	logger.info("ImageDocument - feed dummy text as token stream to indexer");
149	try
150	{
151	this.reader = new StringReader(this.properties.get("abstract"));
152	this.tokenizer = tok.tokenise(this.reader);
153	}
154	catch (Exception e)
155	{
156	logger.error("Exception while creating dummy text stream: ", e);
157	}
158	logger.info("ImageDocument - Complete!");
159	}
160	/ ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) /
161
162	/** Returns true when the end of the document has been reached, and there
163	* are no other terms to be retrieved from it.
164	* @return boolean true if there are no more terms in the document, otherwise
165	* it returns false.
166	*/
167	public boolean endOfDocument()
168	{
169	return !this.tokenizer.hasNext();
170	}
171	/ endOfDocument() /
172
173	/** Use ImageMagick to generate a preview image.
174	* @pre assumes you have ImageMagick installed and available on Path
175	* @pre uses member variables preview_format and preview_width
176	* @return the filename of the preview image (within the assoc directory)
177	*/
178	private String generatePreview(Path source_path, Path assoc_path)
179	throws Exception
180	{
181	String preview_filename = "preview." + this.preview_format;
182	Path preview_path = assoc_path.resolve(preview_filename);
183	String convert_command[] = {
184	"convert",
185	source_path.toString(),
186	"-resize",
187	this.preview_width + "x",
188	preview_path.toString()
189	};
190	logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
191	Process convert_process = Runtime.getRuntime().exec(convert_command);
192	// Gobble up the streams to prevent them hanging the process when buffers
193	// are full
194	StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
195	convert_process_error_gobbler.start();
196	StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
197	convert_process_input_gobbler.start();
198	// Let the conversion finish
199	int convert_status = convert_process.waitFor();
200	if (convert_status != 0 \|\| !preview_path.toFile().exists())
201	{
202	throw new Exception("Convert command failed (exit status: " + convert_status + ")");
203	}
204	return preview_filename;
205	}
206	/ generatePreview(Path, Path) /
207
208	/** Returns the underlying map of all the properties defined by this Document.
209	* @since 1.1.0
210	*/
211	public Map<String,String> getAllProperties()
212	{
213	return this.properties;
214	}
215	/ getAllProperties() /
216
217	/** Returns a list of the fields the current term appears in.
218	* @return HashSet a set of the terms that the current term appears in.
219	*/
220	public Set<String> getFields()
221	{
222	// Returns null because there is no support for fields with file documents.
223	return Collections.emptySet();
224	}
225	/ getFields() /
226
227	/** Gets the next term of the document.
228	* <B>NB:</B>Null string returned from getNextTerm() should
229	* be ignored. They do not signify the lack of any more terms.
230	* endOfDocument() should be used to check that.
231	* @return String the next term of the document. Null returns should be
232	* ignored.
233	*/
234	public String getNextTerm()
235	{
236	return this.tokenizer.next();
237	}
238	/ getNextTerm() /
239
240	/** Allows access to a named property of the Document. Examples might be URL,
241	* filename etc.
242	* @param name Name of the property. It is suggested, but not required that
243	* this name should not be case insensitive.
244	* @since 1.1.0
245	*/
246	public String getProperty(String name)
247	{
248	return this.properties.get(name.toLowerCase());
249	}
250	/ getProperty(String name) /
251
252	/** Returns a Reader object so client code can tokenise the document
253	* or deal with the document itself. Examples might be extracting URLs,
254	* language detection. */
255	public Reader getReader()
256	{
257	return this.reader;
258	}
259	/ getReader() /
260	}
261
/**
 * A thread that reads an input stream until it ends, so that whatever is
 * producing the stream never blocks on a full buffer. What is read is either
 * thrown away or written, line by line, to a file.
 */
class StreamGobbler
  extends Thread
{
  /** The stream to drain; closed once run() has finished with it. */
  InputStream is;
  /** Destination file, only set by the two argument constructor. */
  String file_path;
  /** True when the drained lines are to be written to file_path. */
  boolean output_to_file;

  /** Creates a gobbler that discards everything it reads.
   *  @param is the stream to drain
   */
  StreamGobbler(InputStream is)
  {
    this.is = is;
    this.output_to_file = false;
  }

  /** Creates a gobbler that writes every line it reads to a file.
   *  @param is the stream to drain
   *  @param file_path path of the file to (over)write
   */
  StreamGobbler(InputStream is, String file_path)
  {
    this.is = is;
    this.file_path = file_path;
    this.output_to_file = true;
  }

  /** Drains the stream to its end and then closes it, along with the output
   *  file if there is one. An IOException is reported via printStackTrace().
   */
  @Override
  public void run()
  {
    // try-with-resources releases the stream (and the writer below) even when
    // a read fails part way through
    try (InputStream in = is)
    {
      if (output_to_file)
      {
        try (BufferedReader br = new BufferedReader(new InputStreamReader(in));
             PrintWriter pw = new PrintWriter(new BufferedOutputStream(new FileOutputStream(file_path))))
        {
          String line;
          while ( (line = br.readLine()) != null)
          {
            pw.println(line);
          }
        }
      }
      else
      {
        // Equivalent to > /dev/null. Raw bytes are read so that output
        // without any newlines cannot pile up as one enormous line.
        byte[] scratch = new byte[8192];
        while (in.read(scratch) != -1)
        {
          // nothing to do with the data
        }
      }
    }
    catch (IOException ioe)
    {
      ioe.printStackTrace();
    }
  }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: