Context Navigation

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 26214

Last change on this file since 26214 was 26214, checked in by jmt12, 12 years ago
New hash based generation for associated files directory - so docno is no longer essential
File size: 11.2 KB

Line
1	/**
2	* Adding support for Images in Terrier
3	* @author: John Thompson, jmt12, #9826509
4	*
5	* The contents of this file are subject to the Mozilla Public License
6	* Version 1.1 (the "License"); you may not use this file except in
7	* compliance with the License. You may obtain a copy of the License at
8	* http://www.mozilla.org/MPL/
9	*
10	* Software distributed under the License is distributed on an "AS IS"
11	* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12	* the License for the specific language governing rights and limitations
13	* under the License.
14	*
15	* Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16	*/
17	package org.terrier.indexing;
18
19	import java.io.InputStream;
20	import java.io.StringReader;
21	import java.io.Reader;
22	import java.nio.charset.Charset;
23	import java.nio.file.Files;
24	import java.nio.file.Path;
25	import java.nio.file.Paths;
26	import java.security.MessageDigest;
27	import java.security.NoSuchAlgorithmException;
28	import java.util.Collections;
29	import java.util.Arrays;
30	import java.util.Map;
31	import java.util.Set;
32
33	import org.apache.log4j.Logger;
34	import org.terrier.indexing.StreamGobbler;
35	import org.terrier.indexing.tokenisation.TokenStream;
36	import org.terrier.indexing.tokenisation.Tokeniser;
37	import org.terrier.utility.ApplicationSetup;
38
39	public class ImageDocument
40	implements Document
41	{
42	/** A reference to the logger for messaging */
43	protected static final Logger logger = Logger.getLogger(FileDocument.class);
44	/** The map of properties (fields) for this document. */
45	protected Map<String,String> properties;
46	/** A reader built from a dummy text string. */
47	protected Reader reader;
48	/** A token stream produced by the configured tokeniser when feed the dummy
49	* reader.
50	*/
51	protected TokenStream tokenizer;
52
53	/ The preview filetype. /
54	protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
55	/ The preview size (width). /
56	protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
57
58	/ Default constructor. /
59	protected ImageDocument() {}
60
61	/** Constructs an instance of the ImageDocument from the given input stream.
62	* @param docStream the input stream that reads the file.
63	* @param docProperties the initial properties (docno, filename)
64	* @param tok the tokeniser defined for this collection
65	*/
66	public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
67	{
68	logger.info("ImageDocument::ImageDocument()");
69	// Initialization from arguments
70	this.properties = default_properties;
71
72	// Set properties
73	logger.info("ImageDocument - current properties");
74	for (Map.Entry<String, String> entry : this.properties.entrySet())
75	{
76	logger.info(entry.getKey() + "=" + entry.getValue());
77	}
78
79	logger.info("ImageDocument - extracting properties");
80	// A. Hardcoded properties
81	this.properties.put("parser", "ImageDocument");
82	this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
83	// B. Properties derived from filename
84	// - A simple title for the document
85	String filepath = this.properties.get("filename");
86	String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
87	this.properties.put("title", title);
88	String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
89	// - The name of the copy of the original document
90	String target_filename = "doc." + ext;
91	this.properties.put("source","doc." + ext);
92	// - A unique associated directory. This gets a little tricky as we need
93	// to create the directory at the same time if an effort to promote
94	// synchronous behaviour
95	String unique_id = this.generateHash(filepath);
96	// - we start with the first 4 characters
97	int offset = 0;
98	String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
99	// - we add ".dir" as a suffix to the directory that actually contains
100	// files (so the non-suffixed version contains nested directories)
101	Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
102	// - then we continue adding blocks of 4 characters until we get a
103	// directory that doesn't already exist
104	while (assoc_path.toFile().exists() && offset < unique_id.length())
105	{
106	offset += 4;
107	assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
108	assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
109	}
110	// - still not unique? but run out of unique_id... time to complain
111	if (assoc_path.toFile().exists())
112	{
113	logger.error("ImageDoument - can't determine unique assocfilepath");
114	System.exit(0);
115	}
116	// - create the directories quick... hopefully before someone else does
117	assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
118	this.properties.put("assocfile", assoc_filename);
119
120	// Copy (symlink) the file into place in the shared directory
121	Path source_path = Paths.get(properties.get("filename"));
122	Path target_path = assoc_path.resolve(target_filename);
123	if (target_path.toFile().exists())
124	{
125	logger.info("ImageDocument - removing existing (old) associated image");
126	try
127	{
128	Files.delete(target_path);
129	}
130	catch (Exception e)
131	{
132	logger.error("Exception while deleting old image: ", e);
133	}
134	}
135	logger.info("ImageDocument - symlinking image into assoc directory");
136	try
137	{
138	Files.createSymbolicLink(target_path, source_path);
139	}
140	// not supported? We'll try copying below
141	catch (UnsupportedOperationException ex)
142	{
143	}
144	// All other exceptions can be fatal
145	catch (Exception e)
146	{
147	logger.error("Exception while symlinking image: ", e);
148	}
149	// - copy if the file doesn't exist yet
150	if (!target_path.toFile().exists())
151	{
152	logger.info("ImageDocument - symlink filaed, copying instead");
153	try
154	{
155	Files.copy(source_path, target_path);
156	}
157	// Fatality!
158	catch (Exception e)
159	{
160	logger.error("Exception while copying image: ", e);
161	}
162	}
163
164	// Generate preview image
165	logger.info("ImageDocument - generate preview image");
166	try
167	{
168	String preview_filename = this.generatePreview(source_path, assoc_path);
169	this.properties.put("preview",preview_filename);
170	}
171	catch (Exception e)
172	{
173	logger.error("Exception while generating preview image: ", e);
174	}
175
176	// Create a dummy reader around some dummy text and then tokenize it
177	logger.info("ImageDocument - feed dummy text as token stream to indexer");
178	try
179	{
180	this.reader = new StringReader(this.properties.get("abstract"));
181	this.tokenizer = tok.tokenise(this.reader);
182	}
183	catch (Exception e)
184	{
185	logger.error("Exception while creating dummy text stream: ", e);
186	}
187	logger.info("ImageDocument - Complete!");
188	}
189	/ ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) /
190
191	/** Returns true when the end of the document has been reached, and there
192	* are no other terms to be retrieved from it.
193	* @return boolean true if there are no more terms in the document, otherwise
194	* it returns false.
195	*/
196	public boolean endOfDocument()
197	{
198	return !this.tokenizer.hasNext();
199	}
200	/ endOfDocument() /
201
202	/** Use ImageMagick to generate a preview image.
203	* @pre assumes you have ImageMagick installed and available on Path
204	* @pre uses member variables preview_format and preview_width
205	* @return the filename of the preview image (within the assoc directory)
206	*/
207	private String generatePreview(Path source_path, Path assoc_path)
208	throws Exception
209	{
210	String preview_filename = "preview." + this.preview_format;
211	Path preview_path = assoc_path.resolve(preview_filename);
212	String convert_command[] = {
213	"convert",
214	source_path.toString(),
215	"-resize",
216	this.preview_width + "x",
217	preview_path.toString()
218	};
219	logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
220	Process convert_process = Runtime.getRuntime().exec(convert_command);
221	// Gobble up the streams to prevent them hanging the process when buffers
222	// are full
223	StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
224	convert_process_error_gobbler.start();
225	StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
226	convert_process_input_gobbler.start();
227	// Let the conversion finish
228	int convert_status = convert_process.waitFor();
229	if (convert_status != 0 \|\| !preview_path.toFile().exists())
230	{
231	throw new Exception("Convert command failed (exit status: " + convert_status + ")");
232	}
233	return preview_filename;
234	}
235	/ generatePreview(Path, Path) /
236
237	/** Returns the underlying map of all the properties defined by this Document.
238	* @since 1.1.0
239	*/
240	public Map<String,String> getAllProperties()
241	{
242	return this.properties;
243	}
244	/ getAllProperties() /
245
246	/** Returns a list of the fields the current term appears in.
247	* @return HashSet a set of the terms that the current term appears in.
248	*/
249	public Set<String> getFields()
250	{
251	// Returns null because there is no support for fields with file documents.
252	return Collections.emptySet();
253	}
254	/ getFields() /
255
256	/** Gets the next term of the document.
257	* <B>NB:</B>Null string returned from getNextTerm() should
258	* be ignored. They do not signify the lack of any more terms.
259	* endOfDocument() should be used to check that.
260	* @return String the next term of the document. Null returns should be
261	* ignored.
262	*/
263	public String getNextTerm()
264	{
265	return this.tokenizer.next();
266	}
267	/ getNextTerm() /
268
269	/** Allows access to a named property of the Document. Examples might be URL,
270	* filename etc.
271	* @param name Name of the property. It is suggested, but not required that
272	* this name should not be case insensitive.
273	* @since 1.1.0
274	*/
275	public String getProperty(String name)
276	{
277	return this.properties.get(name.toLowerCase());
278	}
279	/ getProperty(String name) /
280
281	/** Returns a Reader object so client code can tokenise the document
282	* or deal with the document itself. Examples might be extracting URLs,
283	* language detection. */
284	public Reader getReader()
285	{
286	return this.reader;
287	}
288	/ getReader() /
289
290	/**
291	*/
292	private String generateHash(String string)
293	{
294	StringBuffer sb = new StringBuffer();
295	try
296	{
297	final MessageDigest message_digest = MessageDigest.getInstance("MD5");
298	message_digest.reset();
299	message_digest.update(string.getBytes(Charset.forName("UTF8")));
300	final byte[] result_bytes = message_digest.digest();
301	for (int i = 0; i < result_bytes.length; ++i)
302	{
303	sb.append(Integer.toHexString((result_bytes[i] & 0xFF) \| 0x100).substring(1,3));
304	}
305	}
306	catch (NoSuchAlgorithmException e)
307	{
308	System.err.println("Exception: " + e);
309	System.exit(0);
310	}
311	return sb.toString();
312	}
313	/ generateHash(String) /
314	}
315

Note: See TracBrowser for help on using the repository browser.

Download in other formats: