Context Navigation

ImageDocument.java@ 29648

Last change on this file since 29648 was 29648, checked in by jmt12, 9 years ago
Extending the Image document class with SIFT processing so as to trigger greater CPU load. Makes use of stream gobbler... gobble-gobble
File size: 12.5 KB

Rev	Line
/**
 * Adding support for Images in Terrier
 * @author: John Thompson, jmt12, #9826509
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * Copyright (c) 2011 The University of Waikato. All Rights Reserved.
 */
package org.terrier.indexing;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import org.apache.log4j.Logger;
import org.terrier.indexing.StreamGobbler;
import org.terrier.indexing.tokenisation.TokenStream;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
	38
	39	public class ImageDocument
	40	implements Document
	41	{
	42	/** A reference to the logger for messaging */
	43	protected static final Logger logger = Logger.getLogger(FileDocument.class);
	44	/** The map of properties (fields) for this document. */
	45	protected Map<String,String> properties;
	46	/** A reader built from a dummy text string. */
	47	protected Reader reader;
	48	/** A token stream produced by the configured tokeniser when feed the dummy
	49	* reader.
	50	*/
	51	protected TokenStream tokenizer;
	52
	53	/ The preview filetype. /
	54	protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
	55	/ The preview size (width). /
	56	protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
	57
	58	/ Default constructor. /
	59	protected ImageDocument() {}
	60
	61	/** Constructs an instance of the ImageDocument from the given input stream.
	62	* @param docStream the input stream that reads the file.
	63	* @param docProperties the initial properties (docno, filename)
	64	* @param tok the tokeniser defined for this collection
	65	*/
	66	public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
	67	{
	68	logger.info("ImageDocument::ImageDocument()");
	69	// Initialization from arguments
	70	this.properties = default_properties;
	71
	72	// Set properties
[26208]	73	logger.info("ImageDocument - current properties");
	74	for (Map.Entry<String, String> entry : this.properties.entrySet())
	75	{
	76	logger.info(entry.getKey() + "=" + entry.getValue());
	77	}
	78
[26186]	79	logger.info("ImageDocument - extracting properties");
	80	// A. Hardcoded properties
	81	this.properties.put("parser", "ImageDocument");
	82	this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
	83	// B. Properties derived from filename
[26214]	84	// - A simple title for the document
	85	String filepath = this.properties.get("filename");
	86	String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1);
[26186]	87	this.properties.put("title", title);
[26214]	88	String ext = filepath.substring(filepath.lastIndexOf(".") + 1);
	89	// - The name of the copy of the original document
[26186]	90	String target_filename = "doc." + ext;
	91	this.properties.put("source","doc." + ext);
[26214]	92	// - A unique associated directory. This gets a little tricky as we need
	93	// to create the directory at the same time if an effort to promote
	94	// synchronous behaviour
	95	String unique_id = this.generateHash(filepath);
	96	// - we start with the first 4 characters
	97	int offset = 0;
	98	String assoc_filename = "D" + unique_id.substring(offset, offset + 4);
	99	// - we add ".dir" as a suffix to the directory that actually contains
	100	// files (so the non-suffixed version contains nested directories)
	101	Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
	102	// - then we continue adding blocks of 4 characters until we get a
	103	// directory that doesn't already exist
	104	while (assoc_path.toFile().exists() && offset < unique_id.length())
[26207]	105	{
[26214]	106	offset += 4;
	107	assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4);
	108	assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir");
	109	}
	110	// - still not unique? but run out of unique_id... time to complain
	111	if (assoc_path.toFile().exists())
	112	{
	113	logger.error("ImageDoument - can't determine unique assocfilepath");
[26207]	114	System.exit(0);
	115	}
[26214]	116	// - create the directories quick... hopefully before someone else does
	117	assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
[26186]	118	this.properties.put("assocfile", assoc_filename);
	119
	120	// Copy (symlink) the file into place in the shared directory
	121	Path source_path = Paths.get(properties.get("filename"));
	122	Path target_path = assoc_path.resolve(target_filename);
	123	if (target_path.toFile().exists())
	124	{
	125	logger.info("ImageDocument - removing existing (old) associated image");
	126	try
	127	{
	128	Files.delete(target_path);
	129	}
	130	catch (Exception e)
	131	{
	132	logger.error("Exception while deleting old image: ", e);
	133	}
	134	}
	135	logger.info("ImageDocument - symlinking image into assoc directory");
	136	try
	137	{
	138	Files.createSymbolicLink(target_path, source_path);
	139	}
	140	// not supported? We'll try copying below
	141	catch (UnsupportedOperationException ex)
	142	{
	143	}
	144	// All other exceptions can be fatal
	145	catch (Exception e)
	146	{
	147	logger.error("Exception while symlinking image: ", e);
	148	}
	149	// - copy if the file doesn't exist yet
	150	if (!target_path.toFile().exists())
	151	{
	152	logger.info("ImageDocument - symlink filaed, copying instead");
	153	try
	154	{
	155	Files.copy(source_path, target_path);
	156	}
	157	// Fatality!
	158	catch (Exception e)
	159	{
	160	logger.error("Exception while copying image: ", e);
	161	}
	162	}
	163
	164	// Generate preview image
	165	logger.info("ImageDocument - generate preview image");
	166	try
	167	{
	168	String preview_filename = this.generatePreview(source_path, assoc_path);
	169	this.properties.put("preview",preview_filename);
	170	}
	171	catch (Exception e)
	172	{
	173	logger.error("Exception while generating preview image: ", e);
	174	}
	175
	176	// Create a dummy reader around some dummy text and then tokenize it
	177	logger.info("ImageDocument - feed dummy text as token stream to indexer");
	178	try
	179	{
	180	this.reader = new StringReader(this.properties.get("abstract"));
	181	this.tokenizer = tok.tokenise(this.reader);
	182	}
	183	catch (Exception e)
	184	{
	185	logger.error("Exception while creating dummy text stream: ", e);
	186	}
[29648]	187
	188	// Use OpenSIFT to generate a featureset (in Oxford format) for this image
	189	logger.info("ImageDocument - generate and record SIFT features");
	190	try
	191	{
	192	String sift_command[] = {
	193	"siftfeat",
	194	"-x",
	195	source_path.toString()
	196	};
	197	logger.info("ImageDocument - sift command: " + Arrays.toString(sift_command));
	198	Process sift_process = Runtime.getRuntime().exec(sift_command);
	199	// we'd usually send STDERR to /dev/null, but a streamgobbler is easier
	200	// in Java
	201	StreamGobbler sift_process_error_gobbler = new StreamGobbler(sift_process.getErrorStream());
	202	sift_process_error_gobbler.start();
	203	// the SIFT features, in Oxford format, will arrive from STDOUT
	204	BufferedReader sift_br = new BufferedReader(new InputStreamReader(sift_process.getInputStream()));
	205	String line;
	206	StringBuffer oxford_features;
	207	while ((line = sift_br.readLine()) != null)
	208	{
	209	oxford_features.append(line);
	210	}
	211	// this command blocks until process completes (emit return value) which
	212	// should be shortly after it emits the last line of SIFT feature data
	213	int sift_status = sift_process.waitFor();
	214	this.properties.put("sift", oxford_features.toString());
	215	}
	216	catch (Exception e)
	217	{
	218	logger.error("Exception while generating preview image: ", e);
	219	}
	220
[26186]	221	logger.info("ImageDocument - Complete!");
	222	}
	223	/ ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) /
	224
	225	/** Returns true when the end of the document has been reached, and there
	226	* are no other terms to be retrieved from it.
	227	* @return boolean true if there are no more terms in the document, otherwise
	228	* it returns false.
	229	*/
	230	public boolean endOfDocument()
	231	{
	232	return !this.tokenizer.hasNext();
	233	}
	234	/ endOfDocument() /
	235
	236	/** Use ImageMagick to generate a preview image.
	237	* @pre assumes you have ImageMagick installed and available on Path
	238	* @pre uses member variables preview_format and preview_width
	239	* @return the filename of the preview image (within the assoc directory)
	240	*/
	241	private String generatePreview(Path source_path, Path assoc_path)
	242	throws Exception
	243	{
	244	String preview_filename = "preview." + this.preview_format;
	245	Path preview_path = assoc_path.resolve(preview_filename);
	246	String convert_command[] = {
	247	"convert",
	248	source_path.toString(),
	249	"-resize",
	250	this.preview_width + "x",
	251	preview_path.toString()
	252	};
	253	logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
	254	Process convert_process = Runtime.getRuntime().exec(convert_command);
	255	// Gobble up the streams to prevent them hanging the process when buffers
	256	// are full
	257	StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
	258	convert_process_error_gobbler.start();
	259	StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
	260	convert_process_input_gobbler.start();
	261	// Let the conversion finish
	262	int convert_status = convert_process.waitFor();
	263	if (convert_status != 0 \|\| !preview_path.toFile().exists())
	264	{
	265	throw new Exception("Convert command failed (exit status: " + convert_status + ")");
	266	}
	267	return preview_filename;
	268	}
	269	/ generatePreview(Path, Path) /
	270
	271	/** Returns the underlying map of all the properties defined by this Document.
	272	* @since 1.1.0
	273	*/
	274	public Map<String,String> getAllProperties()
	275	{
	276	return this.properties;
	277	}
	278	/ getAllProperties() /
	279
	280	/** Returns a list of the fields the current term appears in.
	281	* @return HashSet a set of the terms that the current term appears in.
	282	*/
	283	public Set<String> getFields()
	284	{
	285	// Returns null because there is no support for fields with file documents.
	286	return Collections.emptySet();
	287	}
	288	/ getFields() /
	289
	290	/** Gets the next term of the document.
	291	* <B>NB:</B>Null string returned from getNextTerm() should
	292	* be ignored. They do not signify the lack of any more terms.
	293	* endOfDocument() should be used to check that.
	294	* @return String the next term of the document. Null returns should be
	295	* ignored.
	296	*/
	297	public String getNextTerm()
	298	{
	299	return this.tokenizer.next();
	300	}
	301	/ getNextTerm() /
	302
	303	/** Allows access to a named property of the Document. Examples might be URL,
	304	* filename etc.
	305	* @param name Name of the property. It is suggested, but not required that
	306	* this name should not be case insensitive.
	307	* @since 1.1.0
	308	*/
	309	public String getProperty(String name)
	310	{
	311	return this.properties.get(name.toLowerCase());
	312	}
	313	/ getProperty(String name) /
	314
	315	/** Returns a Reader object so client code can tokenise the document
	316	* or deal with the document itself. Examples might be extracting URLs,
	317	* language detection. */
	318	public Reader getReader()
	319	{
	320	return this.reader;
	321	}
	322	/ getReader() /
[26214]	323
	324	/**
	325	*/
	326	private String generateHash(String string)
	327	{
	328	StringBuffer sb = new StringBuffer();
	329	try
	330	{
	331	final MessageDigest message_digest = MessageDigest.getInstance("MD5");
	332	message_digest.reset();
	333	message_digest.update(string.getBytes(Charset.forName("UTF8")));
	334	final byte[] result_bytes = message_digest.digest();
	335	for (int i = 0; i < result_bytes.length; ++i)
	336	{
	337	sb.append(Integer.toHexString((result_bytes[i] & 0xFF) \| 0x100).substring(1,3));
	338	}
	339	}
	340	catch (NoSuchAlgorithmException e)
	341	{
	342	System.err.println("Exception: " + e);
	343	System.exit(0);
	344	}
	345	return sb.toString();
	346	}
	347	/ generateHash(String) /
[26186]	348	}
	349

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java@ 29648

Download in other formats: