/** * Adding support for Images in Terrier * @author: John Thompson, jmt12, #9826509 * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See * the License for the specific language governing rights and limitations * under the License. * * Copyright (c) 2011 The University of Waikato. All Rights Reserved. */ package org.terrier.indexing; import java.io.InputStream; import java.io.StringReader; import java.io.Reader; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.Collections; import java.util.Arrays; import java.util.Map; import java.util.Set; import org.apache.log4j.Logger; import org.terrier.indexing.StreamGobbler; import org.terrier.indexing.tokenisation.TokenStream; import org.terrier.indexing.tokenisation.Tokeniser; import org.terrier.utility.ApplicationSetup; public class ImageDocument implements Document { /** A reference to the logger for messaging */ protected static final Logger logger = Logger.getLogger(FileDocument.class); /** The map of properties (fields) for this document. */ protected Map properties; /** A reader built from a dummy text string. */ protected Reader reader; /** A token stream produced by the configured tokeniser when feed the dummy * reader. */ protected TokenStream tokenizer; /** The preview filetype. **/ protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg"); /** The preview size (width). **/ protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200"); /** Default constructor. **/ protected ImageDocument() {} /** Constructs an instance of the ImageDocument from the given input stream. * @param docStream the input stream that reads the file. * @param docProperties the initial properties (docno, filename) * @param tok the tokeniser defined for this collection */ public ImageDocument(InputStream istream, Map default_properties, Tokeniser tok) { logger.info("ImageDocument::ImageDocument()"); // Initialization from arguments this.properties = default_properties; // Set properties logger.info("ImageDocument - current properties"); for (Map.Entry entry : this.properties.entrySet()) { logger.info(entry.getKey() + "=" + entry.getValue()); } logger.info("ImageDocument - extracting properties"); // A. Hardcoded properties this.properties.put("parser", "ImageDocument"); this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing."); // B. Properties derived from filename // - A simple title for the document String filepath = this.properties.get("filename"); String title = filepath.substring(filepath.lastIndexOf(System.getProperty("file.separator")) + 1); this.properties.put("title", title); String ext = filepath.substring(filepath.lastIndexOf(".") + 1); // - The name of the copy of the original document String target_filename = "doc." + ext; this.properties.put("source","doc." + ext); // - A unique associated directory. This gets a little tricky as we need // to create the directory at the same time if an effort to promote // synchronous behaviour String unique_id = this.generateHash(filepath); // - we start with the first 4 characters int offset = 0; String assoc_filename = "D" + unique_id.substring(offset, offset + 4); // - we add ".dir" as a suffix to the directory that actually contains // files (so the non-suffixed version contains nested directories) Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir"); // - then we continue adding blocks of 4 characters until we get a // directory that doesn't already exist while (assoc_path.toFile().exists() && offset < unique_id.length()) { offset += 4; assoc_filename += System.getProperty("file.separator") + unique_id.substring(offset, offset + 4); assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename + ".dir"); } // - still not unique? but run out of unique_id... time to complain if (assoc_path.toFile().exists()) { logger.error("ImageDoument - can't determine unique assocfilepath"); System.exit(0); } // - create the directories quick... hopefully before someone else does assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this this.properties.put("assocfile", assoc_filename); // Copy (symlink) the file into place in the shared directory Path source_path = Paths.get(properties.get("filename")); Path target_path = assoc_path.resolve(target_filename); if (target_path.toFile().exists()) { logger.info("ImageDocument - removing existing (old) associated image"); try { Files.delete(target_path); } catch (Exception e) { logger.error("Exception while deleting old image: ", e); } } logger.info("ImageDocument - symlinking image into assoc directory"); try { Files.createSymbolicLink(target_path, source_path); } // not supported? We'll try copying below catch (UnsupportedOperationException ex) { } // All other exceptions can be fatal catch (Exception e) { logger.error("Exception while symlinking image: ", e); } // - copy if the file doesn't exist yet if (!target_path.toFile().exists()) { logger.info("ImageDocument - symlink filaed, copying instead"); try { Files.copy(source_path, target_path); } // Fatality! catch (Exception e) { logger.error("Exception while copying image: ", e); } } // Generate preview image logger.info("ImageDocument - generate preview image"); try { String preview_filename = this.generatePreview(source_path, assoc_path); this.properties.put("preview",preview_filename); } catch (Exception e) { logger.error("Exception while generating preview image: ", e); } // Create a dummy reader around some dummy text and then tokenize it logger.info("ImageDocument - feed dummy text as token stream to indexer"); try { this.reader = new StringReader(this.properties.get("abstract")); this.tokenizer = tok.tokenise(this.reader); } catch (Exception e) { logger.error("Exception while creating dummy text stream: ", e); } // Use OpenSIFT to generate a featureset (in Oxford format) for this image logger.info("ImageDocument - generate and record SIFT features"); try { String sift_command[] = { "siftfeat", "-x", source_path.toString() }; logger.info("ImageDocument - sift command: " + Arrays.toString(sift_command)); Process sift_process = Runtime.getRuntime().exec(sift_command); // we'd usually send STDERR to /dev/null, but a streamgobbler is easier // in Java StreamGobbler sift_process_error_gobbler = new StreamGobbler(sift_process.getErrorStream()); sift_process_error_gobbler.start(); // the SIFT features, in Oxford format, will arrive from STDOUT BufferedReader sift_br = new BufferedReader(new InputStreamReader(sift_process.getInputStream())); String line; StringBuffer oxford_features; while ((line = sift_br.readLine()) != null) { oxford_features.append(line); } // this command blocks until process completes (emit return value) which // should be shortly after it emits the last line of SIFT feature data int sift_status = sift_process.waitFor(); this.properties.put("sift", oxford_features.toString()); } catch (Exception e) { logger.error("Exception while generating preview image: ", e); } logger.info("ImageDocument - Complete!"); } /** ImageDocument(InputStream istream, Map default_properties, Tokeniser tok) **/ /** Returns true when the end of the document has been reached, and there * are no other terms to be retrieved from it. * @return boolean true if there are no more terms in the document, otherwise * it returns false. */ public boolean endOfDocument() { return !this.tokenizer.hasNext(); } /** endOfDocument() **/ /** Use ImageMagick to generate a preview image. * @pre assumes you have ImageMagick installed and available on Path * @pre uses member variables preview_format and preview_width * @return the filename of the preview image (within the assoc directory) */ private String generatePreview(Path source_path, Path assoc_path) throws Exception { String preview_filename = "preview." + this.preview_format; Path preview_path = assoc_path.resolve(preview_filename); String convert_command[] = { "convert", source_path.toString(), "-resize", this.preview_width + "x", preview_path.toString() }; logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command)); Process convert_process = Runtime.getRuntime().exec(convert_command); // Gobble up the streams to prevent them hanging the process when buffers // are full StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream()); convert_process_error_gobbler.start(); StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream()); convert_process_input_gobbler.start(); // Let the conversion finish int convert_status = convert_process.waitFor(); if (convert_status != 0 || !preview_path.toFile().exists()) { throw new Exception("Convert command failed (exit status: " + convert_status + ")"); } return preview_filename; } /** generatePreview(Path, Path) **/ /** Returns the underlying map of all the properties defined by this Document. * @since 1.1.0 */ public Map getAllProperties() { return this.properties; } /** getAllProperties() **/ /** Returns a list of the fields the current term appears in. * @return HashSet a set of the terms that the current term appears in. */ public Set getFields() { // Returns null because there is no support for fields with file documents. return Collections.emptySet(); } /** getFields() **/ /** Gets the next term of the document. * NB:Null string returned from getNextTerm() should * be ignored. They do not signify the lack of any more terms. * endOfDocument() should be used to check that. * @return String the next term of the document. Null returns should be * ignored. */ public String getNextTerm() { return this.tokenizer.next(); } /** getNextTerm() **/ /** Allows access to a named property of the Document. Examples might be URL, * filename etc. * @param name Name of the property. It is suggested, but not required that * this name should not be case insensitive. * @since 1.1.0 */ public String getProperty(String name) { return this.properties.get(name.toLowerCase()); } /** getProperty(String name) **/ /** Returns a Reader object so client code can tokenise the document * or deal with the document itself. Examples might be extracting URLs, * language detection. */ public Reader getReader() { return this.reader; } /** getReader() **/ /** */ private String generateHash(String string) { StringBuffer sb = new StringBuffer(); try { final MessageDigest message_digest = MessageDigest.getInstance("MD5"); message_digest.reset(); message_digest.update(string.getBytes(Charset.forName("UTF8"))); final byte[] result_bytes = message_digest.digest(); for (int i = 0; i < result_bytes.length; ++i) { sb.append(Integer.toHexString((result_bytes[i] & 0xFF) | 0x100).substring(1,3)); } } catch (NoSuchAlgorithmException e) { System.err.println("Exception: " + e); System.exit(0); } return sb.toString(); } /** generateHash(String) **/ }