root/gs2-extensions/video-and-audio/trunk/src/opt/Terrier/ImageDocument.java @ 26207

Revision 26207, 9.1 KB (checked in by jmt12, 6 years ago)

Sanity check trying to track down bogus docno value

Line 
1/**
2 *  Adding support for Images in Terrier
3 *  @author: John Thompson, jmt12, #9826509
4 *
5 *  The contents of this file are subject to the Mozilla Public License
6 *  Version 1.1 (the "License"); you may not use this file except in
7 *  compliance with the License. You may obtain a copy of the License at
8 *  http://www.mozilla.org/MPL/
9 *
10 *  Software distributed under the License is distributed on an "AS IS"
11 *  basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
12 *  the License for the specific language governing rights and limitations
13 *  under the License.
14 *
15 *  Copyright (c) 2011 The University of Waikato. All Rights Reserved.
16 */
17package org.terrier.indexing;
18
19import java.io.InputStream;
20import java.io.StringReader;
21import java.io.Reader;
22import java.nio.file.Files;
23import java.nio.file.Path;
24import java.nio.file.Paths;
25import java.util.Collections;
26import java.util.Arrays;
27import java.util.Map;
28import java.util.Set;
29
30import org.apache.log4j.Logger;
31import org.terrier.indexing.StreamGobbler;
32import org.terrier.indexing.tokenisation.TokenStream;
33import org.terrier.indexing.tokenisation.Tokeniser;
34import org.terrier.utility.ApplicationSetup;
35
36public class ImageDocument
37  implements Document
38{
39  /** A reference to the logger for messaging */
40  protected static final Logger logger = Logger.getLogger(FileDocument.class);
41  /** The map of properties (fields) for this document. */
42  protected Map<String,String> properties;
43  /** A reader built from a dummy text string. */
44  protected Reader reader;
45  /** A token stream produced by the configured tokeniser when feed the dummy
46   *  reader.
47   */
48  protected TokenStream tokenizer;
49
50  /** The preview filetype. **/
51  protected final String preview_format = ApplicationSetup.getProperty("ImageDocument.preview_format", "jpg");
52  /** The preview size (width). **/
53  protected final String preview_width = ApplicationSetup.getProperty("ImageDocument.preview_width", "200");
54
55  /** Default constructor. **/
56  protected ImageDocument() {}
57
58  /** Constructs an instance of the ImageDocument from the given input stream.
59   *  @param docStream the input stream that reads the file.
60   *  @param docProperties the initial properties (docno, filename)
61   *  @param tok the tokeniser defined for this collection
62   */
63  public ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok)
64  {
65    logger.info("ImageDocument::ImageDocument()");
66    // Initialization from arguments
67    this.properties = default_properties;
68
69    // Set properties
70    logger.info("ImageDocument - extracting properties");
71    // A. Hardcoded properties
72    this.properties.put("parser", "ImageDocument");
73    this.properties.put("abstract", "This is an image so here is some dummy text to prevent indexer failing.");
74    // B. Properties derived from filename
75    String title = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(System.getProperty("file.separator")) + 1);
76    this.properties.put("title", title);
77    String ext = this.properties.get("filename").substring(properties.get("filename").lastIndexOf(".") + 1);
78    String target_filename = "doc." + ext;
79    this.properties.put("source","doc." + ext);
80    String assoc_filename = "D" + this.properties.get("docno");
81    if (assoc_filename.equals("Dnull"))
82    {
83      System.err.println("Error! Bogus assoc dir: " + this.properties.get("docno"));
84      System.exit(0);
85    }
86
87    this.properties.put("assocfile", assoc_filename);
88
89    // Copy (symlink) the file into place in the shared directory
90    Path source_path = Paths.get(properties.get("filename"));
91    Path assoc_path = Paths.get(ApplicationSetup.TERRIER_SHARE, "images", "assoc", assoc_filename);
92    assoc_path.toFile().mkdirs(); // bet there is a nice nio way to do this
93    Path target_path = assoc_path.resolve(target_filename);
94    if (target_path.toFile().exists())
95    {
96      logger.info("ImageDocument - removing existing (old) associated image");
97      try
98      {
99        Files.delete(target_path);
100      }
101      catch (Exception e)
102      {
103        logger.error("Exception while deleting old image: ", e);
104      }
105    }
106    logger.info("ImageDocument - symlinking image into assoc directory");
107    try
108    {
109      Files.createSymbolicLink(target_path, source_path);
110    }
111    // not supported? We'll try copying below
112    catch (UnsupportedOperationException ex)
113    {
114    }
115    // All other exceptions can be fatal
116    catch (Exception e)
117    {
118      logger.error("Exception while symlinking image: ", e);
119    }
120    // - copy if the file doesn't exist yet
121    if (!target_path.toFile().exists())
122    {
123      logger.info("ImageDocument - symlink filaed, copying instead");
124      try
125      {
126        Files.copy(source_path, target_path);
127      }
128      // Fatality!
129      catch (Exception e)
130      {
131        logger.error("Exception while copying image: ", e);
132      }
133    }
134
135    // Generate preview image
136    logger.info("ImageDocument - generate preview image");
137    try
138    {
139      String preview_filename = this.generatePreview(source_path, assoc_path);
140      this.properties.put("preview",preview_filename);
141    }
142    catch (Exception e)
143    {
144      logger.error("Exception while generating preview image: ", e);
145    }
146
147    // Create a dummy reader around some dummy text and then tokenize it
148    logger.info("ImageDocument - feed dummy text as token stream to indexer");
149    try
150    {
151      this.reader = new StringReader(this.properties.get("abstract"));
152      this.tokenizer = tok.tokenise(this.reader);
153    }
154    catch (Exception e)
155    {
156      logger.error("Exception while creating dummy text stream: ", e);
157    }
158    logger.info("ImageDocument - Complete!");
159  }
160  /** ImageDocument(InputStream istream, Map<String,String> default_properties, Tokeniser tok) **/
161
162  /** Returns true when the end of the document has been reached, and there
163   *  are no other terms to be retrieved from it.
164   *  @return boolean true if there are no more terms in the document, otherwise
165   *          it returns false.
166   */
167  public boolean endOfDocument()
168  {
169    return !this.tokenizer.hasNext();
170  }
171  /** endOfDocument() **/
172
173  /** Use ImageMagick to generate a preview image.
174   *  @pre assumes you have ImageMagick installed and available on Path
175   *  @pre uses member variables preview_format and preview_width
176   *  @return the filename of the preview image (within the assoc directory)
177   */
178  private String generatePreview(Path source_path, Path assoc_path)
179    throws Exception
180  {
181    String preview_filename = "preview." + this.preview_format;
182    Path preview_path = assoc_path.resolve(preview_filename);
183    String convert_command[] = {
184      "convert",
185      source_path.toString(),
186      "-resize",
187      this.preview_width + "x",
188      preview_path.toString()
189    };
190    logger.info("ImageDocument - convert command: " + Arrays.toString(convert_command));
191    Process convert_process = Runtime.getRuntime().exec(convert_command);
192    // Gobble up the streams to prevent them hanging the process when buffers
193    // are full
194    StreamGobbler convert_process_error_gobbler = new StreamGobbler(convert_process.getErrorStream());
195    convert_process_error_gobbler.start();
196    StreamGobbler convert_process_input_gobbler = new StreamGobbler(convert_process.getInputStream());
197    convert_process_input_gobbler.start();
198    // Let the conversion finish
199    int convert_status = convert_process.waitFor();
200    if (convert_status != 0 || !preview_path.toFile().exists())
201    {
202      throw new Exception("Convert command failed (exit status: " + convert_status + ")");
203    }
204    return preview_filename;
205  }
206  /** generatePreview(Path, Path) **/
207
208  /** Returns the underlying map of all the properties defined by this Document.
209   *  @since 1.1.0
210   */
211  public Map<String,String> getAllProperties()
212  {
213    return this.properties;
214  }
215  /** getAllProperties() **/
216
217  /** Returns a list of the fields the current term appears in.
218   * @return HashSet a set of the terms that the current term appears in.
219   */
220  public Set<String> getFields()
221  {
222    // Returns null because there is no support for fields with file documents.
223    return Collections.emptySet();
224  }
225  /** getFields() **/
226
227  /** Gets the next term of the document.
228   *  <B>NB:</B>Null string returned from getNextTerm() should
229   *  be ignored. They do not signify the lack of any more terms.
230   *  endOfDocument() should be used to check that.
231   *  @return String the next term of the document. Null returns should be
232   *          ignored.
233   */
234  public String getNextTerm()
235  {
236    return this.tokenizer.next();
237  }
238  /** getNextTerm() **/
239
240  /** Allows access to a named property of the Document. Examples might be URL,
241   *  filename etc.
242   *  @param name Name of the property. It is suggested, but not required that
243   *         this name should not be case insensitive.
244   *  @since 1.1.0
245   */
246  public String getProperty(String name)
247  {
248    return this.properties.get(name.toLowerCase());
249  }
250  /** getProperty(String name) **/
251
252  /** Returns a Reader object so client code can tokenise the document
253   * or deal with the document itself. Examples might be extracting URLs,
254   * language detection. */
255  public Reader getReader()
256  {
257    return this.reader;
258  }
259  /** getReader() **/
260}
261
Note: See TracBrowser for help on using the browser.