source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/FileDocument.java@ 26193

Last change on this file since 26193 was 26193, checked in by jmt12, 12 years ago

Not essential, but here is a replacement FileDocument that is smart enough to use the filename as a title if no other title metadata was found

File size: 7.9 KB
Line 
1/*
2 * Terrier - Terabyte Retriever
3 * Webpage: http://terrier.org
4 * Contact: terrier{a.}dcs.gla.ac.uk
5 * University of Glasgow - School of Computing Science
6 * http://www.gla.ac.uk/
7 *
8 * The contents of this file are subject to the Mozilla Public License
9 * Version 1.1 (the "License"); you may not use this file except in
10 * compliance with the License. You may obtain a copy of the License at
11 * http://www.mozilla.org/MPL/
12 *
13 * Software distributed under the License is distributed on an "AS IS"
14 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
15 * the License for the specific language governing rights and limitations
16 * under the License.
17 *
18 * The Original Code is FileDocument.java.
19 *
20 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
21 * All Rights Reserved.
22 *
23 * Contributor(s):
24 * Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
25 * Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
26 */
27package org.terrier.indexing;
28import java.io.BufferedReader;
29import java.io.IOException;
30import java.io.InputStream;
31import java.io.InputStreamReader;
32import java.io.Reader;
33import java.util.Collections;
34import java.util.HashMap;
35import java.util.Map;
36import java.util.Set;
37
38import org.apache.log4j.Logger;
39import org.terrier.indexing.tokenisation.TokenStream;
40import org.terrier.indexing.tokenisation.Tokeniser;
41import org.terrier.utility.ApplicationSetup;
42/**
43 * Models a document which corresponds to one file. The first FileDocument.abstract.length characters
44 * can be saved as an abstract.
45 * @author Craig Macdonald, Vassilis Plachouras, Richard McCreadie, Rodrygo Santos
46 */
47public class FileDocument implements Document {
48 protected static final Logger logger = Logger.getLogger(FileDocument.class);
49 /** The maximum number of digits that are allowed in valid terms. */
50 /** The input reader. */
51 protected Reader br;
52 /** End of Document. Set by the last couple of lines in getNextTerm() */
53 protected boolean EOD = false;
54
55 /** The number of bytes read from the input.*/
56 public long counter = 0;
57
58 protected Map<String,String> fileProperties;
59
60 /** The name of the file represented by this document. */
61 protected String filename;
62
63 protected TokenStream tokenStream;
64
65 protected FileDocument() {}
66
67 /** The names of the abstracts to be saved (comma separated list) **/
68 protected final String abstractname = ApplicationSetup.getProperty("FileDocument.abstract", "");
69 /** The maximum length of each named abstract (comma separated list) **/
70 protected final int abstractlength = Integer.parseInt(ApplicationSetup.getProperty("FileDocument.abstract.length", "0"));
71 /** The number of characters currently written **/
72 protected int abstractwritten = 0;
73 /** The current abstract text **/
74 StringBuilder abstractText = new StringBuilder();
75
76 protected static Map<String,String> makeFilenameProperties(String filename)
77 {
78 Map<String,String> docProperties = new HashMap<String,String>();
79 docProperties.put("filename", filename);
80 return docProperties;
81 }
82 /**
83 * create a document for a file
84 * @param _filename
85 * @param docReader
86 * @param tok
87 */
88 public FileDocument(String _filename, Reader docReader, Tokeniser tok) {
89 this(docReader, makeFilenameProperties(_filename), tok);
90 }
91 /**
92 * create a document for a file
93 * @param _filename
94 * @param docStream
95 * @param tok
96 */
97 public FileDocument(String _filename, InputStream docStream, Tokeniser tok) {
98 this(docStream, makeFilenameProperties(_filename), tok);
99 }
100 /**
101 * create a document for a file
102 * @param docReader
103 * @param docProperties
104 * @param tok
105 */
106 public FileDocument(Reader docReader, Map<String,String> docProperties, Tokeniser tok) {
107 this.br = docReader;
108 this.fileProperties = docProperties;
109 this.fileProperties.put("parser", this.getClass().getName());
110 this.filename = docProperties.get("filename");
111 try{
112 //do we have abstract enabled?
113 if (abstractname.length() != 0)
114 tokenStream = tok.tokenise(new ReaderWrapper(this.br));
115 else
116 tokenStream = tok.tokenise(this.br);
117 } catch (Exception e) {
118 throw new RuntimeException();
119 }
120 }
121
122 /**
123 * Constructs an instance of the FileDocument from the
124 * given input stream.
125 * @param docStream the input stream that reads the file.
126 */
127 public FileDocument(InputStream docStream, Map<String,String> docProperties, Tokeniser tok) {
128
129 logger.debug("FileDocument::FileDocument()");
130 logger.debug("docno: " + docProperties.get("docno"));
131 logger.debug("filename: " + docProperties.get("filename"));
132
133 this.fileProperties = docProperties;
134 this.filename = docProperties.get("filename");
135
136 // [jmt12] Create title from filename
137 int sep_pos = this.filename.lastIndexOf(System.getProperty("file.separator"));
138 String title = this.filename;
139 if (sep_pos >= 0)
140 {
141 title = this.filename.substring(sep_pos + 1);
142 }
143 this.fileProperties.put("title", title);
144
145 this.br = getReader(docStream);
146 this.fileProperties.put("parser", this.getClass().getName());
147 try{
148 //do we have abstract enabled?
149 if (abstractname.length() != 0)
150 tokenStream = tok.tokenise(new ReaderWrapper(this.br));
151 else
152 tokenStream = tok.tokenise(this.br);
153 } catch (Exception e) {
154 throw new RuntimeException();
155 }
156 }
157
158 /**
159 * A wrapper around the token stream used to lift the terms from the stream
160 * for storage in the abstract
161 * @author Richard McCreadie
162 * @since 3.5
163 */
164 public class ReaderWrapper extends Reader {
165
166 Reader underlyingStream;
167
168 /**
169 * create a wraper for token stream
170 * @param stream
171 */
172 public ReaderWrapper(Reader stream) {
173 underlyingStream = stream;
174 }
175
176 @Override
177 public int read() throws IOException {
178 final int readChar = underlyingStream.read();
179 if (abstractwritten<abstractlength) {
180 abstractText.append(((char)readChar));
181 abstractwritten++;
182 }
183 if (readChar==-1)
184 {
185 setProperty(abstractname, abstractText.toString());
186 }
187 return readChar;
188 }
189
190 @Override
191 public int read(char[] cbuf, int off, int len) throws IOException {
192 final int readChar = underlyingStream.read(cbuf,off,len);
193 if (abstractwritten<abstractlength) {
194 abstractText.append(cbuf, off, len);
195 abstractwritten++;
196 }
197 if (readChar==-1)
198 {
199 setProperty(abstractname, abstractText.toString());
200 }
201 return readChar;
202 }
203
204 @Override
205 public void close() throws IOException {
206 underlyingStream.close();
207 }
208
209 }
210
211
212
213 /** Returns the underlying buffered reader, so that client code can tokenise the
214 * document itself, and deal with it how it likes. */
215 public Reader getReader()
216 {
217 return this.br;
218 }
219
220
221 /**
222 * Returns a buffered reader that encapsulates the
223 * given input stream.
224 * @param docStream an input stream that we want to
225 * access as a buffered reader.
226 * @return the buffered reader that encapsulates the
227 * given input stream.
228 */
229 protected Reader getReader(InputStream docStream) {
230 return new BufferedReader(new InputStreamReader(docStream));
231 }
232
233 /**Gets the next term from the Document */
234 public String getNextTerm()
235 {
236 return tokenStream.next();
237 }
238 /**
239 * Returns null because there is no support for fields with
240 * file documents.
241 * @return null.
242 */
243 public Set<String> getFields() {
244 return Collections.emptySet();
245 }
246 /**
247 * Indicates whether the end of a document has been reached.
248 * @return boolean true if the end of a document has been reached,
249 * otherwise, it returns false.
250 */
251 public boolean endOfDocument() {
252 return ! tokenStream.hasNext();
253 }
254 /**
255 * Get a document property
256 */
257 public String getProperty(String name){
258 return fileProperties.get(name.toLowerCase());
259 }
260 /**
261 * Set a document property
262 */
263 public void setProperty(String name, String value)
264 {
265 fileProperties.put(name.toLowerCase(),value);
266 }
267 /**
268 * {@inheritDoc}
269 */
270 public Map<String,String> getAllProperties(){
271 return fileProperties;
272 }
273}
Note: See TracBrowser for help on using the repository browser.