source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/SimpleFileCollection.java@ 26209

Last change on this file since 26209 was 26209, checked in by jmt12, 12 years ago

Gah - forgot I'd customized the SimpleFileCollection to ensure docno is available in the initial properties of the document (before it is indexed)

File size: 14.9 KB
Line 
1/*
2 * Terrier - Terabyte Retriever
3 * Webpage: http://terrier.org
4 * Contact: terrier{a.}dcs.gla.ac.uk
5 * University of Glasgow - School of Computing Science
6 * http://www.gla.ac.uk/
7 *
8 * The contents of this file are subject to the Mozilla Public License
9 * Version 1.1 (the "License"); you may not use this file except in
10 * compliance with the License. You may obtain a copy of the License at
11 * http://www.mozilla.org/MPL/
12 *
13 * Software distributed under the License is distributed on an "AS IS"
14 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
15 * the License for the specific language governing rights and limitations
16 * under the License.
17 *
18 * The Original Code is SimpleFileCollection.java.
19 *
20 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
21 * All Rights Reserved.
22 *
23 * Contributor(s):
24 * Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
25 * Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
26 */
27package org.terrier.indexing;
28import java.io.BufferedReader;
29import java.io.IOException;
30import java.io.InputStream;
31import java.util.ArrayList;
32import java.util.HashMap;
33import java.util.LinkedList;
34import java.util.List;
35import java.util.Map;
36
37import org.apache.log4j.Logger;
38import org.terrier.indexing.tokenisation.Tokeniser;
39import org.terrier.utility.ApplicationSetup;
40import org.terrier.utility.Files;
41/**
42 * Implements a collection that can read arbitrary files on disk. It will
43 * use the file list given to it in the constructor, or it will read the
44 * file specified by the property <tt>collection.spec</tt>.
45 * <b>Properties:</b>
46 * <ul>
47 * <li><tt>indexing.simplefilecollection.extensionsparsers</tt> - a comma-delimited list of tuples, each in the form "extension:DocumentClass".
48 * For instance, one tuple could be "txt:FileDocument". The default is <tt>txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument,pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument</tt>.
49 * </li>
50 * <li><tt>indexing.simplefilecollection.defaultparser</tt> - the default parser for any unknown extensions.
51 * If this property is empty, then such documents will not be opened.</li>
52 * <li><tt>indexing.simplefilecollection.recurse</tt> - whether directories should be opened looking for
53 * files.</li>
54 * </ul>
55 * @author Craig Macdonald &amp; Vassilis Plachouras
56 */
57public class SimpleFileCollection implements Collection/*, DocumentExtractor*/
58{
59 protected static final Logger logger = Logger.getLogger(SimpleFileCollection.class);
60 /** The default namespace for all parsers to be loaded from. Only used if
61 * the class name specified does not contain any periods ('.') */
62 public final static String NAMESPACE_DOCUMENTS = "org.terrier.indexing.";
63
64 /** The list of files to index.*/
65 protected LinkedList<String> FileList = new LinkedList<String>();
66
67 /** Contains the list of files first handed to the SimpleFileCollection, allowing
68 * the SimpleFileCollection instance to be simply reset. */
69 protected List<String> firstList;
70
71 /** This is filled during traversal, so document IDs can be matched with filenames */
72 protected List<String> indexedFiles = new ArrayList<String>();
73
74 /** The identifier of a document in the collection.*/
75 protected int Docid = 0;
76
77 /** Whether directories should be recursed into by this class */
78 protected boolean Recurse = Boolean.parseBoolean(ApplicationSetup.getProperty("indexing.simplefilecollection.recurse", "true"));
79
80 /** Maps filename extensions to Document classes.
81 * The entry |DEFAULT| maps to the default document parser, specified
82 * by <tt>indexing.simplefilecollection.defaultparser</tt> */
83 protected Map<String,Class<? extends Document>> extension_DocumentClass = new HashMap<String,Class<? extends Document>>();
84
85 /** The filename of the current file we are processing. */
86 protected String thisFilename;
87
88 /** The InputStream of the most recently opened document. This
89 * is used to ensure that files are closed once they have been
90 * finished reading. */
91 protected InputStream currentStream = null;
92
93 protected Tokeniser tokeniser = Tokeniser.getTokeniser();
94
/**
 * Constructs an instance of the class with the given list of files.
 * @param filelist the files (and/or directories) to be processed by this collection.
 *        The list is copied, so later changes to it do not affect this collection.
 * @param recurse whether directories encountered during traversal should be
 *        expanded into their contents. This overrides the value obtained from the
 *        <tt>indexing.simplefilecollection.recurse</tt> property.
 */
public SimpleFileCollection(List<String> filelist, boolean recurse) {
    // Honour the caller's choice: previously this argument was accepted but never stored.
    Recurse = recurse;
    FileList = new LinkedList<String>(filelist);
    //keep a backup copy for reset()
    firstList = new LinkedList<String>(filelist);
    createExtensionDocumentMapping();
}
105
/**
 * Default constructor. Delegates to the filename-based constructor,
 * passing the location given by <tt>collection.spec</tt>
 * (<tt>ApplicationSetup.COLLECTION_SPEC</tt>) as the file that lists
 * the files to be processed.
 */
public SimpleFileCollection() {
    this(ApplicationSetup.COLLECTION_SPEC);
}
115
116
117 /**
118 * Creates an instance of the class. The files to be processed are
119 * specified in the file with the given name.
120 * @param addressCollectionFilename String the name of the file that
121 * contains the list of files to be processed by this collecion.
122 */
123 public SimpleFileCollection(String addressCollectionFilename)
124 {
125 ArrayList<String> generatedFileList = new ArrayList<String>();
126 try{
127 //opening the address_collection file
128 BufferedReader br = Files.openFileReader(addressCollectionFilename);
129 //iterate through each entry of the address_collection file
130 String filename = br.readLine();
131 while (filename != null) {
132 //if the line starts with #, then assume it is
133 //a comment and proceed to the next one
134 if (filename.startsWith("#")) {
135 filename = br.readLine();
136 continue;
137 }
138 if(logger.isDebugEnabled()){
139 logger.debug("Added "+filename+" to filelist for SimpleFileCollection");
140 }
141 generatedFileList.add(filename);
142 filename = br.readLine();
143 }
144
145 }catch(IOException ioe) {
146 logger.error("problem opening address list of files in SimpleFileCollectio: ",ioe);
147 }
148 FileList = new LinkedList<String>(generatedFileList);
149 firstList = new LinkedList<String>(generatedFileList);
150 createExtensionDocumentMapping();
151 }
152
153
154 /** Parses the properties <tt>indexing.simplefilecollection.extensionsparsers</tt>
155 * and <tt>indexing.simplefilecollection.defaultparser</tt> and attempts to load
156 * all the mentioned classes, in a hashtable mapping filename extension to their
157 * respective parsers. If <tt>indexing.simplefilecollection.defaultparser</tt>
158 * is set, then that class will be used to attempt to parse documents that no
159 * explicit parser is set. */
160 protected void createExtensionDocumentMapping()
161 {
162 String staticMappings = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers",
163 "txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument," +
164 "pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,"+
165 "doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument");
166 String defaultMapping = ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","");
167 if (staticMappings.length() > 0)
168 {
169 String[] mappings = staticMappings.split("\\s*,\\s*");
170 for(int i=0;i<mappings.length;i++)
171 {
172 if (mappings[i].indexOf(":") < 1)
173 continue;
174 String[] mapping = mappings[i].split(":");
175 if (mapping.length == 2 && mapping[0].length() > 0
176 && mapping[1].length() > 0)
177 {
178 if (mapping[1].indexOf(".") == -1)
179 mapping[1] = NAMESPACE_DOCUMENTS + mapping[1];
180 else if (mapping[1].startsWith("uk.ac.gla.terrier"))
181 mapping[1] = mapping[1].replaceAll("uk.ac.gla.terrier", "org.terrier");
182 try{
183 Class<? extends Document> d = Class.forName(mapping[1], false, this.getClass().getClassLoader()).asSubclass(Document.class);
184 extension_DocumentClass.put(mapping[0].toLowerCase(), d);
185 }catch (Exception e){
186 /*warning, just ignore */
187 logger.warn("Missing class " + mapping[1] + " for " +
188 mapping[0].toLowerCase() + " files.",e);
189 }
190 }
191 }
192 }
193 //set the mapping for the default parser
194 if (!defaultMapping.equals("")) {
195 if (defaultMapping.indexOf(".") == -1)
196 defaultMapping = NAMESPACE_DOCUMENTS + defaultMapping;
197 else if (defaultMapping.startsWith("uk.ac.gla.terrier"))
198 defaultMapping = defaultMapping.replaceAll("uk.ac.gla.terrier", "org.terrier");
199
200 try{
201 Class<? extends Document> d = Class.forName(defaultMapping, false, this.getClass().getClassLoader()).asSubclass(Document.class);
202 extension_DocumentClass.put("|DEFAULT|", d);
203 }catch (Exception e){
204 logger.warn("Missing default class " + defaultMapping, e);
205 }
206 }
207 }
208
209 /**
210 * Check whether there is a next document in the collection to be processed
211 * @return has next
212 */
213 public boolean hasNext() {
214 return ! endOfCollection();
215 }
216
217 /**
218 * Move onto the next document in the collection to be processed.
219 * @return next document
220 */
221 public Document next()
222 {
223 nextDocument();
224 return getDocument();
225 }
226
227 /**
228 * This is unsupported by this Collection implementation, and
229 * any calls will throw UnsupportedOperationException
230 * Throws UnsupportedOperationException on all invocations */
231 public void remove()
232 {
233 throw new UnsupportedOperationException("Iterator.remove() not supported");
234 }
235
236 /**
237 * Move onto the next document in the collection to be processed.
238 * @return boolean true if there are more documents in the collection,
239 * otherwise return false.*/
240 public boolean nextDocument()
241 {
242 if (FileList.size() == 0)
243 return false;
244 boolean rtr = false;
245 thisFilename = null;
246 while(FileList.size() > 0 && ! rtr)
247 {
248 thisFilename = FileList.removeFirst();
249 logger.info("NEXT: "+thisFilename);
250
251 if (! Files.exists(thisFilename) || ! Files.canRead(thisFilename) )
252 {
253 if (! Files.exists(thisFilename))
254 logger.warn("File doesn't exist: "+thisFilename);
255 else if (! Files.canRead(thisFilename) )
256 logger.warn("File cannot be read: "+thisFilename);
257 rtr = nextDocument();
258 }
259 else if (Files.isDirectory(thisFilename))
260 {
261 //we're allowed to recurse into directories
262 if(Recurse)
263 addDirectoryListing();
264 }
265 else
266 { //this file is fine - use it!
267 //this block ensures that DocId is only increased once per file
268 Docid++;
269 rtr = true;
270 }
271 }//loop ends
272 return rtr;
273 }
274 /**
275 * Return the current document in the collection.
276 * @return Document the next document object from the collection.
277 */
278 public Document getDocument()
279 {
280 InputStream in = null;
281 if (currentStream != null)
282 {
283 try{
284 currentStream.close();
285 currentStream = null;
286 }catch (IOException ioe) {
287 logger.warn("IOException while closing file being read", ioe);
288 }
289 }
290 if (thisFilename == null)
291 {
292 return null;
293 }
294 String filename = null;
295 try{
296 in = Files.openFileStream(thisFilename);
297 filename = thisFilename.replaceAll("\\.gz$","");
298 }catch(IOException ioe){
299 logger.warn("Problem reading "+thisFilename+" in "+
300 "SimpleFileCollection.getDocuent() : ",ioe);
301 }
302 currentStream = in;
303 return makeDocument(filename, in);
304
305 }
306
307
308 /** Given the opened document in, of Filename and File f, work out which
309 * parser to try, and instantiate it. If you wish to use a different
310 * constructor for opening documents, then you need to subclass this method.
311 * @param Filename the filename of the currently open document
312 * @param in The stream of the currently open document
313 * @return Document object to parse the document, or null if no suitable parser
314 * exists.*/
315 protected Document makeDocument(String Filename, InputStream in)
316 {
317 if (Filename == null || in == null)
318 return null;
319 String[] splitStr = Filename.split("\\.");
320 String ext = splitStr[splitStr.length-1].toLowerCase();
321 Class<? extends Document> reader = extension_DocumentClass.get(ext);
322 Document rtr = null;
323
324 /*If a document doesn't have an associated parser,
325 check the default one */
326 if (reader == null) {
327 reader = extension_DocumentClass.get("|DEFAULT|");
328 }
329 /*if there is no default parser, then tough luck for that file,
330 but it's ignored */
331 if (reader == null) {
332 logger.warn("No available parser for file " + Filename + ", file is ignored.");
333 return null;
334 }
335 logger.debug("Using "+reader.getName() + " to read "+ Filename);
336
337 /* now attempt to instantiate the class */
338 try{
339 Map<String,String> docProperties = new HashMap<String,String>(5);
340 // [jmt12] I need the Docid in the Document instance
341 docProperties.put("docno", this.getDocid());
342 docProperties.put("filename", Filename);
343 //and instantiate
344 rtr = reader.getConstructor(InputStream.class, Map.class, Tokeniser.class).newInstance(in, docProperties, tokeniser);
345 indexedFiles.add(thisFilename);
346 }catch (OutOfMemoryError e){
347 logger.warn("Problem instantiating a document class; Out of memory error occured: ",e);
348 System.gc();
349 }catch (StackOverflowError e){
350 logger.warn("Problem instantiating a document class; Stack Overflow error occured: ",e);
351 }catch (Exception e){
352 logger.warn("Problem instantiating a document class: ",e);
353 }
354 rtr.getAllProperties().put("docno", this.getDocid());
355 return rtr;
356 }
357
358 /**
359 * Checks whether there are more documents in the colection.
360 * @return boolean true if there are no more documents in the collection,
361 * otherwise it returns false.
362 */
363 public boolean endOfCollection()
364 {
365 return (FileList.size() == 0);
366 }
367
368 /**
369 * Starts again from the beginning of the collection.
370 */
371 public void reset()
372 {
373 Docid = 0;
374 FileList = new LinkedList<String>(firstList);
375 indexedFiles = new ArrayList<String>();
376 }
377
378 /**
379 * Returns the current document's identifier string.
380 * @return String the identifier of the current document.
381 */
382 public String getDocid()
383 {
384 return Docid+"";
385 }
386
387 @Override
388 public void close()
389 {
390 if (currentStream != null)
391 {
392 try{
393 currentStream.close();
394 } catch (IOException ioe) {
395 logger.error("Exception occured while trying to close an IO stream",ioe);
396 }
397 }
398 }
399
400 /** Returns the ist of indexed files in the order they were indexed in. */
401 public List<String> getFileList()
402 {
403 return indexedFiles;
404 }
405
406 /** Called when <tt>thisFile</tt> is identified as a directory, this adds the entire
407 * contents of the directory onto the list to be processed. */
408 protected void addDirectoryListing()
409 {
410 //File[] contents = thisFile.listFiles();
411 String[] dirContents = Files.list( thisFilename );
412 if (dirContents == null)
413 return;
414 for(String e : dirContents)
415 {
416 if (e.equals(".") || e.equals(".."))
417 continue;
418 FileList.add(thisFilename + ApplicationSetup.FILE_SEPARATOR + e);
419 }
420 /*for(int i=0;i<contents.length;i++)
421 {
422 FileList.add(contents[i].getAbsolutePath());
423 }*/
424 }
425
426 /**
427 * Simple test case. Pass the filename of a file that lists files
428 * to be processed to this test case.
429 */
430 public static void main(String[] args) {
431 Indexer in = new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
432 in.createDirectIndex(new Collection[] {new SimpleFileCollection(args[0])});
433 in.createInvertedIndex();
434 }
435
436}
Note: See TracBrowser for help on using the repository browser.