/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is SimpleFileCollection.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
 */
package org.terrier.indexing;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.log4j.Logger;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.Files;
| 41 | /**
|
---|
| 42 | * Implements a collection that can read arbitrary files on disk. It will
|
---|
| 43 | * use the file list given to it in the constructor, or it will read the
|
---|
| 44 | * file specified by the property <tt>collection.spec</tt>.
|
---|
| 45 | * <b>Properties:</b>
|
---|
| 46 | * <ul>
|
---|
| 47 | * <li><tt>indexing.simplefilecollection.extensionsparsers</tt> - a comma delimited lists of tuples, in the form "extension:DocumentClass".
|
---|
| 48 | * For instance, one tuple could be "txt:FileDocument". The default <tt>txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument,pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument</tt>.
|
---|
| 49 | * </li>
|
---|
| 50 | * <li><tt>indexing.simplefilecollection.defaultparser</tt> - the default parser for any unknown extensions.
|
---|
| 51 | * If this property is empty, then such documents will not be opened.</li>
|
---|
| 52 | * <li><tt>indexing.simplefilecollection.recurse</tt> - whether directories should be opened looking for
|
---|
| 53 | * files.</li>
|
---|
| 54 | * </ul>
|
---|
| 55 | * @author Craig Macdonald & Vassilis Plachouras
|
---|
| 56 | */
|
---|
| 57 | public class SimpleFileCollection implements Collection/*, DocumentExtractor*/
|
---|
| 58 | {
|
---|
| 59 | protected static final Logger logger = Logger.getLogger(SimpleFileCollection.class);
|
---|
| 60 | /** The default namespace for all parsers to be loaded from. Only used if
|
---|
| 61 | * the class name specified does not contain any periods ('.') */
|
---|
| 62 | public final static String NAMESPACE_DOCUMENTS = "org.terrier.indexing.";
|
---|
| 63 |
|
---|
| 64 | /** The list of files to index.*/
|
---|
| 65 | protected LinkedList<String> FileList = new LinkedList<String>();
|
---|
| 66 |
|
---|
| 67 | /** Contains the list of files first handed to the SimpleFileCollection, allowing
|
---|
| 68 | * the SimpleFileCollection instance to be simply reset. */
|
---|
| 69 | protected List<String> firstList;
|
---|
| 70 |
|
---|
| 71 | /** This is filled during traversal, so document IDs can be matched with filenames */
|
---|
| 72 | protected List<String> indexedFiles = new ArrayList<String>();
|
---|
| 73 |
|
---|
| 74 | /** The identifier of a document in the collection.*/
|
---|
| 75 | protected int Docid = 0;
|
---|
| 76 |
|
---|
| 77 | /** Whether directories should be recursed into by this class */
|
---|
| 78 | protected boolean Recurse = Boolean.parseBoolean(ApplicationSetup.getProperty("indexing.simplefilecollection.recurse", "true"));
|
---|
| 79 |
|
---|
| 80 | /** Maps filename extensions to Document classes.
|
---|
| 81 | * The entry |DEFAULT| maps to the default document parser, specified
|
---|
| 82 | * by <tt>indexing.simplefilecollection.defaultparser</tt> */
|
---|
| 83 | protected Map<String,Class<? extends Document>> extension_DocumentClass = new HashMap<String,Class<? extends Document>>();
|
---|
| 84 |
|
---|
| 85 | /** The filename of the current file we are processing. */
|
---|
| 86 | protected String thisFilename;
|
---|
| 87 |
|
---|
| 88 | /** The InputStream of the most recently opened document. This
|
---|
| 89 | * is used to ensure that files are closed once they have been
|
---|
| 90 | * finished reading. */
|
---|
| 91 | protected InputStream currentStream = null;
|
---|
| 92 |
|
---|
| 93 | protected Tokeniser tokeniser = Tokeniser.getTokeniser();
|
---|
| 94 |
|
---|
| 95 | /**
|
---|
| 96 | * Constructs an instance of the class with the given list of files.
|
---|
| 97 | * @param filelist ArrayList the files to be processed by this collection.
|
---|
| 98 | */
|
---|
| 99 | public SimpleFileCollection(List<String> filelist, boolean recurse) {
|
---|
| 100 | FileList = new LinkedList<String>(filelist);
|
---|
| 101 | //keep a backup copy for reset()
|
---|
| 102 | firstList = new LinkedList<String>(filelist);
|
---|
| 103 | createExtensionDocumentMapping();
|
---|
| 104 | }
|
---|
| 105 |
|
---|
| 106 | /**
|
---|
| 107 | * A default constructor that uses the files to be processed
|
---|
| 108 | * by this collection, as specified by the property
|
---|
| 109 | * <tt>collection.spec</tt>
|
---|
| 110 | */
|
---|
| 111 | public SimpleFileCollection()
|
---|
| 112 | {
|
---|
| 113 | this(ApplicationSetup.COLLECTION_SPEC);
|
---|
| 114 | }
|
---|
| 115 |
|
---|
| 116 |
|
---|
| 117 | /**
|
---|
| 118 | * Creates an instance of the class. The files to be processed are
|
---|
| 119 | * specified in the file with the given name.
|
---|
| 120 | * @param addressCollectionFilename String the name of the file that
|
---|
| 121 | * contains the list of files to be processed by this collecion.
|
---|
| 122 | */
|
---|
| 123 | public SimpleFileCollection(String addressCollectionFilename)
|
---|
| 124 | {
|
---|
| 125 | ArrayList<String> generatedFileList = new ArrayList<String>();
|
---|
| 126 | try{
|
---|
| 127 | //opening the address_collection file
|
---|
| 128 | BufferedReader br = Files.openFileReader(addressCollectionFilename);
|
---|
| 129 | //iterate through each entry of the address_collection file
|
---|
| 130 | String filename = br.readLine();
|
---|
| 131 | while (filename != null) {
|
---|
| 132 | //if the line starts with #, then assume it is
|
---|
| 133 | //a comment and proceed to the next one
|
---|
| 134 | if (filename.startsWith("#")) {
|
---|
| 135 | filename = br.readLine();
|
---|
| 136 | continue;
|
---|
| 137 | }
|
---|
| 138 | if(logger.isDebugEnabled()){
|
---|
| 139 | logger.debug("Added "+filename+" to filelist for SimpleFileCollection");
|
---|
| 140 | }
|
---|
| 141 | generatedFileList.add(filename);
|
---|
| 142 | filename = br.readLine();
|
---|
| 143 | }
|
---|
| 144 |
|
---|
| 145 | }catch(IOException ioe) {
|
---|
| 146 | logger.error("problem opening address list of files in SimpleFileCollectio: ",ioe);
|
---|
| 147 | }
|
---|
| 148 | FileList = new LinkedList<String>(generatedFileList);
|
---|
| 149 | firstList = new LinkedList<String>(generatedFileList);
|
---|
| 150 | createExtensionDocumentMapping();
|
---|
| 151 | }
|
---|
| 152 |
|
---|
| 153 |
|
---|
| 154 | /** Parses the properties <tt>indexing.simplefilecollection.extensionsparsers</tt>
|
---|
| 155 | * and <tt>indexing.simplefilecollection.defaultparser</tt> and attempts to load
|
---|
| 156 | * all the mentioned classes, in a hashtable mapping filename extension to their
|
---|
| 157 | * respective parsers. If <tt>indexing.simplefilecollection.defaultparser</tt>
|
---|
| 158 | * is set, then that class will be used to attempt to parse documents that no
|
---|
| 159 | * explicit parser is set. */
|
---|
| 160 | protected void createExtensionDocumentMapping()
|
---|
| 161 | {
|
---|
| 162 | String staticMappings = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers",
|
---|
| 163 | "txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument," +
|
---|
| 164 | "pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,"+
|
---|
| 165 | "doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument");
|
---|
| 166 | String defaultMapping = ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","");
|
---|
| 167 | if (staticMappings.length() > 0)
|
---|
| 168 | {
|
---|
| 169 | String[] mappings = staticMappings.split("\\s*,\\s*");
|
---|
| 170 | for(int i=0;i<mappings.length;i++)
|
---|
| 171 | {
|
---|
| 172 | if (mappings[i].indexOf(":") < 1)
|
---|
| 173 | continue;
|
---|
| 174 | String[] mapping = mappings[i].split(":");
|
---|
| 175 | if (mapping.length == 2 && mapping[0].length() > 0
|
---|
| 176 | && mapping[1].length() > 0)
|
---|
| 177 | {
|
---|
| 178 | if (mapping[1].indexOf(".") == -1)
|
---|
| 179 | mapping[1] = NAMESPACE_DOCUMENTS + mapping[1];
|
---|
| 180 | else if (mapping[1].startsWith("uk.ac.gla.terrier"))
|
---|
| 181 | mapping[1] = mapping[1].replaceAll("uk.ac.gla.terrier", "org.terrier");
|
---|
| 182 | try{
|
---|
| 183 | Class<? extends Document> d = Class.forName(mapping[1], false, this.getClass().getClassLoader()).asSubclass(Document.class);
|
---|
| 184 | extension_DocumentClass.put(mapping[0].toLowerCase(), d);
|
---|
| 185 | }catch (Exception e){
|
---|
| 186 | /*warning, just ignore */
|
---|
| 187 | logger.warn("Missing class " + mapping[1] + " for " +
|
---|
| 188 | mapping[0].toLowerCase() + " files.",e);
|
---|
| 189 | }
|
---|
| 190 | }
|
---|
| 191 | }
|
---|
| 192 | }
|
---|
| 193 | //set the mapping for the default parser
|
---|
| 194 | if (!defaultMapping.equals("")) {
|
---|
| 195 | if (defaultMapping.indexOf(".") == -1)
|
---|
| 196 | defaultMapping = NAMESPACE_DOCUMENTS + defaultMapping;
|
---|
| 197 | else if (defaultMapping.startsWith("uk.ac.gla.terrier"))
|
---|
| 198 | defaultMapping = defaultMapping.replaceAll("uk.ac.gla.terrier", "org.terrier");
|
---|
| 199 |
|
---|
| 200 | try{
|
---|
| 201 | Class<? extends Document> d = Class.forName(defaultMapping, false, this.getClass().getClassLoader()).asSubclass(Document.class);
|
---|
| 202 | extension_DocumentClass.put("|DEFAULT|", d);
|
---|
| 203 | }catch (Exception e){
|
---|
| 204 | logger.warn("Missing default class " + defaultMapping, e);
|
---|
| 205 | }
|
---|
| 206 | }
|
---|
| 207 | }
|
---|
| 208 |
|
---|
| 209 | /**
|
---|
| 210 | * Check whether there is a next document in the collection to be processed
|
---|
| 211 | * @return has next
|
---|
| 212 | */
|
---|
| 213 | public boolean hasNext() {
|
---|
| 214 | return ! endOfCollection();
|
---|
| 215 | }
|
---|
| 216 |
|
---|
| 217 | /**
|
---|
| 218 | * Move onto the next document in the collection to be processed.
|
---|
| 219 | * @return next document
|
---|
| 220 | */
|
---|
| 221 | public Document next()
|
---|
| 222 | {
|
---|
| 223 | nextDocument();
|
---|
| 224 | return getDocument();
|
---|
| 225 | }
|
---|
| 226 |
|
---|
| 227 | /**
|
---|
| 228 | * This is unsupported by this Collection implementation, and
|
---|
| 229 | * any calls will throw UnsupportedOperationException
|
---|
| 230 | * Throws UnsupportedOperationException on all invocations */
|
---|
| 231 | public void remove()
|
---|
| 232 | {
|
---|
| 233 | throw new UnsupportedOperationException("Iterator.remove() not supported");
|
---|
| 234 | }
|
---|
| 235 |
|
---|
| 236 | /**
|
---|
| 237 | * Move onto the next document in the collection to be processed.
|
---|
| 238 | * @return boolean true if there are more documents in the collection,
|
---|
| 239 | * otherwise return false.*/
|
---|
| 240 | public boolean nextDocument()
|
---|
| 241 | {
|
---|
| 242 | if (FileList.size() == 0)
|
---|
| 243 | return false;
|
---|
| 244 | boolean rtr = false;
|
---|
| 245 | thisFilename = null;
|
---|
| 246 | while(FileList.size() > 0 && ! rtr)
|
---|
| 247 | {
|
---|
| 248 | thisFilename = FileList.removeFirst();
|
---|
| 249 | logger.info("NEXT: "+thisFilename);
|
---|
| 250 |
|
---|
| 251 | if (! Files.exists(thisFilename) || ! Files.canRead(thisFilename) )
|
---|
| 252 | {
|
---|
| 253 | if (! Files.exists(thisFilename))
|
---|
| 254 | logger.warn("File doesn't exist: "+thisFilename);
|
---|
| 255 | else if (! Files.canRead(thisFilename) )
|
---|
| 256 | logger.warn("File cannot be read: "+thisFilename);
|
---|
| 257 | rtr = nextDocument();
|
---|
| 258 | }
|
---|
| 259 | else if (Files.isDirectory(thisFilename))
|
---|
| 260 | {
|
---|
| 261 | //we're allowed to recurse into directories
|
---|
| 262 | if(Recurse)
|
---|
| 263 | addDirectoryListing();
|
---|
| 264 | }
|
---|
| 265 | else
|
---|
| 266 | { //this file is fine - use it!
|
---|
| 267 | //this block ensures that DocId is only increased once per file
|
---|
| 268 | Docid++;
|
---|
| 269 | rtr = true;
|
---|
| 270 | }
|
---|
| 271 | }//loop ends
|
---|
| 272 | return rtr;
|
---|
| 273 | }
|
---|
| 274 | /**
|
---|
| 275 | * Return the current document in the collection.
|
---|
| 276 | * @return Document the next document object from the collection.
|
---|
| 277 | */
|
---|
| 278 | public Document getDocument()
|
---|
| 279 | {
|
---|
| 280 | InputStream in = null;
|
---|
| 281 | if (currentStream != null)
|
---|
| 282 | {
|
---|
| 283 | try{
|
---|
| 284 | currentStream.close();
|
---|
| 285 | currentStream = null;
|
---|
| 286 | }catch (IOException ioe) {
|
---|
| 287 | logger.warn("IOException while closing file being read", ioe);
|
---|
| 288 | }
|
---|
| 289 | }
|
---|
| 290 | if (thisFilename == null)
|
---|
| 291 | {
|
---|
| 292 | return null;
|
---|
| 293 | }
|
---|
| 294 | String filename = null;
|
---|
| 295 | try{
|
---|
| 296 | in = Files.openFileStream(thisFilename);
|
---|
| 297 | filename = thisFilename.replaceAll("\\.gz$","");
|
---|
| 298 | }catch(IOException ioe){
|
---|
| 299 | logger.warn("Problem reading "+thisFilename+" in "+
|
---|
| 300 | "SimpleFileCollection.getDocuent() : ",ioe);
|
---|
| 301 | }
|
---|
| 302 | currentStream = in;
|
---|
| 303 | return makeDocument(filename, in);
|
---|
| 304 |
|
---|
| 305 | }
|
---|
| 306 |
|
---|
| 307 |
|
---|
| 308 | /** Given the opened document in, of Filename and File f, work out which
|
---|
| 309 | * parser to try, and instantiate it. If you wish to use a different
|
---|
| 310 | * constructor for opening documents, then you need to subclass this method.
|
---|
| 311 | * @param Filename the filename of the currently open document
|
---|
| 312 | * @param in The stream of the currently open document
|
---|
| 313 | * @return Document object to parse the document, or null if no suitable parser
|
---|
| 314 | * exists.*/
|
---|
| 315 | protected Document makeDocument(String Filename, InputStream in)
|
---|
| 316 | {
|
---|
| 317 | if (Filename == null || in == null)
|
---|
| 318 | return null;
|
---|
| 319 | String[] splitStr = Filename.split("\\.");
|
---|
| 320 | String ext = splitStr[splitStr.length-1].toLowerCase();
|
---|
| 321 | Class<? extends Document> reader = extension_DocumentClass.get(ext);
|
---|
| 322 | Document rtr = null;
|
---|
| 323 |
|
---|
| 324 | /*If a document doesn't have an associated parser,
|
---|
| 325 | check the default one */
|
---|
| 326 | if (reader == null) {
|
---|
| 327 | reader = extension_DocumentClass.get("|DEFAULT|");
|
---|
| 328 | }
|
---|
| 329 | /*if there is no default parser, then tough luck for that file,
|
---|
| 330 | but it's ignored */
|
---|
| 331 | if (reader == null) {
|
---|
| 332 | logger.warn("No available parser for file " + Filename + ", file is ignored.");
|
---|
| 333 | return null;
|
---|
| 334 | }
|
---|
| 335 | logger.debug("Using "+reader.getName() + " to read "+ Filename);
|
---|
| 336 |
|
---|
| 337 | /* now attempt to instantiate the class */
|
---|
| 338 | try{
|
---|
| 339 | Map<String,String> docProperties = new HashMap<String,String>(5);
|
---|
| 340 | // [jmt12] I need the Docid in the Document instance
|
---|
| 341 | docProperties.put("docno", this.getDocid());
|
---|
| 342 | docProperties.put("filename", Filename);
|
---|
| 343 | //and instantiate
|
---|
| 344 | rtr = reader.getConstructor(InputStream.class, Map.class, Tokeniser.class).newInstance(in, docProperties, tokeniser);
|
---|
| 345 | indexedFiles.add(thisFilename);
|
---|
| 346 | }catch (OutOfMemoryError e){
|
---|
| 347 | logger.warn("Problem instantiating a document class; Out of memory error occured: ",e);
|
---|
| 348 | System.gc();
|
---|
| 349 | }catch (StackOverflowError e){
|
---|
| 350 | logger.warn("Problem instantiating a document class; Stack Overflow error occured: ",e);
|
---|
| 351 | }catch (Exception e){
|
---|
| 352 | logger.warn("Problem instantiating a document class: ",e);
|
---|
| 353 | }
|
---|
| 354 | rtr.getAllProperties().put("docno", this.getDocid());
|
---|
| 355 | return rtr;
|
---|
| 356 | }
|
---|
| 357 |
|
---|
| 358 | /**
|
---|
| 359 | * Checks whether there are more documents in the colection.
|
---|
| 360 | * @return boolean true if there are no more documents in the collection,
|
---|
| 361 | * otherwise it returns false.
|
---|
| 362 | */
|
---|
| 363 | public boolean endOfCollection()
|
---|
| 364 | {
|
---|
| 365 | return (FileList.size() == 0);
|
---|
| 366 | }
|
---|
| 367 |
|
---|
| 368 | /**
|
---|
| 369 | * Starts again from the beginning of the collection.
|
---|
| 370 | */
|
---|
| 371 | public void reset()
|
---|
| 372 | {
|
---|
| 373 | Docid = 0;
|
---|
| 374 | FileList = new LinkedList<String>(firstList);
|
---|
| 375 | indexedFiles = new ArrayList<String>();
|
---|
| 376 | }
|
---|
| 377 |
|
---|
| 378 | /**
|
---|
| 379 | * Returns the current document's identifier string.
|
---|
| 380 | * @return String the identifier of the current document.
|
---|
| 381 | */
|
---|
| 382 | public String getDocid()
|
---|
| 383 | {
|
---|
| 384 | return Docid+"";
|
---|
| 385 | }
|
---|
| 386 |
|
---|
| 387 | @Override
|
---|
| 388 | public void close()
|
---|
| 389 | {
|
---|
| 390 | if (currentStream != null)
|
---|
| 391 | {
|
---|
| 392 | try{
|
---|
| 393 | currentStream.close();
|
---|
| 394 | } catch (IOException ioe) {
|
---|
| 395 | logger.error("Exception occured while trying to close an IO stream",ioe);
|
---|
| 396 | }
|
---|
| 397 | }
|
---|
| 398 | }
|
---|
| 399 |
|
---|
| 400 | /** Returns the ist of indexed files in the order they were indexed in. */
|
---|
| 401 | public List<String> getFileList()
|
---|
| 402 | {
|
---|
| 403 | return indexedFiles;
|
---|
| 404 | }
|
---|
| 405 |
|
---|
| 406 | /** Called when <tt>thisFile</tt> is identified as a directory, this adds the entire
|
---|
| 407 | * contents of the directory onto the list to be processed. */
|
---|
| 408 | protected void addDirectoryListing()
|
---|
| 409 | {
|
---|
| 410 | //File[] contents = thisFile.listFiles();
|
---|
| 411 | String[] dirContents = Files.list( thisFilename );
|
---|
| 412 | if (dirContents == null)
|
---|
| 413 | return;
|
---|
| 414 | for(String e : dirContents)
|
---|
| 415 | {
|
---|
| 416 | if (e.equals(".") || e.equals(".."))
|
---|
| 417 | continue;
|
---|
| 418 | FileList.add(thisFilename + ApplicationSetup.FILE_SEPARATOR + e);
|
---|
| 419 | }
|
---|
| 420 | /*for(int i=0;i<contents.length;i++)
|
---|
| 421 | {
|
---|
| 422 | FileList.add(contents[i].getAbsolutePath());
|
---|
| 423 | }*/
|
---|
| 424 | }
|
---|
| 425 |
|
---|
| 426 | /**
|
---|
| 427 | * Simple test case. Pass the filename of a file that lists files
|
---|
| 428 | * to be processed to this test case.
|
---|
| 429 | */
|
---|
| 430 | public static void main(String[] args) {
|
---|
| 431 | Indexer in = new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
|
---|
| 432 | in.createDirectIndex(new Collection[] {new SimpleFileCollection(args[0])});
|
---|
| 433 | in.createInvertedIndex();
|
---|
| 434 | }
|
---|
| 435 |
|
---|
| 436 | }
|
---|