1 | /*
|
---|
2 | * Terrier - Terabyte Retriever
|
---|
3 | * Webpage: http://terrier.org
|
---|
4 | * Contact: terrier{a.}dcs.gla.ac.uk
|
---|
5 | * University of Glasgow - School of Computing Science
|
---|
6 | * http://www.gla.ac.uk/
|
---|
7 | *
|
---|
8 | * The contents of this file are subject to the Mozilla Public License
|
---|
9 | * Version 1.1 (the "License"); you may not use this file except in
|
---|
10 | * compliance with the License. You may obtain a copy of the License at
|
---|
11 | * http://www.mozilla.org/MPL/
|
---|
12 | *
|
---|
13 | * Software distributed under the License is distributed on an "AS IS"
|
---|
14 | * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
|
---|
15 | * the License for the specific language governing rights and limitations
|
---|
16 | * under the License.
|
---|
17 | *
|
---|
18 | * The Original Code is SimpleFileCollection.java.
|
---|
19 | *
|
---|
20 | * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
|
---|
21 | * All Rights Reserved.
|
---|
22 | *
|
---|
23 | * Contributor(s):
|
---|
24 | * Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
|
---|
25 | * Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
|
---|
26 | */
|
---|
27 | package org.terrier.indexing;
|
---|
28 | import java.io.BufferedReader;
|
---|
29 | import java.io.IOException;
|
---|
30 | import java.io.InputStream;
|
---|
31 | import java.util.ArrayList;
|
---|
32 | import java.util.HashMap;
|
---|
33 | import java.util.LinkedList;
|
---|
34 | import java.util.List;
|
---|
35 | import java.util.Map;
|
---|
36 |
|
---|
37 | import org.apache.log4j.Logger;
|
---|
38 | import org.terrier.indexing.tokenisation.Tokeniser;
|
---|
39 | import org.terrier.utility.ApplicationSetup;
|
---|
40 | import org.terrier.utility.Files;
|
---|
41 | /**
|
---|
42 | * Implements a collection that can read arbitrary files on disk. It will
|
---|
43 | * use the file list given to it in the constructor, or it will read the
|
---|
44 | * file specified by the property <tt>collection.spec</tt>.
|
---|
45 | * <b>Properties:</b>
|
---|
46 | * <ul>
|
---|
47 | * <li><tt>indexing.simplefilecollection.extensionsparsers</tt> - a comma delimited lists of tuples, in the form "extension:DocumentClass".
|
---|
48 | * For instance, one tuple could be "txt:FileDocument". The default <tt>txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument,pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument</tt>.
|
---|
49 | * </li>
|
---|
50 | * <li><tt>indexing.simplefilecollection.defaultparser</tt> - the default parser for any unknown extensions.
|
---|
51 | * If this property is empty, then such documents will not be opened.</li>
|
---|
52 | * <li><tt>indexing.simplefilecollection.recurse</tt> - whether directories should be opened looking for
|
---|
53 | * files.</li>
|
---|
54 | * </ul>
|
---|
55 | * @author Craig Macdonald & Vassilis Plachouras
|
---|
56 | */
|
---|
57 | public class SimpleFileCollection implements Collection/*, DocumentExtractor*/
|
---|
58 | {
|
---|
59 | protected static final Logger logger = Logger.getLogger(SimpleFileCollection.class);
|
---|
60 | /** The default namespace for all parsers to be loaded from. Only used if
|
---|
61 | * the class name specified does not contain any periods ('.') */
|
---|
62 | public final static String NAMESPACE_DOCUMENTS = "org.terrier.indexing.";
|
---|
63 |
|
---|
64 | /** The list of files to index.*/
|
---|
65 | protected LinkedList<String> FileList = new LinkedList<String>();
|
---|
66 |
|
---|
67 | /** Contains the list of files first handed to the SimpleFileCollection, allowing
|
---|
68 | * the SimpleFileCollection instance to be simply reset. */
|
---|
69 | protected List<String> firstList;
|
---|
70 |
|
---|
71 | /** This is filled during traversal, so document IDs can be matched with filenames */
|
---|
72 | protected List<String> indexedFiles = new ArrayList<String>();
|
---|
73 |
|
---|
74 | /** The identifier of a document in the collection.*/
|
---|
75 | protected int Docid = 0;
|
---|
76 |
|
---|
77 | /** Whether directories should be recursed into by this class */
|
---|
78 | protected boolean Recurse = Boolean.parseBoolean(ApplicationSetup.getProperty("indexing.simplefilecollection.recurse", "true"));
|
---|
79 |
|
---|
80 | /** Maps filename extensions to Document classes.
|
---|
81 | * The entry |DEFAULT| maps to the default document parser, specified
|
---|
82 | * by <tt>indexing.simplefilecollection.defaultparser</tt> */
|
---|
83 | protected Map<String,Class<? extends Document>> extension_DocumentClass = new HashMap<String,Class<? extends Document>>();
|
---|
84 |
|
---|
85 | /** The filename of the current file we are processing. */
|
---|
86 | protected String thisFilename;
|
---|
87 |
|
---|
88 | /** The InputStream of the most recently opened document. This
|
---|
89 | * is used to ensure that files are closed once they have been
|
---|
90 | * finished reading. */
|
---|
91 | protected InputStream currentStream = null;
|
---|
92 |
|
---|
93 | protected Tokeniser tokeniser = Tokeniser.getTokeniser();
|
---|
94 |
|
---|
95 | /**
|
---|
96 | * Constructs an instance of the class with the given list of files.
|
---|
97 | * @param filelist ArrayList the files to be processed by this collection.
|
---|
98 | */
|
---|
99 | public SimpleFileCollection(List<String> filelist, boolean recurse) {
|
---|
100 | FileList = new LinkedList<String>(filelist);
|
---|
101 | //keep a backup copy for reset()
|
---|
102 | firstList = new LinkedList<String>(filelist);
|
---|
103 | createExtensionDocumentMapping();
|
---|
104 | }
|
---|
105 |
|
---|
106 | /**
|
---|
107 | * A default constructor that uses the files to be processed
|
---|
108 | * by this collection, as specified by the property
|
---|
109 | * <tt>collection.spec</tt>
|
---|
110 | */
|
---|
111 | public SimpleFileCollection()
|
---|
112 | {
|
---|
113 | this(ApplicationSetup.COLLECTION_SPEC);
|
---|
114 | }
|
---|
115 |
|
---|
116 |
|
---|
117 | /**
|
---|
118 | * Creates an instance of the class. The files to be processed are
|
---|
119 | * specified in the file with the given name.
|
---|
120 | * @param addressCollectionFilename String the name of the file that
|
---|
121 | * contains the list of files to be processed by this collecion.
|
---|
122 | */
|
---|
123 | public SimpleFileCollection(String addressCollectionFilename)
|
---|
124 | {
|
---|
125 | ArrayList<String> generatedFileList = new ArrayList<String>();
|
---|
126 | try{
|
---|
127 | //opening the address_collection file
|
---|
128 | BufferedReader br = Files.openFileReader(addressCollectionFilename);
|
---|
129 | //iterate through each entry of the address_collection file
|
---|
130 | String filename = br.readLine();
|
---|
131 | while (filename != null) {
|
---|
132 | //if the line starts with #, then assume it is
|
---|
133 | //a comment and proceed to the next one
|
---|
134 | if (filename.startsWith("#")) {
|
---|
135 | filename = br.readLine();
|
---|
136 | continue;
|
---|
137 | }
|
---|
138 | if(logger.isDebugEnabled()){
|
---|
139 | logger.debug("Added "+filename+" to filelist for SimpleFileCollection");
|
---|
140 | }
|
---|
141 | generatedFileList.add(filename);
|
---|
142 | filename = br.readLine();
|
---|
143 | }
|
---|
144 |
|
---|
145 | }catch(IOException ioe) {
|
---|
146 | logger.error("problem opening address list of files in SimpleFileCollectio: ",ioe);
|
---|
147 | }
|
---|
148 | FileList = new LinkedList<String>(generatedFileList);
|
---|
149 | firstList = new LinkedList<String>(generatedFileList);
|
---|
150 | createExtensionDocumentMapping();
|
---|
151 | }
|
---|
152 |
|
---|
153 |
|
---|
154 | /** Parses the properties <tt>indexing.simplefilecollection.extensionsparsers</tt>
|
---|
155 | * and <tt>indexing.simplefilecollection.defaultparser</tt> and attempts to load
|
---|
156 | * all the mentioned classes, in a hashtable mapping filename extension to their
|
---|
157 | * respective parsers. If <tt>indexing.simplefilecollection.defaultparser</tt>
|
---|
158 | * is set, then that class will be used to attempt to parse documents that no
|
---|
159 | * explicit parser is set. */
|
---|
160 | protected void createExtensionDocumentMapping()
|
---|
161 | {
|
---|
162 | String staticMappings = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers",
|
---|
163 | "txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument," +
|
---|
164 | "pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,"+
|
---|
165 | "doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument");
|
---|
166 | String defaultMapping = ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","");
|
---|
167 | if (staticMappings.length() > 0)
|
---|
168 | {
|
---|
169 | String[] mappings = staticMappings.split("\\s*,\\s*");
|
---|
170 | for(int i=0;i<mappings.length;i++)
|
---|
171 | {
|
---|
172 | if (mappings[i].indexOf(":") < 1)
|
---|
173 | continue;
|
---|
174 | String[] mapping = mappings[i].split(":");
|
---|
175 | if (mapping.length == 2 && mapping[0].length() > 0
|
---|
176 | && mapping[1].length() > 0)
|
---|
177 | {
|
---|
178 | if (mapping[1].indexOf(".") == -1)
|
---|
179 | mapping[1] = NAMESPACE_DOCUMENTS + mapping[1];
|
---|
180 | else if (mapping[1].startsWith("uk.ac.gla.terrier"))
|
---|
181 | mapping[1] = mapping[1].replaceAll("uk.ac.gla.terrier", "org.terrier");
|
---|
182 | try{
|
---|
183 | Class<? extends Document> d = Class.forName(mapping[1], false, this.getClass().getClassLoader()).asSubclass(Document.class);
|
---|
184 | extension_DocumentClass.put(mapping[0].toLowerCase(), d);
|
---|
185 | }catch (Exception e){
|
---|
186 | /*warning, just ignore */
|
---|
187 | logger.warn("Missing class " + mapping[1] + " for " +
|
---|
188 | mapping[0].toLowerCase() + " files.",e);
|
---|
189 | }
|
---|
190 | }
|
---|
191 | }
|
---|
192 | }
|
---|
193 | //set the mapping for the default parser
|
---|
194 | if (!defaultMapping.equals("")) {
|
---|
195 | if (defaultMapping.indexOf(".") == -1)
|
---|
196 | defaultMapping = NAMESPACE_DOCUMENTS + defaultMapping;
|
---|
197 | else if (defaultMapping.startsWith("uk.ac.gla.terrier"))
|
---|
198 | defaultMapping = defaultMapping.replaceAll("uk.ac.gla.terrier", "org.terrier");
|
---|
199 |
|
---|
200 | try{
|
---|
201 | Class<? extends Document> d = Class.forName(defaultMapping, false, this.getClass().getClassLoader()).asSubclass(Document.class);
|
---|
202 | extension_DocumentClass.put("|DEFAULT|", d);
|
---|
203 | }catch (Exception e){
|
---|
204 | logger.warn("Missing default class " + defaultMapping, e);
|
---|
205 | }
|
---|
206 | }
|
---|
207 | }
|
---|
208 |
|
---|
209 | /**
|
---|
210 | * Check whether there is a next document in the collection to be processed
|
---|
211 | * @return has next
|
---|
212 | */
|
---|
213 | public boolean hasNext() {
|
---|
214 | return ! endOfCollection();
|
---|
215 | }
|
---|
216 |
|
---|
217 | /**
|
---|
218 | * Move onto the next document in the collection to be processed.
|
---|
219 | * @return next document
|
---|
220 | */
|
---|
221 | public Document next()
|
---|
222 | {
|
---|
223 | nextDocument();
|
---|
224 | return getDocument();
|
---|
225 | }
|
---|
226 |
|
---|
227 | /**
|
---|
228 | * This is unsupported by this Collection implementation, and
|
---|
229 | * any calls will throw UnsupportedOperationException
|
---|
230 | * Throws UnsupportedOperationException on all invocations */
|
---|
231 | public void remove()
|
---|
232 | {
|
---|
233 | throw new UnsupportedOperationException("Iterator.remove() not supported");
|
---|
234 | }
|
---|
235 |
|
---|
236 | /**
|
---|
237 | * Move onto the next document in the collection to be processed.
|
---|
238 | * @return boolean true if there are more documents in the collection,
|
---|
239 | * otherwise return false.*/
|
---|
240 | public boolean nextDocument()
|
---|
241 | {
|
---|
242 | if (FileList.size() == 0)
|
---|
243 | return false;
|
---|
244 | boolean rtr = false;
|
---|
245 | thisFilename = null;
|
---|
246 | while(FileList.size() > 0 && ! rtr)
|
---|
247 | {
|
---|
248 | thisFilename = FileList.removeFirst();
|
---|
249 | logger.info("NEXT: "+thisFilename);
|
---|
250 |
|
---|
251 | if (! Files.exists(thisFilename) || ! Files.canRead(thisFilename) )
|
---|
252 | {
|
---|
253 | if (! Files.exists(thisFilename))
|
---|
254 | logger.warn("File doesn't exist: "+thisFilename);
|
---|
255 | else if (! Files.canRead(thisFilename) )
|
---|
256 | logger.warn("File cannot be read: "+thisFilename);
|
---|
257 | rtr = nextDocument();
|
---|
258 | }
|
---|
259 | else if (Files.isDirectory(thisFilename))
|
---|
260 | {
|
---|
261 | //we're allowed to recurse into directories
|
---|
262 | if(Recurse)
|
---|
263 | addDirectoryListing();
|
---|
264 | }
|
---|
265 | else
|
---|
266 | { //this file is fine - use it!
|
---|
267 | //this block ensures that DocId is only increased once per file
|
---|
268 | Docid++;
|
---|
269 | rtr = true;
|
---|
270 | }
|
---|
271 | }//loop ends
|
---|
272 | return rtr;
|
---|
273 | }
|
---|
274 | /**
|
---|
275 | * Return the current document in the collection.
|
---|
276 | * @return Document the next document object from the collection.
|
---|
277 | */
|
---|
278 | public Document getDocument()
|
---|
279 | {
|
---|
280 | InputStream in = null;
|
---|
281 | if (currentStream != null)
|
---|
282 | {
|
---|
283 | try{
|
---|
284 | currentStream.close();
|
---|
285 | currentStream = null;
|
---|
286 | }catch (IOException ioe) {
|
---|
287 | logger.warn("IOException while closing file being read", ioe);
|
---|
288 | }
|
---|
289 | }
|
---|
290 | if (thisFilename == null)
|
---|
291 | {
|
---|
292 | return null;
|
---|
293 | }
|
---|
294 | String filename = null;
|
---|
295 | try{
|
---|
296 | in = Files.openFileStream(thisFilename);
|
---|
297 | filename = thisFilename.replaceAll("\\.gz$","");
|
---|
298 | }catch(IOException ioe){
|
---|
299 | logger.warn("Problem reading "+thisFilename+" in "+
|
---|
300 | "SimpleFileCollection.getDocuent() : ",ioe);
|
---|
301 | }
|
---|
302 | currentStream = in;
|
---|
303 | return makeDocument(filename, in);
|
---|
304 |
|
---|
305 | }
|
---|
306 |
|
---|
307 |
|
---|
308 | /** Given the opened document in, of Filename and File f, work out which
|
---|
309 | * parser to try, and instantiate it. If you wish to use a different
|
---|
310 | * constructor for opening documents, then you need to subclass this method.
|
---|
311 | * @param Filename the filename of the currently open document
|
---|
312 | * @param in The stream of the currently open document
|
---|
313 | * @return Document object to parse the document, or null if no suitable parser
|
---|
314 | * exists.*/
|
---|
315 | protected Document makeDocument(String Filename, InputStream in)
|
---|
316 | {
|
---|
317 | if (Filename == null || in == null)
|
---|
318 | return null;
|
---|
319 | String[] splitStr = Filename.split("\\.");
|
---|
320 | String ext = splitStr[splitStr.length-1].toLowerCase();
|
---|
321 | Class<? extends Document> reader = extension_DocumentClass.get(ext);
|
---|
322 | Document rtr = null;
|
---|
323 |
|
---|
324 | /*If a document doesn't have an associated parser,
|
---|
325 | check the default one */
|
---|
326 | if (reader == null) {
|
---|
327 | reader = extension_DocumentClass.get("|DEFAULT|");
|
---|
328 | }
|
---|
329 | /*if there is no default parser, then tough luck for that file,
|
---|
330 | but it's ignored */
|
---|
331 | if (reader == null) {
|
---|
332 | logger.warn("No available parser for file " + Filename + ", file is ignored.");
|
---|
333 | return null;
|
---|
334 | }
|
---|
335 | logger.debug("Using "+reader.getName() + " to read "+ Filename);
|
---|
336 |
|
---|
337 | /* now attempt to instantiate the class */
|
---|
338 | try{
|
---|
339 | Map<String,String> docProperties = new HashMap<String,String>(5);
|
---|
340 | // [jmt12] I need the Docid in the Document instance
|
---|
341 | docProperties.put("docno", this.getDocid());
|
---|
342 | docProperties.put("filename", Filename);
|
---|
343 | //and instantiate
|
---|
344 | rtr = reader.getConstructor(InputStream.class, Map.class, Tokeniser.class).newInstance(in, docProperties, tokeniser);
|
---|
345 | indexedFiles.add(thisFilename);
|
---|
346 | }catch (OutOfMemoryError e){
|
---|
347 | logger.warn("Problem instantiating a document class; Out of memory error occured: ",e);
|
---|
348 | System.gc();
|
---|
349 | }catch (StackOverflowError e){
|
---|
350 | logger.warn("Problem instantiating a document class; Stack Overflow error occured: ",e);
|
---|
351 | }catch (Exception e){
|
---|
352 | logger.warn("Problem instantiating a document class: ",e);
|
---|
353 | }
|
---|
354 | rtr.getAllProperties().put("docno", this.getDocid());
|
---|
355 | return rtr;
|
---|
356 | }
|
---|
357 |
|
---|
358 | /**
|
---|
359 | * Checks whether there are more documents in the colection.
|
---|
360 | * @return boolean true if there are no more documents in the collection,
|
---|
361 | * otherwise it returns false.
|
---|
362 | */
|
---|
363 | public boolean endOfCollection()
|
---|
364 | {
|
---|
365 | return (FileList.size() == 0);
|
---|
366 | }
|
---|
367 |
|
---|
368 | /**
|
---|
369 | * Starts again from the beginning of the collection.
|
---|
370 | */
|
---|
371 | public void reset()
|
---|
372 | {
|
---|
373 | Docid = 0;
|
---|
374 | FileList = new LinkedList<String>(firstList);
|
---|
375 | indexedFiles = new ArrayList<String>();
|
---|
376 | }
|
---|
377 |
|
---|
378 | /**
|
---|
379 | * Returns the current document's identifier string.
|
---|
380 | * @return String the identifier of the current document.
|
---|
381 | */
|
---|
382 | public String getDocid()
|
---|
383 | {
|
---|
384 | return Docid+"";
|
---|
385 | }
|
---|
386 |
|
---|
387 | @Override
|
---|
388 | public void close()
|
---|
389 | {
|
---|
390 | if (currentStream != null)
|
---|
391 | {
|
---|
392 | try{
|
---|
393 | currentStream.close();
|
---|
394 | } catch (IOException ioe) {
|
---|
395 | logger.error("Exception occured while trying to close an IO stream",ioe);
|
---|
396 | }
|
---|
397 | }
|
---|
398 | }
|
---|
399 |
|
---|
400 | /** Returns the ist of indexed files in the order they were indexed in. */
|
---|
401 | public List<String> getFileList()
|
---|
402 | {
|
---|
403 | return indexedFiles;
|
---|
404 | }
|
---|
405 |
|
---|
406 | /** Called when <tt>thisFile</tt> is identified as a directory, this adds the entire
|
---|
407 | * contents of the directory onto the list to be processed. */
|
---|
408 | protected void addDirectoryListing()
|
---|
409 | {
|
---|
410 | //File[] contents = thisFile.listFiles();
|
---|
411 | String[] dirContents = Files.list( thisFilename );
|
---|
412 | if (dirContents == null)
|
---|
413 | return;
|
---|
414 | for(String e : dirContents)
|
---|
415 | {
|
---|
416 | if (e.equals(".") || e.equals(".."))
|
---|
417 | continue;
|
---|
418 | FileList.add(thisFilename + ApplicationSetup.FILE_SEPARATOR + e);
|
---|
419 | }
|
---|
420 | /*for(int i=0;i<contents.length;i++)
|
---|
421 | {
|
---|
422 | FileList.add(contents[i].getAbsolutePath());
|
---|
423 | }*/
|
---|
424 | }
|
---|
425 |
|
---|
426 | /**
|
---|
427 | * Simple test case. Pass the filename of a file that lists files
|
---|
428 | * to be processed to this test case.
|
---|
429 | */
|
---|
430 | public static void main(String[] args) {
|
---|
431 | Indexer in = new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
|
---|
432 | in.createDirectIndex(new Collection[] {new SimpleFileCollection(args[0])});
|
---|
433 | in.createInvertedIndex();
|
---|
434 | }
|
---|
435 |
|
---|
436 | }
|
---|