Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/SimpleFileCollection.java@ 26209

Last change on this file since 26209 was 26209, checked in by jmt12, 12 years ago
Gah - forgot I'd customized the SimpleFileCollection to ensure docno is available in the initial properties of the document (before it is indexed)
File size: 14.9 KB

Line
1	/*
2	* Terrier - Terabyte Retriever
3	* Webpage: http://terrier.org
4	* Contact: terrier{a.}dcs.gla.ac.uk
5	* University of Glasgow - School of Computing Science
6	* http://www.gla.ac.uk/
7	*
8	* The contents of this file are subject to the Mozilla Public License
9	* Version 1.1 (the "License"); you may not use this file except in
10	* compliance with the License. You may obtain a copy of the License at
11	* http://www.mozilla.org/MPL/
12	*
13	* Software distributed under the License is distributed on an "AS IS"
14	* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
15	* the License for the specific language governing rights and limitations
16	* under the License.
17	*
18	* The Original Code is SimpleFileCollection.java.
19	*
20	* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
21	* All Rights Reserved.
22	*
23	* Contributor(s):
24	* Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
25	* Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
26	*/
27	package org.terrier.indexing;
28	import java.io.BufferedReader;
29	import java.io.IOException;
30	import java.io.InputStream;
31	import java.util.ArrayList;
32	import java.util.HashMap;
33	import java.util.LinkedList;
34	import java.util.List;
35	import java.util.Map;
36
37	import org.apache.log4j.Logger;
38	import org.terrier.indexing.tokenisation.Tokeniser;
39	import org.terrier.utility.ApplicationSetup;
40	import org.terrier.utility.Files;
41	/**
42	* Implements a collection that can read arbitrary files on disk. It will
43	* use the file list given to it in the constructor, or it will read the
44	* file specified by the property <tt>collection.spec</tt>.
45	* <b>Properties:</b>
46	* <ul>
47	* <li><tt>indexing.simplefilecollection.extensionsparsers</tt> - a comma delimited lists of tuples, in the form "extension:DocumentClass".
48	* For instance, one tuple could be "txt:FileDocument". The default <tt>txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument,pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument</tt>.
49	* </li>
50	* <li><tt>indexing.simplefilecollection.defaultparser</tt> - the default parser for any unknown extensions.
51	* If this property is empty, then such documents will not be opened.</li>
52	* <li><tt>indexing.simplefilecollection.recurse</tt> - whether directories should be opened looking for
53	* files.</li>
54	* </ul>
55	* @author Craig Macdonald & Vassilis Plachouras
56	*/
57	public class SimpleFileCollection implements Collection/, DocumentExtractor/
58	{
59	protected static final Logger logger = Logger.getLogger(SimpleFileCollection.class);
60	/** The default namespace for all parsers to be loaded from. Only used if
61	* the class name specified does not contain any periods ('.') */
62	public final static String NAMESPACE_DOCUMENTS = "org.terrier.indexing.";
63
64	/** The list of files to index.*/
65	protected LinkedList<String> FileList = new LinkedList<String>();
66
67	/** Contains the list of files first handed to the SimpleFileCollection, allowing
68	* the SimpleFileCollection instance to be simply reset. */
69	protected List<String> firstList;
70
71	/** This is filled during traversal, so document IDs can be matched with filenames */
72	protected List<String> indexedFiles = new ArrayList<String>();
73
74	/** The identifier of a document in the collection.*/
75	protected int Docid = 0;
76
77	/** Whether directories should be recursed into by this class */
78	protected boolean Recurse = Boolean.parseBoolean(ApplicationSetup.getProperty("indexing.simplefilecollection.recurse", "true"));
79
80	/** Maps filename extensions to Document classes.
81	* The entry \|DEFAULT\| maps to the default document parser, specified
82	* by <tt>indexing.simplefilecollection.defaultparser</tt> */
83	protected Map<String,Class<? extends Document>> extension_DocumentClass = new HashMap<String,Class<? extends Document>>();
84
85	/** The filename of the current file we are processing. */
86	protected String thisFilename;
87
88	/** The InputStream of the most recently opened document. This
89	* is used to ensure that files are closed once they have been
90	* finished reading. */
91	protected InputStream currentStream = null;
92
93	protected Tokeniser tokeniser = Tokeniser.getTokeniser();
94
95	/**
96	* Constructs an instance of the class with the given list of files.
97	* @param filelist ArrayList the files to be processed by this collection.
98	*/
99	public SimpleFileCollection(List<String> filelist, boolean recurse) {
100	FileList = new LinkedList<String>(filelist);
101	//keep a backup copy for reset()
102	firstList = new LinkedList<String>(filelist);
103	createExtensionDocumentMapping();
104	}
105
106	/**
107	* A default constructor that uses the files to be processed
108	* by this collection, as specified by the property
109	* <tt>collection.spec</tt>
110	*/
111	public SimpleFileCollection()
112	{
113	this(ApplicationSetup.COLLECTION_SPEC);
114	}
115
116
117	/**
118	* Creates an instance of the class. The files to be processed are
119	* specified in the file with the given name.
120	* @param addressCollectionFilename String the name of the file that
121	* contains the list of files to be processed by this collecion.
122	*/
123	public SimpleFileCollection(String addressCollectionFilename)
124	{
125	ArrayList<String> generatedFileList = new ArrayList<String>();
126	try{
127	//opening the address_collection file
128	BufferedReader br = Files.openFileReader(addressCollectionFilename);
129	//iterate through each entry of the address_collection file
130	String filename = br.readLine();
131	while (filename != null) {
132	//if the line starts with #, then assume it is
133	//a comment and proceed to the next one
134	if (filename.startsWith("#")) {
135	filename = br.readLine();
136	continue;
137	}
138	if(logger.isDebugEnabled()){
139	logger.debug("Added "+filename+" to filelist for SimpleFileCollection");
140	}
141	generatedFileList.add(filename);
142	filename = br.readLine();
143	}
144
145	}catch(IOException ioe) {
146	logger.error("problem opening address list of files in SimpleFileCollectio: ",ioe);
147	}
148	FileList = new LinkedList<String>(generatedFileList);
149	firstList = new LinkedList<String>(generatedFileList);
150	createExtensionDocumentMapping();
151	}
152
153
154	/** Parses the properties <tt>indexing.simplefilecollection.extensionsparsers</tt>
155	* and <tt>indexing.simplefilecollection.defaultparser</tt> and attempts to load
156	* all the mentioned classes, in a hashtable mapping filename extension to their
157	* respective parsers. If <tt>indexing.simplefilecollection.defaultparser</tt>
158	* is set, then that class will be used to attempt to parse documents that no
159	* explicit parser is set. */
160	protected void createExtensionDocumentMapping()
161	{
162	String staticMappings = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers",
163	"txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument," +
164	"pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,"+
165	"doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument");
166	String defaultMapping = ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","");
167	if (staticMappings.length() > 0)
168	{
169	String[] mappings = staticMappings.split("\\s,\\s");
170	for(int i=0;i<mappings.length;i++)
171	{
172	if (mappings[i].indexOf(":") < 1)
173	continue;
174	String[] mapping = mappings[i].split(":");
175	if (mapping.length == 2 && mapping[0].length() > 0
176	&& mapping[1].length() > 0)
177	{
178	if (mapping[1].indexOf(".") == -1)
179	mapping[1] = NAMESPACE_DOCUMENTS + mapping[1];
180	else if (mapping[1].startsWith("uk.ac.gla.terrier"))
181	mapping[1] = mapping[1].replaceAll("uk.ac.gla.terrier", "org.terrier");
182	try{
183	Class<? extends Document> d = Class.forName(mapping[1], false, this.getClass().getClassLoader()).asSubclass(Document.class);
184	extension_DocumentClass.put(mapping[0].toLowerCase(), d);
185	}catch (Exception e){
186	/warning, just ignore /
187	logger.warn("Missing class " + mapping[1] + " for " +
188	mapping[0].toLowerCase() + " files.",e);
189	}
190	}
191	}
192	}
193	//set the mapping for the default parser
194	if (!defaultMapping.equals("")) {
195	if (defaultMapping.indexOf(".") == -1)
196	defaultMapping = NAMESPACE_DOCUMENTS + defaultMapping;
197	else if (defaultMapping.startsWith("uk.ac.gla.terrier"))
198	defaultMapping = defaultMapping.replaceAll("uk.ac.gla.terrier", "org.terrier");
199
200	try{
201	Class<? extends Document> d = Class.forName(defaultMapping, false, this.getClass().getClassLoader()).asSubclass(Document.class);
202	extension_DocumentClass.put("\|DEFAULT\|", d);
203	}catch (Exception e){
204	logger.warn("Missing default class " + defaultMapping, e);
205	}
206	}
207	}
208
209	/**
210	* Check whether there is a next document in the collection to be processed
211	* @return has next
212	*/
213	public boolean hasNext() {
214	return ! endOfCollection();
215	}
216
217	/**
218	* Move onto the next document in the collection to be processed.
219	* @return next document
220	*/
221	public Document next()
222	{
223	nextDocument();
224	return getDocument();
225	}
226
227	/**
228	* This is unsupported by this Collection implementation, and
229	* any calls will throw UnsupportedOperationException
230	* Throws UnsupportedOperationException on all invocations */
231	public void remove()
232	{
233	throw new UnsupportedOperationException("Iterator.remove() not supported");
234	}
235
236	/**
237	* Move onto the next document in the collection to be processed.
238	* @return boolean true if there are more documents in the collection,
239	* otherwise return false.*/
240	public boolean nextDocument()
241	{
242	if (FileList.size() == 0)
243	return false;
244	boolean rtr = false;
245	thisFilename = null;
246	while(FileList.size() > 0 && ! rtr)
247	{
248	thisFilename = FileList.removeFirst();
249	logger.info("NEXT: "+thisFilename);
250
251	if (! Files.exists(thisFilename) \|\| ! Files.canRead(thisFilename) )
252	{
253	if (! Files.exists(thisFilename))
254	logger.warn("File doesn't exist: "+thisFilename);
255	else if (! Files.canRead(thisFilename) )
256	logger.warn("File cannot be read: "+thisFilename);
257	rtr = nextDocument();
258	}
259	else if (Files.isDirectory(thisFilename))
260	{
261	//we're allowed to recurse into directories
262	if(Recurse)
263	addDirectoryListing();
264	}
265	else
266	{ //this file is fine - use it!
267	//this block ensures that DocId is only increased once per file
268	Docid++;
269	rtr = true;
270	}
271	}//loop ends
272	return rtr;
273	}
274	/**
275	* Return the current document in the collection.
276	* @return Document the next document object from the collection.
277	*/
278	public Document getDocument()
279	{
280	InputStream in = null;
281	if (currentStream != null)
282	{
283	try{
284	currentStream.close();
285	currentStream = null;
286	}catch (IOException ioe) {
287	logger.warn("IOException while closing file being read", ioe);
288	}
289	}
290	if (thisFilename == null)
291	{
292	return null;
293	}
294	String filename = null;
295	try{
296	in = Files.openFileStream(thisFilename);
297	filename = thisFilename.replaceAll("\\.gz$","");
298	}catch(IOException ioe){
299	logger.warn("Problem reading "+thisFilename+" in "+
300	"SimpleFileCollection.getDocuent() : ",ioe);
301	}
302	currentStream = in;
303	return makeDocument(filename, in);
304
305	}
306
307
308	/** Given the opened document in, of Filename and File f, work out which
309	* parser to try, and instantiate it. If you wish to use a different
310	* constructor for opening documents, then you need to subclass this method.
311	* @param Filename the filename of the currently open document
312	* @param in The stream of the currently open document
313	* @return Document object to parse the document, or null if no suitable parser
314	* exists.*/
315	protected Document makeDocument(String Filename, InputStream in)
316	{
317	if (Filename == null \|\| in == null)
318	return null;
319	String[] splitStr = Filename.split("\\.");
320	String ext = splitStr[splitStr.length-1].toLowerCase();
321	Class<? extends Document> reader = extension_DocumentClass.get(ext);
322	Document rtr = null;
323
324	/*If a document doesn't have an associated parser,
325	check the default one */
326	if (reader == null) {
327	reader = extension_DocumentClass.get("\|DEFAULT\|");
328	}
329	/*if there is no default parser, then tough luck for that file,
330	but it's ignored */
331	if (reader == null) {
332	logger.warn("No available parser for file " + Filename + ", file is ignored.");
333	return null;
334	}
335	logger.debug("Using "+reader.getName() + " to read "+ Filename);
336
337	/* now attempt to instantiate the class */
338	try{
339	Map<String,String> docProperties = new HashMap<String,String>(5);
340	// [jmt12] I need the Docid in the Document instance
341	docProperties.put("docno", this.getDocid());
342	docProperties.put("filename", Filename);
343	//and instantiate
344	rtr = reader.getConstructor(InputStream.class, Map.class, Tokeniser.class).newInstance(in, docProperties, tokeniser);
345	indexedFiles.add(thisFilename);
346	}catch (OutOfMemoryError e){
347	logger.warn("Problem instantiating a document class; Out of memory error occured: ",e);
348	System.gc();
349	}catch (StackOverflowError e){
350	logger.warn("Problem instantiating a document class; Stack Overflow error occured: ",e);
351	}catch (Exception e){
352	logger.warn("Problem instantiating a document class: ",e);
353	}
354	rtr.getAllProperties().put("docno", this.getDocid());
355	return rtr;
356	}
357
358	/**
359	* Checks whether there are more documents in the colection.
360	* @return boolean true if there are no more documents in the collection,
361	* otherwise it returns false.
362	*/
363	public boolean endOfCollection()
364	{
365	return (FileList.size() == 0);
366	}
367
368	/**
369	* Starts again from the beginning of the collection.
370	*/
371	public void reset()
372	{
373	Docid = 0;
374	FileList = new LinkedList<String>(firstList);
375	indexedFiles = new ArrayList<String>();
376	}
377
378	/**
379	* Returns the current document's identifier string.
380	* @return String the identifier of the current document.
381	*/
382	public String getDocid()
383	{
384	return Docid+"";
385	}
386
387	@Override
388	public void close()
389	{
390	if (currentStream != null)
391	{
392	try{
393	currentStream.close();
394	} catch (IOException ioe) {
395	logger.error("Exception occured while trying to close an IO stream",ioe);
396	}
397	}
398	}
399
400	/** Returns the ist of indexed files in the order they were indexed in. */
401	public List<String> getFileList()
402	{
403	return indexedFiles;
404	}
405
406	/** Called when <tt>thisFile</tt> is identified as a directory, this adds the entire
407	* contents of the directory onto the list to be processed. */
408	protected void addDirectoryListing()
409	{
410	//File[] contents = thisFile.listFiles();
411	String[] dirContents = Files.list( thisFilename );
412	if (dirContents == null)
413	return;
414	for(String e : dirContents)
415	{
416	if (e.equals(".") \|\| e.equals(".."))
417	continue;
418	FileList.add(thisFilename + ApplicationSetup.FILE_SEPARATOR + e);
419	}
420	/*for(int i=0;i<contents.length;i++)
421	{
422	FileList.add(contents[i].getAbsolutePath());
423	}*/
424	}
425
426	/**
427	* Simple test case. Pass the filename of a file that lists files
428	* to be processed to this test case.
429	*/
430	public static void main(String[] args) {
431	Indexer in = new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
432	in.createDirectIndex(new Collection[] {new SimpleFileCollection(args[0])});
433	in.createInvertedIndex();
434	}
435
436	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: