Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: gs2-extensions/video-and-audio/trunk/src/opt/Terrier/SimpleFileCollection.java@ 26209

Last change on this file since 26209 was 26209, checked in by jmt12, 12 years ago
Gah - forgot I'd customized the SimpleFileCollection to ensure docno is available in the initial properties of the document (before it is indexed)
File size: 14.9 KB

Rev	Line
[26209]	1	/*
	2	* Terrier - Terabyte Retriever
	3	* Webpage: http://terrier.org
	4	* Contact: terrier{a.}dcs.gla.ac.uk
	5	* University of Glasgow - School of Computing Science
	6	* http://www.gla.ac.uk/
	7	*
	8	* The contents of this file are subject to the Mozilla Public License
	9	* Version 1.1 (the "License"); you may not use this file except in
	10	* compliance with the License. You may obtain a copy of the License at
	11	* http://www.mozilla.org/MPL/
	12	*
	13	* Software distributed under the License is distributed on an "AS IS"
	14	* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
	15	* the License for the specific language governing rights and limitations
	16	* under the License.
	17	*
	18	* The Original Code is SimpleFileCollection.java.
	19	*
	20	* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
	21	* All Rights Reserved.
	22	*
	23	* Contributor(s):
	24	* Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
	25	* Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
	26	*/
	27	package org.terrier.indexing;
	28	import java.io.BufferedReader;
	29	import java.io.IOException;
	30	import java.io.InputStream;
	31	import java.util.ArrayList;
	32	import java.util.HashMap;
	33	import java.util.LinkedList;
	34	import java.util.List;
	35	import java.util.Map;
	36
	37	import org.apache.log4j.Logger;
	38	import org.terrier.indexing.tokenisation.Tokeniser;
	39	import org.terrier.utility.ApplicationSetup;
	40	import org.terrier.utility.Files;
	41	/**
	42	* Implements a collection that can read arbitrary files on disk. It will
	43	* use the file list given to it in the constructor, or it will read the
	44	* file specified by the property <tt>collection.spec</tt>.
	45	* <b>Properties:</b>
	46	* <ul>
	47	* <li><tt>indexing.simplefilecollection.extensionsparsers</tt> - a comma delimited lists of tuples, in the form "extension:DocumentClass".
	48	* For instance, one tuple could be "txt:FileDocument". The default <tt>txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument,pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument</tt>.
	49	* </li>
	50	* <li><tt>indexing.simplefilecollection.defaultparser</tt> - the default parser for any unknown extensions.
	51	* If this property is empty, then such documents will not be opened.</li>
	52	* <li><tt>indexing.simplefilecollection.recurse</tt> - whether directories should be opened looking for
	53	* files.</li>
	54	* </ul>
	55	* @author Craig Macdonald & Vassilis Plachouras
	56	*/
	57	public class SimpleFileCollection implements Collection/, DocumentExtractor/
	58	{
	59	protected static final Logger logger = Logger.getLogger(SimpleFileCollection.class);
	60	/** The default namespace for all parsers to be loaded from. Only used if
	61	* the class name specified does not contain any periods ('.') */
	62	public final static String NAMESPACE_DOCUMENTS = "org.terrier.indexing.";
	63
	64	/** The list of files to index.*/
	65	protected LinkedList<String> FileList = new LinkedList<String>();
	66
	67	/** Contains the list of files first handed to the SimpleFileCollection, allowing
	68	* the SimpleFileCollection instance to be simply reset. */
	69	protected List<String> firstList;
	70
	71	/** This is filled during traversal, so document IDs can be matched with filenames */
	72	protected List<String> indexedFiles = new ArrayList<String>();
	73
	74	/** The identifier of a document in the collection.*/
	75	protected int Docid = 0;
	76
	77	/** Whether directories should be recursed into by this class */
	78	protected boolean Recurse = Boolean.parseBoolean(ApplicationSetup.getProperty("indexing.simplefilecollection.recurse", "true"));
	79
	80	/** Maps filename extensions to Document classes.
	81	* The entry \|DEFAULT\| maps to the default document parser, specified
	82	* by <tt>indexing.simplefilecollection.defaultparser</tt> */
	83	protected Map<String,Class<? extends Document>> extension_DocumentClass = new HashMap<String,Class<? extends Document>>();
	84
	85	/** The filename of the current file we are processing. */
	86	protected String thisFilename;
	87
	88	/** The InputStream of the most recently opened document. This
	89	* is used to ensure that files are closed once they have been
	90	* finished reading. */
	91	protected InputStream currentStream = null;
	92
	93	protected Tokeniser tokeniser = Tokeniser.getTokeniser();
	94
	95	/**
	96	* Constructs an instance of the class with the given list of files.
	97	* @param filelist ArrayList the files to be processed by this collection.
	98	*/
	99	public SimpleFileCollection(List<String> filelist, boolean recurse) {
	100	FileList = new LinkedList<String>(filelist);
	101	//keep a backup copy for reset()
	102	firstList = new LinkedList<String>(filelist);
	103	createExtensionDocumentMapping();
	104	}
	105
	106	/**
	107	* A default constructor that uses the files to be processed
	108	* by this collection, as specified by the property
	109	* <tt>collection.spec</tt>
	110	*/
	111	public SimpleFileCollection()
	112	{
	113	this(ApplicationSetup.COLLECTION_SPEC);
	114	}
	115
	116
	117	/**
	118	* Creates an instance of the class. The files to be processed are
	119	* specified in the file with the given name.
	120	* @param addressCollectionFilename String the name of the file that
	121	* contains the list of files to be processed by this collecion.
	122	*/
	123	public SimpleFileCollection(String addressCollectionFilename)
	124	{
	125	ArrayList<String> generatedFileList = new ArrayList<String>();
	126	try{
	127	//opening the address_collection file
	128	BufferedReader br = Files.openFileReader(addressCollectionFilename);
	129	//iterate through each entry of the address_collection file
	130	String filename = br.readLine();
	131	while (filename != null) {
	132	//if the line starts with #, then assume it is
	133	//a comment and proceed to the next one
	134	if (filename.startsWith("#")) {
	135	filename = br.readLine();
	136	continue;
	137	}
	138	if(logger.isDebugEnabled()){
	139	logger.debug("Added "+filename+" to filelist for SimpleFileCollection");
	140	}
	141	generatedFileList.add(filename);
	142	filename = br.readLine();
	143	}
	144
	145	}catch(IOException ioe) {
	146	logger.error("problem opening address list of files in SimpleFileCollectio: ",ioe);
	147	}
	148	FileList = new LinkedList<String>(generatedFileList);
	149	firstList = new LinkedList<String>(generatedFileList);
	150	createExtensionDocumentMapping();
	151	}
	152
	153
	154	/** Parses the properties <tt>indexing.simplefilecollection.extensionsparsers</tt>
	155	* and <tt>indexing.simplefilecollection.defaultparser</tt> and attempts to load
	156	* all the mentioned classes, in a hashtable mapping filename extension to their
	157	* respective parsers. If <tt>indexing.simplefilecollection.defaultparser</tt>
	158	* is set, then that class will be used to attempt to parse documents that no
	159	* explicit parser is set. */
	160	protected void createExtensionDocumentMapping()
	161	{
	162	String staticMappings = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers",
	163	"txt:FileDocument,text:FileDocument,tex:FileDocument,bib:FileDocument," +
	164	"pdf:PDFDocument,html:TaggedDocument,htm:TaggedDocument,xhtml:TaggedDocument,xml:TaggedDocument,"+
	165	"doc:MSWordDocument,ppt:MSPowerpointDocument,xls:MSExcelDocument");
	166	String defaultMapping = ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","");
	167	if (staticMappings.length() > 0)
	168	{
	169	String[] mappings = staticMappings.split("\\s,\\s");
	170	for(int i=0;i<mappings.length;i++)
	171	{
	172	if (mappings[i].indexOf(":") < 1)
	173	continue;
	174	String[] mapping = mappings[i].split(":");
	175	if (mapping.length == 2 && mapping[0].length() > 0
	176	&& mapping[1].length() > 0)
	177	{
	178	if (mapping[1].indexOf(".") == -1)
	179	mapping[1] = NAMESPACE_DOCUMENTS + mapping[1];
	180	else if (mapping[1].startsWith("uk.ac.gla.terrier"))
	181	mapping[1] = mapping[1].replaceAll("uk.ac.gla.terrier", "org.terrier");
	182	try{
	183	Class<? extends Document> d = Class.forName(mapping[1], false, this.getClass().getClassLoader()).asSubclass(Document.class);
	184	extension_DocumentClass.put(mapping[0].toLowerCase(), d);
	185	}catch (Exception e){
	186	/warning, just ignore /
	187	logger.warn("Missing class " + mapping[1] + " for " +
	188	mapping[0].toLowerCase() + " files.",e);
	189	}
	190	}
	191	}
	192	}
	193	//set the mapping for the default parser
	194	if (!defaultMapping.equals("")) {
	195	if (defaultMapping.indexOf(".") == -1)
	196	defaultMapping = NAMESPACE_DOCUMENTS + defaultMapping;
	197	else if (defaultMapping.startsWith("uk.ac.gla.terrier"))
	198	defaultMapping = defaultMapping.replaceAll("uk.ac.gla.terrier", "org.terrier");
	199
	200	try{
	201	Class<? extends Document> d = Class.forName(defaultMapping, false, this.getClass().getClassLoader()).asSubclass(Document.class);
	202	extension_DocumentClass.put("\|DEFAULT\|", d);
	203	}catch (Exception e){
	204	logger.warn("Missing default class " + defaultMapping, e);
	205	}
	206	}
	207	}
	208
	209	/**
	210	* Check whether there is a next document in the collection to be processed
	211	* @return has next
	212	*/
	213	public boolean hasNext() {
	214	return ! endOfCollection();
	215	}
	216
	217	/**
	218	* Move onto the next document in the collection to be processed.
	219	* @return next document
	220	*/
	221	public Document next()
	222	{
	223	nextDocument();
	224	return getDocument();
	225	}
	226
	227	/**
	228	* This is unsupported by this Collection implementation, and
	229	* any calls will throw UnsupportedOperationException
	230	* Throws UnsupportedOperationException on all invocations */
	231	public void remove()
	232	{
	233	throw new UnsupportedOperationException("Iterator.remove() not supported");
	234	}
	235
	236	/**
	237	* Move onto the next document in the collection to be processed.
	238	* @return boolean true if there are more documents in the collection,
	239	* otherwise return false.*/
	240	public boolean nextDocument()
	241	{
	242	if (FileList.size() == 0)
	243	return false;
	244	boolean rtr = false;
	245	thisFilename = null;
	246	while(FileList.size() > 0 && ! rtr)
	247	{
	248	thisFilename = FileList.removeFirst();
	249	logger.info("NEXT: "+thisFilename);
	250
	251	if (! Files.exists(thisFilename) \|\| ! Files.canRead(thisFilename) )
	252	{
	253	if (! Files.exists(thisFilename))
	254	logger.warn("File doesn't exist: "+thisFilename);
	255	else if (! Files.canRead(thisFilename) )
	256	logger.warn("File cannot be read: "+thisFilename);
	257	rtr = nextDocument();
	258	}
	259	else if (Files.isDirectory(thisFilename))
	260	{
	261	//we're allowed to recurse into directories
	262	if(Recurse)
	263	addDirectoryListing();
	264	}
	265	else
	266	{ //this file is fine - use it!
	267	//this block ensures that DocId is only increased once per file
	268	Docid++;
	269	rtr = true;
	270	}
	271	}//loop ends
	272	return rtr;
	273	}
	274	/**
	275	* Return the current document in the collection.
	276	* @return Document the next document object from the collection.
	277	*/
	278	public Document getDocument()
	279	{
	280	InputStream in = null;
	281	if (currentStream != null)
	282	{
	283	try{
	284	currentStream.close();
	285	currentStream = null;
	286	}catch (IOException ioe) {
	287	logger.warn("IOException while closing file being read", ioe);
	288	}
	289	}
	290	if (thisFilename == null)
	291	{
	292	return null;
	293	}
	294	String filename = null;
	295	try{
	296	in = Files.openFileStream(thisFilename);
	297	filename = thisFilename.replaceAll("\\.gz$","");
	298	}catch(IOException ioe){
	299	logger.warn("Problem reading "+thisFilename+" in "+
	300	"SimpleFileCollection.getDocuent() : ",ioe);
	301	}
	302	currentStream = in;
	303	return makeDocument(filename, in);
	304
	305	}
	306
	307
	308	/** Given the opened document in, of Filename and File f, work out which
	309	* parser to try, and instantiate it. If you wish to use a different
	310	* constructor for opening documents, then you need to subclass this method.
	311	* @param Filename the filename of the currently open document
	312	* @param in The stream of the currently open document
	313	* @return Document object to parse the document, or null if no suitable parser
	314	* exists.*/
	315	protected Document makeDocument(String Filename, InputStream in)
	316	{
	317	if (Filename == null \|\| in == null)
	318	return null;
	319	String[] splitStr = Filename.split("\\.");
	320	String ext = splitStr[splitStr.length-1].toLowerCase();
	321	Class<? extends Document> reader = extension_DocumentClass.get(ext);
	322	Document rtr = null;
	323
	324	/*If a document doesn't have an associated parser,
	325	check the default one */
	326	if (reader == null) {
	327	reader = extension_DocumentClass.get("\|DEFAULT\|");
	328	}
	329	/*if there is no default parser, then tough luck for that file,
	330	but it's ignored */
	331	if (reader == null) {
	332	logger.warn("No available parser for file " + Filename + ", file is ignored.");
	333	return null;
	334	}
	335	logger.debug("Using "+reader.getName() + " to read "+ Filename);
	336
	337	/* now attempt to instantiate the class */
	338	try{
	339	Map<String,String> docProperties = new HashMap<String,String>(5);
	340	// [jmt12] I need the Docid in the Document instance
	341	docProperties.put("docno", this.getDocid());
	342	docProperties.put("filename", Filename);
	343	//and instantiate
	344	rtr = reader.getConstructor(InputStream.class, Map.class, Tokeniser.class).newInstance(in, docProperties, tokeniser);
	345	indexedFiles.add(thisFilename);
	346	}catch (OutOfMemoryError e){
	347	logger.warn("Problem instantiating a document class; Out of memory error occured: ",e);
	348	System.gc();
	349	}catch (StackOverflowError e){
	350	logger.warn("Problem instantiating a document class; Stack Overflow error occured: ",e);
	351	}catch (Exception e){
	352	logger.warn("Problem instantiating a document class: ",e);
	353	}
	354	rtr.getAllProperties().put("docno", this.getDocid());
	355	return rtr;
	356	}
	357
	358	/**
	359	* Checks whether there are more documents in the colection.
	360	* @return boolean true if there are no more documents in the collection,
	361	* otherwise it returns false.
	362	*/
	363	public boolean endOfCollection()
	364	{
	365	return (FileList.size() == 0);
	366	}
	367
	368	/**
	369	* Starts again from the beginning of the collection.
	370	*/
	371	public void reset()
	372	{
	373	Docid = 0;
	374	FileList = new LinkedList<String>(firstList);
	375	indexedFiles = new ArrayList<String>();
	376	}
	377
	378	/**
	379	* Returns the current document's identifier string.
	380	* @return String the identifier of the current document.
	381	*/
	382	public String getDocid()
	383	{
	384	return Docid+"";
	385	}
	386
	387	@Override
	388	public void close()
	389	{
	390	if (currentStream != null)
	391	{
	392	try{
	393	currentStream.close();
	394	} catch (IOException ioe) {
	395	logger.error("Exception occured while trying to close an IO stream",ioe);
	396	}
	397	}
	398	}
	399
	400	/** Returns the ist of indexed files in the order they were indexed in. */
	401	public List<String> getFileList()
	402	{
	403	return indexedFiles;
	404	}
	405
	406	/** Called when <tt>thisFile</tt> is identified as a directory, this adds the entire
	407	* contents of the directory onto the list to be processed. */
	408	protected void addDirectoryListing()
	409	{
	410	//File[] contents = thisFile.listFiles();
	411	String[] dirContents = Files.list( thisFilename );
	412	if (dirContents == null)
	413	return;
	414	for(String e : dirContents)
	415	{
	416	if (e.equals(".") \|\| e.equals(".."))
	417	continue;
	418	FileList.add(thisFilename + ApplicationSetup.FILE_SEPARATOR + e);
	419	}
	420	/*for(int i=0;i<contents.length;i++)
	421	{
	422	FileList.add(contents[i].getAbsolutePath());
	423	}*/
	424	}
	425
	426	/**
	427	* Simple test case. Pass the filename of a file that lists files
	428	* to be processed to this test case.
	429	*/
	430	public static void main(String[] args) {
	431	Indexer in = new BasicIndexer(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
	432	in.createDirectIndex(new Collection[] {new SimpleFileCollection(args[0])});
	433	in.createInvertedIndex();
	434	}
	435
	436	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: