package org.terrier.applications;

import java.io.BufferedReader;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.log4j.Logger;
import org.terrier.indexing.BlockIndexer;
import org.terrier.indexing.BlockSinglePassIndexer;
import org.terrier.indexing.Collection;
import org.terrier.indexing.Indexer;
import org.terrier.indexing.SimpleFileCollection;
import org.terrier.structures.CollectionStatistics;
import org.terrier.structures.Index;
import org.terrier.structures.merging.StructureMerger;
import org.terrier.utility.ApplicationSetup;

/** @class FileIndexer
 *
 *  A simple indexer, based on DesktopTerrier, that is intended to build a
 *  searchable index from the contents of a directory in the file system. This
 *  indexer has been created with parallel processing in mind, in that you can
 *  split a large or computationally complex directory into several batches
 *  (the contents of which are recorded by manifest files) and then process
 *  each batch independently (and in parallel on a multicore machine or a
 *  cluster). Once this is complete you can then merge each pair of indexes
 *  together - eventually generating a final index encompassing all of the
 *  batches.
 *
 *  Start by preparing the collection for indexing by locating indexable files
 *  and recording them in one or more manifest files, i.e.:
 *
 *    ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
 *      -path /data/mycollection/ -batchsize 10
 *
 *  You then build the uniquely named index (in this case with a prefix of
 *  '000') for each manifest, i.e.:
 *
 *    ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
 *      -path /terrier/var/manifest-000.spec -prefix 000
 *
 *  If there is more than one index (for example, with prefixes '000' and
 *  '001'), you can then merge them (into a single index with the prefix
 *  'data') using this command:
 *
 *    ./bin/anyclass.sh \
 *      org.terrier.applications.structures.merging.BlockStructureMerger \
 *      /terrier/var/index/ 000 /terrier/var/index/ 001 \
 *      /terrier/var/index/ data
 *
 *  Alternatively, the -merge mode of this class merges every index found in
 *  Terrier's index directory (other than the default one) pairwise into the
 *  default index.
 */
public class FileIndexer
{
  /** Prefix naming the index files that runIndex() builds. */
  private String index_prefix;

  /** File extensions that have a parser configured in Terrier's properties. */
  private HashSet<String> supported_extensions;

  /** Paths of the files to index, as loaded by loadManifest(). */
  private List<String> file_list;

  protected static final Logger logger = Logger.getLogger(FileIndexer.class);

  /** @function FileIndexer
   *  Creates an indexer that builds an index with the given prefix, sets the
   *  Terrier properties this application relies on, and records which file
   *  extensions the configured extension parsers support.
   *
   *  @param index_prefix prefix of the index files built by runIndex()
   */
  public FileIndexer(String index_prefix)
  {
    logger.info("FileIndexer::FileIndexer(" + index_prefix + ")");
    this.index_prefix = index_prefix;
    this.supported_extensions = new HashSet<String>();
    this.file_list = new ArrayList<String>();

    // setting properties for the application
    // - the forward meta keys are only defaulted if the user hasn't already
    //   configured them
    if ((ApplicationSetup.getProperty("indexer.meta.forward.keys", null)) == null)
    {
      ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
      ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
    }
    ApplicationSetup.setProperty("indexing.max.tokens", "10000");
    ApplicationSetup.setProperty("invertedfile.processterms","25000");
    ApplicationSetup.setProperty("ignore.low.idf.terms","false");
    ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");

    // we need to know what extensions are supported by the indexer
    String extension_parsers_raw =
      ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument").trim();

    // parse up the string, and store the extensions (only) in the HashSet
    if (extension_parsers_raw.length() > 0)
    {
      // for each parser, of the form: <extension>:<document class>
      for (String extension_parser : extension_parsers_raw.split("\\s*,\\s*"))
      {
        String[] mapping = extension_parser.split(":");
        // a well-formed mapping has an extension and a document class
        if (mapping.length == 2)
        {
          // we store just the extension, leaving the actual mapping up to
          // SimpleFileCollection
          this.supported_extensions.add(mapping[0]);
        }
        else
        {
          logger.warn("Malformed extension parser mapping: " + extension_parser);
        }
      }
    }
    else
    {
      logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
    }
  }
  /** FileIndexer() **/

  /** @function canIndex
   *  Given a path to a file, determine if Terrier's current configuration
   *  allows that file to be indexed.
   *
   *  @param file_path path (or bare name) of the file to test
   *  @return true if a default parser is configured, or if the text after
   *          the last '.' in file_path is a supported extension
   */
  public boolean canIndex(String file_path)
  {
    logger.info("FileIndexer::canIndex(" + file_path + ")");
    // we may have specified a default Document class for all files encountered
    if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
    {
      return true;
    }
    // otherwise, check through the list of supported file extensions to see
    // if one matches this file
    // - get the file's extension (the whole name when there is no '.')
    String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1);
    // - see if it exists in the set of supported extensions
    return this.supported_extensions.contains(file_extension);
  }
  /** canIndex(String) **/

  /** @function close
   *  Releases the file list and the extension set. Calling close() more than
   *  once is harmless.
   */
  public void close()
  {
    logger.info("FileIndexer::close()");
    if (this.file_list != null)
    {
      this.file_list.clear();
      this.file_list = null;
    }
    if (this.supported_extensions != null)
    {
      this.supported_extensions.clear();
      this.supported_extensions = null;
    }
  }
  /** close() **/

  /** @function deleteIndex
   *  Deletes every file in Terrier's index directory that belongs to the
   *  index with the given prefix, i.e. whose name starts with the prefix
   *  followed by a '.'. Matching on the separator (as renameIndex() does)
   *  prevents deleting index "1" from also removing the files of index "10".
   *
   *  @param prefix prefix of the index whose files are to be removed
   */
  public void deleteIndex(String prefix)
  {
    logger.info("FileIndexer::deleteIndex(" + prefix + ")");
    String match_prefix = prefix + ".";
    for (File a_file : listIndexDirectory())
    {
      if (a_file.getName().startsWith(match_prefix) && !a_file.delete())
      {
        logger.warn("Failed to delete index file: " + a_file);
      }
    }
  }
  /** deleteIndex(String) **/

  /** @function listIndexes
   *  Lists the distinct index prefixes found in Terrier's index directory,
   *  where a prefix is the part of a file name before its first '.'. Files
   *  with no '.', or whose name starts with one, are ignored.
   *
   *  @param include_default when false, files whose name starts with the
   *         default index prefix (property terrier.index.prefix, 'data' if
   *         unset) are skipped
   *  @return the prefixes found, in no particular order
   */
  public ArrayDeque<String> listIndexes(boolean include_default)
  {
    logger.info("FileIndexer::listIndexes(" + include_default + ")");
    String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
    // we start by populating a set structure to ensure each prefix only occurs
    // once
    HashSet<String> index_parts_set = new HashSet<String>();
    for (File a_file : listIndexDirectory())
    {
      String file_name = a_file.getName();
      int dot_position = file_name.indexOf(".");
      // no usable prefix - can't be one of an index's files
      if (dot_position <= 0)
      {
        continue;
      }
      if (include_default || !file_name.startsWith(default_index_prefix))
      {
        index_parts_set.add(file_name.substring(0, dot_position));
      }
    }
    // we then turn the hashset into a nice list (in this case a deque)
    return new ArrayDeque<String>(index_parts_set);
  }
  /** listIndexes(boolean) **/

  /** @function loadManifest
   *  Appends each non-empty line of the manifest file to the list of files
   *  to index. Read errors are logged rather than thrown; lines read before
   *  the error remain in the list.
   *
   *  @param manifest_path path to a manifest file holding one file path per
   *         line (as written by the prepare mode of main())
   */
  public void loadManifest(Path manifest_path)
  {
    logger.info("FileIndexer::loadManifest(" + manifest_path.toString() + ")");
    // try-with-resources so the reader is closed even if reading fails
    try (BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile()))))
    {
      String line;
      while ((line = manifest_reader.readLine()) != null)
      {
        if (line.length() > 0)
        {
          this.file_list.add(line);
        }
      }
    }
    catch (IOException e)
    {
      logger.error("Exception when reading manifest! " + e);
    }
  }
  /** loadManifest(Path) **/

  /** @function mergeIndexes
   *  Merges two indexes into the default index (property
   *  terrier.index.prefix, 'data' if unset).
   *
   *  @param prefix_one prefix of the first index to merge
   *  @param prefix_two prefix of the second index to merge
   *  @return true if the merge completed and the indexes were closed
   */
  public boolean mergeIndexes(String prefix_one, String prefix_two)
  {
    logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");
    String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data");
    return this.mergeIndexes(prefix_one, prefix_two, prefix_default);
  }
  /** mergeIndexes(String, String) **/

  /** @function mergeIndexes
   *  Merges two indexes in Terrier's index directory into a new index using
   *  StructureMerger, logs the statistics of the result, and then deletes
   *  the files of the two source indexes.
   *
   *  @param prefix_one prefix of the first index to merge
   *  @param prefix_two prefix of the second index to merge
   *  @param prefix_out prefix of the index to create
   *  @return true on success; false if any index could not be opened or
   *          closed, in which case the source indexes are left in place
   */
  public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
  {
    logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ", " + prefix_out + ")");
    // init
    String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
    // use StructureMerger class
    Index.setIndexLoadingProfileAsRetrieval(false);
    Index index_one = Index.createIndex(index_path, prefix_one);
    Index index_two = Index.createIndex(index_path, prefix_two);
    // runIndex() treats a null result from createIndex as "no index", so we
    // must do the same here rather than hand nulls to the merger
    if (index_one == null || index_two == null)
    {
      logger.error("Unable to open index parts \"" + prefix_one + "\" and \"" + prefix_two + "\" for merging");
      closeQuietly(index_one);
      closeQuietly(index_two);
      return false;
    }
    Index index_out = Index.createNewIndex(index_path, prefix_out);
    if (index_out == null)
    {
      logger.error("Unable to create index \"" + prefix_out + "\" to merge into");
      closeQuietly(index_one);
      closeQuietly(index_two);
      return false;
    }
    StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);
    structure_merger.mergeStructures();
    logStatistics(index_out.getCollectionStatistics());
    try
    {
      index_one.close();
      index_two.close();
      index_out.close();
    }
    catch (IOException e)
    {
      logger.error("Exception while closing indexes: ", e);
      return false;
    }
    // the parts are now contained in the merged index
    this.deleteIndex(prefix_one);
    this.deleteIndex(prefix_two);
    return true;
  }
  /** mergeIndexes(String, String, String) **/

  /** @function renameIndex
   *  Renames the files of an index so that they become the default index
   *  (property terrier.index.prefix, 'data' if unset).
   *
   *  @param prefix_in prefix of the index to rename
   */
  public void renameIndex(String prefix_in)
  {
    logger.info("FileIndexer::renameIndex(" + prefix_in + ")");
    String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
    this.renameIndex(prefix_in, default_prefix);
  }
  /** renameIndex(String) **/

  /** @function renameIndex
   *  Renames every file in Terrier's index directory whose name starts with
   *  prefix_in followed by a '.', replacing that prefix with prefix_out.
   *  A file that cannot be moved is logged and skipped.
   *
   *  @param prefix_in prefix of the index to rename
   *  @param prefix_out prefix the index files should have afterwards
   */
  public void renameIndex(String prefix_in, String prefix_out)
  {
    logger.info("FileIndexer::renameIndex(" + prefix_in + ", " + prefix_out + ")");
    String match_prefix = prefix_in + ".";
    for (File a_file : listIndexDirectory())
    {
      String file_name = a_file.getName();
      if (file_name.startsWith(match_prefix))
      {
        // everything after the prefix, starting at the '.'; cutting by
        // length stays correct even if the prefix itself contains a '.'
        String suffix = file_name.substring(prefix_in.length());
        Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
        try
        {
          Files.move(a_file.toPath(), target_path);
        }
        catch (IOException e)
        {
          logger.error("Error! Failed to rename file: " + e);
        }
      }
    }
  }
  /** renameIndex(String, String) **/

  /** @function runIndex
   *  Builds the index for the files previously loaded with loadManifest().
   *  Any existing index files with this indexer's prefix are deleted first.
   *  A BlockSinglePassIndexer is used when property
   *  desktop.indexing.singlepass is true, otherwise a BlockIndexer.
   *  Failures are logged rather than thrown.
   */
  public void runIndex()
  {
    logger.info("FileIndexer::runIndex()");
    if (this.file_list == null || this.file_list.size() == 0)
    {
      logger.error("No files specified to index. Aborting indexing process.");
      return;
    }
    try
    {
      // ensure the index directory exists
      File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
      if (!index_path.exists() && !index_path.mkdirs())
      {
        logger.error("Could not create the index folders at: "+ index_path);
        logger.error("Aborting indexing process");
        return;
      }
      // and that all old files for this index are removed
      this.deleteIndex(this.index_prefix);
      // create the appropriate indexer
      Indexer indexer;
      final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
      logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
      if (use_single_pass)
      {
        indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
      }
      else
      {
        indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
      }
      // create the simple file collection object and hand it to the indexer
      // for indexing
      SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
      indexer.index(new Collection[] { sfc });
      // output some indexing statistics
      Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
      if (the_index != null)
      {
        try
        {
          logStatistics(the_index.getCollectionStatistics());
        }
        finally
        {
          // the index was only opened to read its statistics
          the_index.close();
        }
      }
      else
      {
        logger.warn("Nothing indexed!");
      }
    }
    catch(Exception e)
    {
      logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
    }
    logger.info("FileIndexer::runIndex() - Complete!");
  }
  /** runIndex() **/

  /** @function listIndexDirectory
   *  Lists the entries of Terrier's index directory.
   *
   *  @return the entries, or an empty array (never null) when the directory
   *          is missing or cannot be read
   */
  private static File[] listIndexDirectory()
  {
    File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
    File[] files = index_path.listFiles();
    if (files == null)
    {
      logger.warn("Unable to list the index directory: " + index_path);
      return new File[0];
    }
    return files;
  }
  /** listIndexDirectory() **/

  /** @function logStatistics
   *  Logs the document, token, unique term and pointer counts of an index.
   */
  private static void logStatistics(CollectionStatistics collection_statistics)
  {
    logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
    logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
    logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
    logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
  }
  /** logStatistics(CollectionStatistics) **/

  /** @function closeQuietly
   *  Closes an index (if there is one), logging rather than propagating any
   *  failure. Used on error paths where the close result no longer matters.
   */
  private static void closeQuietly(Index index)
  {
    if (index == null)
    {
      return;
    }
    try
    {
      index.close();
    }
    catch (IOException e)
    {
      logger.error("Exception while closing indexes: ", e);
    }
  }
  /** closeQuietly(Index) **/

  /** @function writeManifests
   *  Performs a breadth-first search of the collection directory and records
   *  every file that canIndex() accepts in manifest files named
   *  manifest-NNN.spec in Terrier's var directory. When batch_size is
   *  greater than zero a new manifest is started after every batch_size
   *  files; otherwise a single manifest is written.
   */
  private static void writeManifests(FileIndexer file_indexer, Path collection_path, int batch_size)
  {
    ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
    search_paths.add(collection_path);
    // counts files in the *current* manifest, so must be reset on rollover
    int files_in_manifest = 0;
    int manifest_count = 0;
    PrintWriter manifest_writer = null;
    try
    {
      while (!search_paths.isEmpty())
      {
        Path search_path = search_paths.remove();
        // listFiles() never reports '.' or '..', but returns null when the
        // path isn't a directory or can't be read
        File[] files = search_path.toFile().listFiles();
        if (files == null)
        {
          System.err.println("Warning! Unable to list directory: " + search_path);
          continue;
        }
        for (File a_file : files)
        {
          Path file_path = a_file.toPath();
          // if this is a directory, add it to the queue of paths to search
          if (Files.isDirectory(file_path))
          {
            search_paths.add(file_path);
            continue;
          }
          // would we expect to index this file?
          String file_name = file_path.getFileName().toString();
          if (!file_indexer.canIndex(file_name))
          {
            continue;
          }
          // Ensure we opened a file handle for writing this manifest
          if (manifest_writer == null)
          {
            Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
            try
            {
              manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
            }
            catch (IOException e)
            {
              // without a manifest to write to there is nothing more we can do
              System.err.println("Error when opening manifest for writing! " + e);
              return;
            }
            System.out.println("* Writing to manifest: " + manifest_path.toString());
          }
          manifest_writer.println(file_path.toString());
          files_in_manifest++;
          // if we have a file limit, and this manifest is full, finish it so
          // the next file starts a new one
          if (batch_size > 0 && files_in_manifest >= batch_size)
          {
            manifest_writer.close();
            manifest_writer = null;
            manifest_count++;
            files_in_manifest = 0;
          }
        }
      }
    }
    finally
    {
      if (manifest_writer != null)
      {
        manifest_writer.close();
      }
    }
  }
  /** writeManifests(FileIndexer, Path, int) **/

  /** @function mergeAllIndexes
   *  Merges all of the indexes in Terrier's index directory (other than
   *  those prefixed with the default prefix) into the default index. Indexes
   *  are merged two at a time into intermediate 'tempNNN' indexes until at
   *  most two remain; a lone index is simply renamed. Merging stops at the
   *  first failed merge.
   */
  private static void mergeAllIndexes(FileIndexer file_indexer)
  {
    // Look in the default index path and locate all of the indexes to merge
    // (all those not prefixed 'data')
    ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
    // We merge the indexes two at a time, creating new intermediate indexes,
    // while there are still more than two indexes left in the queue. We
    // delete indexes that we have already merged.
    int intermediate_file_counter = 0;
    boolean merge_ok = true;
    while (merge_ok && index_parts.size() > 2)
    {
      String index_part_one_prefix = index_parts.remove();
      String index_part_two_prefix = index_parts.remove();
      String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
      logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");
      merge_ok = file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix);
      if (merge_ok)
      {
        // - add the intermediate index to the list of indexes to be merged
        index_parts.add(intermediate_index_prefix);
        // - and increment the intermediate file count for the next iteration
        intermediate_file_counter++;
      }
    }
    // A failed merge leaves nothing sensible to carry on with
    if (!merge_ok)
    {
      logger.error("Merging aborted - failed to merge index parts");
    }
    // If there are two left we merge them into the final index
    else if (index_parts.size() == 2)
    {
      String index_part_one_prefix = index_parts.remove();
      String index_part_two_prefix = index_parts.remove();
      logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");
      if (!file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix))
      {
        logger.error("Merging aborted - failed to merge index parts");
      }
    }
    // Otherwise we just rename the files that are there into the final
    // index
    else if (index_parts.size() == 1)
    {
      String index_part_prefix = index_parts.remove();
      logger.info("Renaming index \"" + index_part_prefix + "\" into final index");
      file_indexer.renameIndex(index_part_prefix);
    }
    // no index parts detected - was merge run by mistake?
    else
    {
      logger.warn("No index parts detected - merge has no effect");
    }
  }
  /** mergeAllIndexes(FileIndexer) **/

  /** @function main
   *  Command line entry point. The first argument selects the mode
   *  (-prepare, -index or -merge; anything else is treated as -prepare) and
   *  the remaining arguments are key/value pairs: -path, -prefix and
   *  -batchsize. Run without arguments for usage.
   *
   *  @param args the command line arguments
   */
  public static void main(String[] args)
  {
    System.out.println("================================= FileIndexer =================================");
    // 0. Initialization
    final int PREPARE = 0;
    final int INDEX = 1;
    final int MERGE = 2;
    int index_mode = PREPARE;
    Path a_path = null;
    String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
    int batch_size = 0; // No limit
    String divider = "===============================================================================";

    // 1. Parse arguments
    // - first argument is the mode (prepare|index|merge)
    if (args.length < 1)
    {
      System.out.println("Usage: -prepare -path <path> [-batchsize <batch_size>]");
      System.out.println("       -index -path <path> [-prefix <prefix>]");
      System.out.println("       -merge");
      System.out.println("where: paths *must* be absolute");
      System.out.println("       when preparing 'path' is to the collection directory");
      System.out.println("       when indexing 'path' is to a manifest file (generated by prepare)");
      System.out.println("       prefix is 'data' by default");
      System.out.println("       batch_size is unlimited by default");
      System.exit(0);
    }
    if (args[0].equals("-index"))
    {
      index_mode = INDEX;
    }
    else if (args[0].equals("-merge"))
    {
      index_mode = MERGE;
    }
    // - all other arguments
    for (int argc = 1; (argc + 1) < args.length; argc += 2)
    {
      String key = args[argc];
      String value = args[argc + 1];
      if (key.equals("-batchsize"))
      {
        batch_size = Integer.parseInt(value);
      }
      else if (key.equals("-path"))
      {
        a_path = Paths.get(value);
      }
      else if (key.equals("-prefix"))
      {
        index_prefix = value;
      }
      else
      {
        System.err.println("Warning! Unknown argument: " + key);
      }
    }
    // - check arguments
    if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
    {
      System.err.println("Error! Required argument -path not set or invalid.");
      System.err.println("");
      System.exit(0);
    }

    // 2. Create the file indexer
    FileIndexer file_indexer = new FileIndexer(index_prefix);

    switch(index_mode)
    {
    case PREPARE:
      // 3a. If we are in preparation mode, this is where we go through the
      //     files in the search path, adding those that are indexable by
      //     Terrier's current configuration into one or more manifest files
      //     based upon the value of batch_size.
      System.out.println("Mode: Preparation");
      System.out.println("Collection Path: " + a_path.toString());
      if (batch_size != 0)
      {
        System.out.println("Batch Size: " + batch_size);
      }
      System.out.println(divider);
      writeManifests(file_indexer, a_path, batch_size);
      break;

    case INDEX:
      // 3b. Load the manifest specified by the search path and index files
      System.out.println("Mode: Index");
      System.out.println("Manifest Path: " + a_path.toString());
      System.out.println("Prefix: " + index_prefix);
      System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
      System.out.println(divider);
      file_indexer.loadManifest(a_path);
      file_indexer.runIndex();
      break;

    case MERGE:
      // 3c. User has requested a merging process.
      System.out.println("Mode: Merge");
      System.out.println(divider);
      logger.info("[" + (System.currentTimeMillis()/1000) + "] Merging started");
      mergeAllIndexes(file_indexer);
      logger.info("[" + (System.currentTimeMillis()/1000) + "] The merging is complete!");
      break;

    default:
      logger.warn("Unknown FileIndexer mode requested");
    }

    // 4. Finish indexing
    file_indexer.close();

    // 5. Complete!
    System.out.println(divider);
    System.out.println("Complete!");
    System.out.println(divider);
    System.out.println("");
  }
  /** main() **/
}