[26187] | 1 | package org.terrier.applications;
|
---|
| 2 |
|
---|
| 3 | import java.io.BufferedReader;
|
---|
| 4 | import java.io.BufferedOutputStream;
|
---|
| 5 | import java.io.File;
|
---|
| 6 | import java.io.FileInputStream;
|
---|
| 7 | import java.io.FileOutputStream;
|
---|
| 8 | import java.io.InputStreamReader;
|
---|
| 9 | import java.io.IOException;
|
---|
| 10 | import java.io.PrintWriter;
|
---|
| 11 | import java.nio.file.Files;
|
---|
| 12 | import java.nio.file.Path;
|
---|
| 13 | import java.nio.file.Paths;
|
---|
| 14 | import java.util.ArrayDeque;
|
---|
| 15 | import java.util.ArrayList;
|
---|
| 16 | import java.util.List;
|
---|
| 17 | import java.util.HashSet;
|
---|
| 18 | import java.util.Iterator;
|
---|
| 19 |
|
---|
| 20 | import org.apache.log4j.Logger;
|
---|
| 21 |
|
---|
| 22 | import org.terrier.indexing.BlockIndexer;
|
---|
| 23 | import org.terrier.indexing.BlockSinglePassIndexer;
|
---|
| 24 | import org.terrier.indexing.Collection;
|
---|
| 25 | import org.terrier.indexing.Indexer;
|
---|
| 26 | import org.terrier.indexing.SimpleFileCollection;
|
---|
| 27 | import org.terrier.structures.CollectionStatistics;
|
---|
| 28 | import org.terrier.structures.Index;
|
---|
| 29 | import org.terrier.structures.merging.StructureMerger;
|
---|
| 30 | import org.terrier.utility.ApplicationSetup;
|
---|
| 31 |
|
---|
| 32 | /** @class FileIndexer
|
---|
| 33 | *
|
---|
| 34 | * A simple indexer, based on DesktopTerrier, that is intended to build a
|
---|
| 35 | * searchable index from the contents of a directory in the file system. This
|
---|
| 36 | * indexer has been created with parallel processing in mind, in that you can
|
---|
| 37 | * split a large or computationally complex directory into several batches
|
---|
| 38 | * (the contents of which are recorded by manifest files) and then process
|
---|
| 39 | * each batch independently (and in parallel on a multicore machine or a
|
---|
| 40 | * cluster). Once this is complete you can then merge each pair of indexes
|
---|
| 41 | * together - eventually generating a final index encompassing all of the
|
---|
| 42 | * batches' parts.
|
---|
| 43 | *
|
---|
| 44 | * Start by preparing the collection for indexing by locating indexable files
|
---|
| 45 | * and recording them in one or more manifest files, i.e.:
|
---|
| 46 | *
|
---|
| 47 | * ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
|
---|
| 48 | * -path /data/mycollection/ -batchsize 10
|
---|
| 49 | *
|
---|
| 50 | * You then build the uniquely named index (in this case with a prefix of
|
---|
| 51 | * '000') for each manifest, i.e.:
|
---|
| 52 | *
|
---|
| 53 | * ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
|
---|
| 54 | * -path /terrier/var/manifest-000.spec -prefix 000
|
---|
| 55 | *
|
---|
| 56 | * If there is more than one index (for example, with prefixes '000' and '001'), you
|
---|
| 57 | * can then merge them (into a single index with the prefix 'data') using this
|
---|
| 58 | * command:
|
---|
| 59 | *
|
---|
| 60 | * ./bin/anyclass.sh \
|
---|
| 61 | * org.terrier.structures.merging.BlockStructureMerger \
|
---|
| 62 | * /terrier/var/index/ 000 /terrier/var/index/ 001 \
|
---|
| 63 | * /terrier/var/index/ data
|
---|
| 64 | *
|
---|
| 65 | */
|
---|
| 66 | public class FileIndexer
|
---|
| 67 | {
|
---|
| 68 | private String index_prefix;
|
---|
| 69 |
|
---|
| 70 | private HashSet<String> supported_extensions;
|
---|
| 71 |
|
---|
| 72 | private List<String> file_list;
|
---|
| 73 |
|
---|
| 74 | protected static final Logger logger = Logger.getLogger(FileIndexer.class);
|
---|
| 75 |
|
---|
| 76 | /** @function FileIndexer
|
---|
| 77 | * Default constructor
|
---|
| 78 | */
|
---|
| 79 | public FileIndexer(String index_prefix)
|
---|
| 80 | {
|
---|
| 81 | logger.info("FileIndexer::FileIndexer(" + index_prefix + ")");
|
---|
| 82 | this.index_prefix = index_prefix;
|
---|
| 83 | this.supported_extensions = new HashSet<String>();
|
---|
| 84 | this.file_list = new ArrayList<String>();
|
---|
| 85 |
|
---|
| 86 | //setting properties for the application
|
---|
| 87 | if ((ApplicationSetup.getProperty("indexer.meta.forward.keys", null)) == null)
|
---|
| 88 | {
|
---|
| 89 | ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
|
---|
| 90 | ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
|
---|
| 91 | }
|
---|
| 92 | ApplicationSetup.setProperty("indexing.max.tokens", "10000");
|
---|
| 93 | ApplicationSetup.setProperty("invertedfile.processterms","25000");
|
---|
| 94 | ApplicationSetup.setProperty("ignore.low.idf.terms","false");
|
---|
| 95 | ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");
|
---|
| 96 |
|
---|
| 97 | // we need to know what extensions are supported by the indexer
|
---|
| 98 | String extension_parsers_raw = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument");
|
---|
| 99 | // parse up the string, and store the extensions (only) in the HashSet
|
---|
| 100 | if (extension_parsers_raw.length() > 0)
|
---|
| 101 | {
|
---|
| 102 | String[] extension_parsers = extension_parsers_raw.split("\\s*,\\s*");
|
---|
| 103 | // for each parser, of the form: <ext>:<documentclass>
|
---|
| 104 | for (int i = 0; i < extension_parsers.length; i++)
|
---|
| 105 | {
|
---|
| 106 | String extension_parser = extension_parsers[i];
|
---|
| 107 | String[] mapping = extension_parser.split(":");
|
---|
| 108 | // a well-formed mapping has an extension and a document class
|
---|
| 109 | if (mapping.length == 2)
|
---|
| 110 | {
|
---|
| 111 | // we store just the extension, leaving the actual mapping up to
|
---|
| 112 | // SimpleFileCollection
|
---|
| 113 | supported_extensions.add(mapping[0]);
|
---|
| 114 | }
|
---|
| 115 | else
|
---|
| 116 | {
|
---|
| 117 | logger.warn("Malformed extension parser mapping: " + extension_parser);
|
---|
| 118 | }
|
---|
| 119 | }
|
---|
| 120 | }
|
---|
| 121 | else
|
---|
| 122 | {
|
---|
| 123 | logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
|
---|
| 124 | }
|
---|
| 125 | }
|
---|
| 126 | /** FileIndexer() **/
|
---|
| 127 |
|
---|
| 128 | /** @function canIndex
|
---|
| 129 | * Given a path to a file, determine if Terrier's current configuration
|
---|
| 130 | * allows that file to be indexed
|
---|
| 131 | */
|
---|
| 132 | public boolean canIndex(String file_path)
|
---|
| 133 | {
|
---|
| 134 | logger.info("FileIndexer::canIndex(" + file_path.toString() + ")");
|
---|
| 135 | // we may have specified a default Document class for all files encountered
|
---|
| 136 | if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
|
---|
| 137 | {
|
---|
| 138 | return true;
|
---|
| 139 | }
|
---|
| 140 | // otherwise, check through the list of supported file extensions to see
|
---|
| 141 | // if one matches this file
|
---|
| 142 | // - get the file's extension
|
---|
| 143 | String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1);
|
---|
| 144 | // - see if it exists in the array of supported extensions
|
---|
| 145 | return this.supported_extensions.contains(file_extension);
|
---|
| 146 | }
|
---|
| 147 | /** canIndex(String) **/
|
---|
| 148 |
|
---|
| 149 | /** @function close
|
---|
| 150 | */
|
---|
| 151 | public void close()
|
---|
| 152 | {
|
---|
| 153 | logger.info("FileIndexer::close()");
|
---|
| 154 | this.file_list.clear();
|
---|
| 155 | this.file_list = null;
|
---|
| 156 | this.supported_extensions.clear();
|
---|
| 157 | this.supported_extensions = null;
|
---|
| 158 | }
|
---|
| 159 | /** close() **/
|
---|
| 160 |
|
---|
| 161 | /** @function deleteIndex
|
---|
| 162 | */
|
---|
| 163 | public void deleteIndex(String prefix)
|
---|
| 164 | {
|
---|
| 165 | logger.info("FileIndexer::deleteIndex(" + prefix + ")");
|
---|
| 166 | // and that all old files for this index are removed
|
---|
| 167 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
| 168 | File files[] = index_path.listFiles();
|
---|
| 169 | for (int i = 0; i < files.length; i++)
|
---|
| 170 | {
|
---|
| 171 | String file_name = files[i].getName();
|
---|
| 172 | if (file_name.startsWith(prefix))
|
---|
| 173 | {
|
---|
| 174 | files[i].delete();
|
---|
| 175 | }
|
---|
| 176 | }
|
---|
| 177 | }
|
---|
| 178 | /** deleteIndex(String) **/
|
---|
| 179 |
|
---|
| 180 | /** @function listIndexes
|
---|
| 181 | */
|
---|
| 182 | public ArrayDeque<String> listIndexes(boolean include_default)
|
---|
| 183 | {
|
---|
| 184 | logger.info("FileIndexer::listIndexes(" + include_default + ")");
|
---|
| 185 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
| 186 | String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
| 187 | // we start by populating a set structure to ensure each prefix only occurs
|
---|
| 188 | // once
|
---|
| 189 | HashSet<String> index_parts_set = new HashSet<String>();
|
---|
| 190 | File files[] = index_path.listFiles();
|
---|
| 191 | for (int i = 0; i < files.length; i++)
|
---|
| 192 | {
|
---|
| 193 | String file_name = files[i].getName();
|
---|
| 194 | if (include_default || !file_name.startsWith(default_index_prefix))
|
---|
| 195 | {
|
---|
| 196 | String prefix = file_name.substring(0, file_name.indexOf("."));
|
---|
| 197 | index_parts_set.add(prefix);
|
---|
| 198 | }
|
---|
| 199 | }
|
---|
| 200 | // we then turn the hashset into a nice list (in this case a deque)
|
---|
| 201 | ArrayDeque<String> index_parts = new ArrayDeque<String>();
|
---|
| 202 | Iterator<String> index_parts_iterator = index_parts_set.iterator();
|
---|
| 203 | while (index_parts_iterator.hasNext())
|
---|
| 204 | {
|
---|
| 205 | index_parts.add(index_parts_iterator.next());
|
---|
| 206 | }
|
---|
| 207 | return index_parts;
|
---|
| 208 | }
|
---|
| 209 | /** listIndexes(boolean) **/
|
---|
| 210 |
|
---|
| 211 | /**
|
---|
| 212 | */
|
---|
| 213 | public void loadManifest(Path manifest_path)
|
---|
| 214 | {
|
---|
| 215 | logger.info("FileIndexer::loadManifest(" + manifest_path.toString() + ")");
|
---|
| 216 | try
|
---|
| 217 | {
|
---|
| 218 | BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile())));
|
---|
| 219 | String line = "";
|
---|
| 220 | while ((line = manifest_reader.readLine()) != null)
|
---|
| 221 | {
|
---|
| 222 | this.file_list.add(line);
|
---|
| 223 | }
|
---|
| 224 | manifest_reader.close();
|
---|
| 225 | manifest_reader = null;
|
---|
| 226 | }
|
---|
| 227 | catch (IOException e)
|
---|
| 228 | {
|
---|
| 229 | logger.error("Exception when reading manifest! " + e);
|
---|
| 230 | }
|
---|
| 231 | }
|
---|
| 232 | /** loadManifest(Path) **/
|
---|
| 233 |
|
---|
| 234 | /** @function mergeIndexes
|
---|
| 235 | */
|
---|
| 236 | public boolean mergeIndexes(String prefix_one, String prefix_two)
|
---|
| 237 | {
|
---|
| 238 | logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");
|
---|
| 239 | String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
| 240 | return this.mergeIndexes(prefix_one, prefix_two, prefix_default);
|
---|
| 241 | }
|
---|
| 242 | /** mergeIndexes(String, String) **/
|
---|
| 243 |
|
---|
| 244 | /** @function mergeIndexes
|
---|
| 245 | */
|
---|
| 246 | public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
|
---|
| 247 | {
|
---|
| 248 | logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ", " + prefix_out + ")");
|
---|
| 249 | // init
|
---|
| 250 | String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
|
---|
| 251 | // use StructureMerger class
|
---|
| 252 | Index.setIndexLoadingProfileAsRetrieval(false);
|
---|
| 253 | Index index_one = Index.createIndex(index_path, prefix_one);
|
---|
| 254 | Index index_two = Index.createIndex(index_path, prefix_two);
|
---|
| 255 | Index index_out = Index.createNewIndex(index_path, prefix_out);
|
---|
| 256 | StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);
|
---|
| 257 | structure_merger.mergeStructures();
|
---|
| 258 | structure_merger = null;
|
---|
| 259 | CollectionStatistics collection_statistics = index_out.getCollectionStatistics();
|
---|
| 260 | logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
|
---|
| 261 | logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
|
---|
| 262 | logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
|
---|
| 263 | logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
|
---|
| 264 | collection_statistics = null;
|
---|
| 265 | try
|
---|
| 266 | {
|
---|
| 267 | index_one.close();
|
---|
| 268 | index_two.close();
|
---|
| 269 | index_out.close();
|
---|
| 270 | }
|
---|
| 271 | catch (IOException e)
|
---|
| 272 | {
|
---|
| 273 | logger.error("Exception while closing indexes: ", e);
|
---|
| 274 | return false;
|
---|
| 275 | }
|
---|
| 276 | index_one = null;
|
---|
| 277 | index_two = null;
|
---|
| 278 | index_out = null;
|
---|
| 279 | this.deleteIndex(prefix_one);
|
---|
| 280 | this.deleteIndex(prefix_two);
|
---|
| 281 | prefix_one = null;
|
---|
| 282 | prefix_two = null;
|
---|
| 283 | return true;
|
---|
| 284 | }
|
---|
| 285 | /** mergeIndexes(String, String, String) **/
|
---|
| 286 |
|
---|
| 287 | /** @function renameIndex
|
---|
| 288 | */
|
---|
| 289 | public void renameIndex(String prefix_in)
|
---|
| 290 | {
|
---|
| 291 | logger.info("FileIndexer::renameIndex(" + prefix_in + ")");
|
---|
| 292 | String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
| 293 | this.renameIndex(prefix_in, default_prefix);
|
---|
| 294 | }
|
---|
| 295 | /** renameIndex(String) **/
|
---|
| 296 |
|
---|
| 297 | /** @function renameIndex
|
---|
| 298 | */
|
---|
| 299 | public void renameIndex(String prefix_in, String prefix_out)
|
---|
| 300 | {
|
---|
| 301 | logger.info("FileIndexer::renameIndex(" + prefix_in + ", " + prefix_out + ")");
|
---|
| 302 | prefix_in = prefix_in + ".";
|
---|
| 303 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
| 304 | File files[] = index_path.listFiles();
|
---|
| 305 | for (int i = 0; i < files.length; i++)
|
---|
| 306 | {
|
---|
| 307 | File a_file = files[i];
|
---|
| 308 | String file_name = a_file.getName();
|
---|
| 309 | if (file_name.startsWith(prefix_in))
|
---|
| 310 | {
|
---|
| 311 | String suffix = file_name.substring(file_name.indexOf("."));
|
---|
| 312 | Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
|
---|
| 313 | try
|
---|
| 314 | {
|
---|
| 315 | Files.move(a_file.toPath(), target_path);
|
---|
| 316 | }
|
---|
| 317 | catch (IOException e)
|
---|
| 318 | {
|
---|
| 319 | System.err.println("Error! Failed to rename file: " + e);
|
---|
| 320 | }
|
---|
| 321 | target_path = null;
|
---|
| 322 | suffix = null;
|
---|
| 323 | }
|
---|
| 324 | file_name = null;
|
---|
| 325 | a_file = null;
|
---|
| 326 | }
|
---|
| 327 | files = null;
|
---|
| 328 | index_path = null;
|
---|
| 329 | }
|
---|
| 330 | /** renameIndex(String, String) **/
|
---|
| 331 |
|
---|
| 332 | /** @function runIndex
|
---|
| 333 | */
|
---|
| 334 | public void runIndex()
|
---|
| 335 | {
|
---|
| 336 | logger.info("FileIndexer::runIndex()");
|
---|
| 337 | if (this.file_list == null || this.file_list.size() == 0)
|
---|
| 338 | {
|
---|
| 339 | logger.error("No files specified to index. Aborting indexing process.");
|
---|
| 340 | return;
|
---|
| 341 | }
|
---|
| 342 |
|
---|
| 343 | try
|
---|
| 344 | {
|
---|
| 345 | // ensure the index directory exists
|
---|
| 346 | File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
|
---|
| 347 | if (!index_path.exists() && !index_path.mkdirs())
|
---|
| 348 | {
|
---|
| 349 | logger.error("Could not create the index folders at: "+ index_path);
|
---|
| 350 | logger.error("Aborting indexing process");
|
---|
| 351 | return;
|
---|
| 352 | }
|
---|
| 353 | this.deleteIndex(this.index_prefix);
|
---|
| 354 | // create the appropriate indexer
|
---|
| 355 | Indexer indexer;
|
---|
| 356 | final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
|
---|
| 357 | logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
|
---|
| 358 | if (use_single_pass)
|
---|
| 359 | {
|
---|
| 360 | indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
| 361 | }
|
---|
| 362 | else
|
---|
| 363 | {
|
---|
| 364 | indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
| 365 | }
|
---|
| 366 | // create the simple file collection object and hand it to the indexer
|
---|
| 367 | // for indexing
|
---|
| 368 | SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
|
---|
| 369 | indexer.index(new Collection[] { sfc });
|
---|
| 370 | Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
|
---|
| 371 | // output some indexing statistics
|
---|
| 372 | if (the_index != null)
|
---|
| 373 | {
|
---|
| 374 | CollectionStatistics collection_statistics = the_index.getCollectionStatistics();
|
---|
| 375 | logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
|
---|
| 376 | logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
|
---|
| 377 | logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
|
---|
| 378 | logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
|
---|
| 379 | collection_statistics = null;
|
---|
| 380 | }
|
---|
| 381 | else
|
---|
| 382 | {
|
---|
| 383 | logger.warn("Nothing indexed!");
|
---|
| 384 | }
|
---|
| 385 | }
|
---|
| 386 | catch(Exception e)
|
---|
| 387 | {
|
---|
| 388 | logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
|
---|
| 389 | }
|
---|
| 390 | logger.info("FileIndexer::runIndex() - Complete!");
|
---|
| 391 | }
|
---|
| 392 | /** runIndex() **/
|
---|
| 393 |
|
---|
| 394 | /** @function main
|
---|
| 395 | */
|
---|
| 396 | public static void main(String[] args)
|
---|
| 397 | {
|
---|
| 398 | System.out.println("================================= FileIndexer =================================");
|
---|
| 399 |
|
---|
| 400 | // 0. Initialization
|
---|
| 401 | final int PREPARE = 0;
|
---|
| 402 | final int INDEX = 1;
|
---|
| 403 | final int MERGE = 2;
|
---|
| 404 | int index_mode = PREPARE;
|
---|
| 405 | Path a_path = null;
|
---|
| 406 | String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
|
---|
| 407 | int batch_size = 0; // No limit
|
---|
| 408 | String divider = "===============================================================================";
|
---|
| 409 |
|
---|
| 410 | // 1. Parse arguments
|
---|
| 411 | // - first argument is the mode (prepare|index|merge)
|
---|
| 412 | if (args.length < 1)
|
---|
| 413 | {
|
---|
| 414 | System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>]");
|
---|
| 415 | System.out.println(" <FileIndexer> -index -path <path> [-prefix <str>]");
|
---|
| 416 | System.out.println(" <FileIndexer> -merge");
|
---|
| 417 | System.out.println("where: paths *must* be absolute");
|
---|
| 418 | System.out.println(" when preparing 'path' is to the collection directory");
|
---|
| 419 | System.out.println(" when indexing 'path' is to a manifest file (generated by prepare)");
|
---|
| 420 | System.out.println(" prefix is 'data' by default");
|
---|
| 421 | System.out.println(" batch_size is unlimited by default");
|
---|
| 422 | System.exit(0);
|
---|
| 423 | }
|
---|
| 424 | if (args[0].equals("-index"))
|
---|
| 425 | {
|
---|
| 426 | index_mode = INDEX;
|
---|
| 427 | }
|
---|
| 428 | else if (args[0].equals("-merge"))
|
---|
| 429 | {
|
---|
| 430 | index_mode = MERGE;
|
---|
| 431 | }
|
---|
| 432 | // - all other arguments
|
---|
| 433 | for (int argc = 1; (argc + 1) < args.length; argc += 2)
|
---|
| 434 | {
|
---|
| 435 | String key = args[argc];
|
---|
| 436 | String value = args[argc + 1];
|
---|
| 437 | if (key.equals("-batchsize"))
|
---|
| 438 | {
|
---|
| 439 | batch_size = Integer.parseInt(value);
|
---|
| 440 | }
|
---|
| 441 | else if (key.equals("-path"))
|
---|
| 442 | {
|
---|
| 443 | a_path = Paths.get(value);
|
---|
| 444 | }
|
---|
| 445 | else if (key.equals("-prefix"))
|
---|
| 446 | {
|
---|
| 447 | index_prefix = value;
|
---|
| 448 | }
|
---|
| 449 | else
|
---|
| 450 | {
|
---|
| 451 | System.err.println("Warning! Unknown argument: " + key);
|
---|
| 452 | }
|
---|
| 453 | }
|
---|
| 454 | // - check arguments
|
---|
| 455 | if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
|
---|
| 456 | {
|
---|
| 457 | System.err.println("Error! Required argument -path not set or invalid.");
|
---|
| 458 | System.err.println("");
|
---|
| 459 | System.exit(0);
|
---|
| 460 | }
|
---|
| 461 |
|
---|
| 462 | // 2. Create the file indexer
|
---|
| 463 | FileIndexer file_indexer = new FileIndexer(index_prefix);
|
---|
| 464 |
|
---|
| 465 | switch(index_mode)
|
---|
| 466 | {
|
---|
| 467 | case PREPARE:
|
---|
| 468 | // 3a. If we are in preparation mode, this is where we go through the
|
---|
| 469 | // files in the search path, adding those that are indexable by
|
---|
| 470 | // Terrier's current configuration into one of more manifest files
|
---|
| 471 | // based upon the value of batch_size.
|
---|
| 472 | System.out.println("Mode: Preparation");
|
---|
| 473 | System.out.println("Collection Path: " + a_path.toString());
|
---|
| 474 | if (batch_size != 0)
|
---|
| 475 | {
|
---|
| 476 | System.out.println("Batch Size: " + batch_size);
|
---|
| 477 | }
|
---|
| 478 | System.out.println(divider);
|
---|
| 479 | // - we're going to perform a breadth-first recursive search for files
|
---|
| 480 | // using the canIndex() function of the indexer to determine if this
|
---|
| 481 | // is (or would be) a file to index
|
---|
| 482 | ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
|
---|
| 483 | search_paths.add(a_path);
|
---|
| 484 | int file_count = 0;
|
---|
| 485 | int manifest_count = 0;
|
---|
| 486 | PrintWriter manifest_writer = null;
|
---|
| 487 | while (!search_paths.isEmpty())
|
---|
| 488 | {
|
---|
| 489 | Path search_path = search_paths.remove();
|
---|
| 490 | File files[] = search_path.toFile().listFiles();
|
---|
| 491 | for (int i = 0; i < files.length; i++)
|
---|
| 492 | {
|
---|
| 493 | Path file_path = files[i].toPath();
|
---|
| 494 | String file_name = file_path.getFileName().toString();
|
---|
| 495 | // skip . and ..
|
---|
| 496 | if (file_name.equals(".") || file_name.equals(".."))
|
---|
| 497 | {
|
---|
| 498 | // do nothing
|
---|
| 499 | }
|
---|
| 500 | // if this is a directory, add it to the queue of paths to search
|
---|
| 501 | else if (Files.isDirectory(file_path))
|
---|
| 502 | {
|
---|
| 503 | search_paths.add(file_path);
|
---|
| 504 | }
|
---|
| 505 | else
|
---|
| 506 | {
|
---|
| 507 | // would we expect to index this file?
|
---|
| 508 | if (file_indexer.canIndex(file_name))
|
---|
| 509 | {
|
---|
| 510 | // Ensure we opened a file handle for writing this manifest
|
---|
| 511 | if (manifest_writer == null)
|
---|
| 512 | {
|
---|
| 513 | Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
|
---|
| 514 | try
|
---|
| 515 | {
|
---|
| 516 | manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
|
---|
| 517 | }
|
---|
| 518 | catch (IOException e)
|
---|
| 519 | {
|
---|
| 520 | System.err.println("Error when opening manifest for writing! " + e);
|
---|
| 521 | }
|
---|
| 522 | System.out.println("* Writing to manifest: " + manifest_path.toString());
|
---|
| 523 | }
|
---|
| 524 | manifest_writer.println(file_path.toString());
|
---|
| 525 | file_count++;
|
---|
| 526 | // if we have a file limit, and we've found enough files...
|
---|
| 527 | if (manifest_writer != null && batch_size > 0 && file_count >= batch_size)
|
---|
| 528 | {
|
---|
| 529 | manifest_writer.close();
|
---|
| 530 | manifest_writer = null;
|
---|
| 531 | manifest_count++;
|
---|
| 532 | }
|
---|
| 533 | }
|
---|
| 534 | }
|
---|
| 535 | }
|
---|
| 536 | }
|
---|
| 537 | if (manifest_writer != null)
|
---|
| 538 | {
|
---|
| 539 | manifest_writer.close();
|
---|
| 540 | manifest_writer = null;
|
---|
| 541 | }
|
---|
| 542 | search_paths.clear();
|
---|
| 543 | search_paths = null;
|
---|
| 544 | break;
|
---|
| 545 |
|
---|
| 546 | case INDEX:
|
---|
| 547 | // 3b. Load the manifest specified by the search path and index files
|
---|
| 548 | System.out.println("Mode: Index");
|
---|
| 549 | System.out.println("Manifest Path: " + a_path.toString());
|
---|
| 550 | System.out.println("Prefix: " + index_prefix);
|
---|
| 551 | System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
|
---|
| 552 | System.out.println(divider);
|
---|
| 553 | file_indexer.loadManifest(a_path);
|
---|
| 554 | file_indexer.runIndex();
|
---|
| 555 | break;
|
---|
| 556 |
|
---|
| 557 | case MERGE:
|
---|
| 558 | System.out.println("Mode: Merge");
|
---|
| 559 | System.out.println(divider);
|
---|
| 560 | // 3c. User has requested a merging process.
|
---|
| 561 | logger.info("[" + (System.currentTimeMillis()/1000) + "] Merging started");
|
---|
| 562 | // Look in the default index path and locate all of the indexes to merge
|
---|
| 563 | // (all those not prefixed 'data')
|
---|
| 564 | ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
|
---|
| 565 | // We merge the indexes two at a time, creating new intermediate indexes,
|
---|
| 566 | // while there are still more than two indexes left in the queue. We
|
---|
| 567 | // delete indexes that we have already merged.
|
---|
| 568 | int intermediate_file_counter = 0;
|
---|
| 569 | while (index_parts.size() > 2)
|
---|
| 570 | {
|
---|
| 571 | String index_part_one_prefix = index_parts.remove();
|
---|
| 572 | String index_part_two_prefix = index_parts.remove();
|
---|
| 573 | String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
|
---|
| 574 | logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");
|
---|
| 575 | file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix);
|
---|
| 576 | index_part_one_prefix = null;
|
---|
| 577 | index_part_two_prefix = null;
|
---|
| 578 | // - add the intermediate index to the list of indexes to be merged
|
---|
| 579 | index_parts.add(intermediate_index_prefix);
|
---|
| 580 | // - and increment the intermediate file count for the next iteration
|
---|
| 581 | intermediate_file_counter++;
|
---|
| 582 | }
|
---|
| 583 | // If there are two left we merge them into the final index
|
---|
| 584 | if (index_parts.size() == 2)
|
---|
| 585 | {
|
---|
| 586 | String index_part_one_prefix = index_parts.remove();
|
---|
| 587 | String index_part_two_prefix = index_parts.remove();
|
---|
| 588 | logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");
|
---|
| 589 | file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix);
|
---|
| 590 | index_part_one_prefix = null;
|
---|
| 591 | index_part_two_prefix = null;
|
---|
| 592 | }
|
---|
| 593 | // Otherwise we just rename the files that are there into the final
|
---|
| 594 | // index
|
---|
| 595 | else if (index_parts.size() == 1)
|
---|
| 596 | {
|
---|
| 597 | String index_part_prefix = index_parts.remove();
|
---|
| 598 | logger.info("Renaming index \"" + index_part_prefix + "\" into final index");
|
---|
| 599 | file_indexer.renameIndex(index_part_prefix);
|
---|
| 600 | index_part_prefix = null;
|
---|
| 601 | }
|
---|
| 602 | // no index parts detected - was merge run by mistake?
|
---|
| 603 | else
|
---|
| 604 | {
|
---|
| 605 | logger.warn("No index parts detected - merge has no effect");
|
---|
| 606 | }
|
---|
| 607 | // clean up
|
---|
| 608 | index_parts.clear();
|
---|
| 609 | index_parts = null;
|
---|
| 610 | logger.info("[" + (System.currentTimeMillis()/1000) + "] The merging is complete!");
|
---|
| 611 | break;
|
---|
| 612 |
|
---|
| 613 | default:
|
---|
| 614 | logger.warn("Unknown FileIndexer mode requested");
|
---|
| 615 | }
|
---|
| 616 |
|
---|
| 617 | // 4. Finish indexing
|
---|
| 618 | file_indexer.close();
|
---|
| 619 | file_indexer = null;
|
---|
| 620 |
|
---|
| 621 | // 5. Complete!
|
---|
| 622 | System.out.println(divider);
|
---|
| 623 | System.out.println("Complete!");
|
---|
| 624 | System.out.println(divider);
|
---|
| 625 |
|
---|
| 626 | System.out.println("");
|
---|
| 627 | }
|
---|
| 628 | /** main() **/
|
---|
| 629 | } |
---|