source: gs2-extensions/parallel-building/trunk/src/opt/Terrier/FileIndexer.java@ 26237

Last change on this file since 26237 was 26237, checked in by jmt12, 12 years ago

Allows a -maxfiles limit to be specified, capping the number of files written to manifests and hence processed by Terrier

File size: 23.6 KB
Line 
1package org.terrier.applications;
2
3import java.io.BufferedReader;
4import java.io.BufferedOutputStream;
5import java.io.File;
6import java.io.FileInputStream;
7import java.io.FileOutputStream;
8import java.io.InputStreamReader;
9import java.io.IOException;
10import java.io.PrintWriter;
11import java.nio.file.Files;
12import java.nio.file.Path;
13import java.nio.file.Paths;
14import java.util.ArrayDeque;
15import java.util.ArrayList;
16import java.util.Arrays;
17import java.util.Collections;
18import java.util.List;
19import java.util.HashSet;
20import java.util.Iterator;
21
22import org.apache.log4j.Logger;
23
24import org.terrier.indexing.BlockIndexer;
25import org.terrier.indexing.BlockSinglePassIndexer;
26import org.terrier.indexing.Collection;
27import org.terrier.indexing.Indexer;
28import org.terrier.indexing.SimpleFileCollection;
29import org.terrier.structures.CollectionStatistics;
30import org.terrier.structures.Index;
31import org.terrier.structures.merging.StructureMerger;
32import org.terrier.utility.ApplicationSetup;
33
34/** @class FileIndexer
35 *
36 * A simple indexer, based on DesktopTerrier, that is intended to build a
37 * searchable index from the contents of a directory in the file system. This
38 * indexer has been created with parallel processing in mind, in that you can
39 * split a large or computationally complex directory into several batches
40 * (the contents of which are recorded by manifest files) and then process
41 * each batch independently (and in parallel on a multicore machine or a
42 * cluster). Once this is complete you can then merge each pair of indexes
43 * together - eventually generating a final index encompassing all of the
44 * batches parts.
45 *
46 * Start by preparing the collection for indexing by locating indexable files
47 * and recording them in one or more manifest files, i.e.:
48 *
49 * ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
50 * -path /data/mycollection/ -batchsize 10
51 *
52 * You then build the uniquely named index (in this case with a prefix of
53 * '000') for each manifest, i.e.:
54 *
55 * ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
56 * -path /terrier/var/manifest-000.spec -prefix 000
57 *
58 * If there are more index (for example, with prefixes '000' and '001'), you
59 * can then merge them (into a single index with the prefix 'data') using this
60 * command:
61 *
62 * ./bin/anyclass.sh \
63 * org.terrier.applications.structures.merging.BlockStructureMerger \
64 * /terrier/var/index/ 000 /terrier/var/index/ 001 \
65 * /terrier/var/index/ data
66 *
67 */
68public class FileIndexer
69{
70 private String index_prefix;
71
72 private HashSet<String> supported_extensions;
73
74 private List<String> file_list;
75
76 protected static final Logger logger = Logger.getLogger(FileIndexer.class);
77
78 /** @function FileIndexer
79 * Default constructor
80 */
81 public FileIndexer(String index_prefix)
82 {
83 logger.info("FileIndexer::FileIndexer(" + index_prefix + ")");
84 this.index_prefix = index_prefix;
85 this.supported_extensions = new HashSet<String>();
86 this.file_list = new ArrayList<String>();
87
88 //setting properties for the application
89 if ((ApplicationSetup.getProperty("indexer.meta.forward.keys", null)) == null)
90 {
91 ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
92 ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
93 }
94 ApplicationSetup.setProperty("indexing.max.tokens", "10000");
95 ApplicationSetup.setProperty("invertedfile.processterms","25000");
96 ApplicationSetup.setProperty("ignore.low.idf.terms","false");
97 ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");
98
99 // we need to know what extensions are supported by the indexer
100 String extension_parsers_raw = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument");
101 // parse up the string, and store the extensions (only) in the HashSet
102 if (extension_parsers_raw.length() > 0)
103 {
104 String[] extension_parsers = extension_parsers_raw.split("\\s*,\\s*");
105 // for each parser, of the form: <ext>:<documentclass>
106 for (int i = 0; i < extension_parsers.length; i++)
107 {
108 String extension_parser = extension_parsers[i];
109 String[] mapping = extension_parser.split(":");
110 // a well-formed mapping has an extension and a document class
111 if (mapping.length == 2)
112 {
113 // we store just the extension, leaving the actual mapping up to
114 // SimpleFileCollection
115 supported_extensions.add(mapping[0]);
116 }
117 else
118 {
119 logger.warn("Malformed extension parser mapping: " + extension_parser);
120 }
121 }
122 }
123 else
124 {
125 logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
126 }
127 }
128 /** FileIndexer() **/
129
130 /** @function canIndex
131 * Given a path to a file, determine if Terrier's current configuration
132 * allows that file to be indexed
133 */
134 public boolean canIndex(String file_path)
135 {
136 logger.info("FileIndexer::canIndex(" + file_path.toString() + ")");
137 // we may have specified a default Document class for all files encountered
138 if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
139 {
140 return true;
141 }
142 // otherwise, check through the list of supported file extensions to see
143 // if one matches this file
144 // - get the file's extension
145 String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1);
146 // - see if it exists in the array of supported extensions
147 return this.supported_extensions.contains(file_extension);
148 }
149 /** canIndex(String) **/
150
151 /** @function close
152 */
153 public void close()
154 {
155 logger.info("FileIndexer::close()");
156 this.file_list.clear();
157 this.file_list = null;
158 this.supported_extensions.clear();
159 this.supported_extensions = null;
160 }
161 /** close() **/
162
163 /** @function deleteIndex
164 */
165 public void deleteIndex(String prefix)
166 {
167 logger.info("FileIndexer::deleteIndex(" + prefix + ")");
168 // and that all old files for this index are removed
169 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
170 File files[] = index_path.listFiles();
171 for (int i = 0; i < files.length; i++)
172 {
173 String file_name = files[i].getName();
174 if (file_name.startsWith(prefix))
175 {
176 files[i].delete();
177 }
178 }
179 }
180 /** deleteIndex(String) **/
181
182 /** @function listIndexes
183 */
184 public ArrayDeque<String> listIndexes(boolean include_default)
185 {
186 logger.info("FileIndexer::listIndexes(" + include_default + ")");
187 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
188 String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
189 // we start by populating a set structure to ensure each prefix only occurs
190 // once
191 HashSet<String> index_parts_set = new HashSet<String>();
192 File files[] = index_path.listFiles();
193 for (int i = 0; i < files.length; i++)
194 {
195 String file_name = files[i].getName();
196 if (include_default || !file_name.startsWith(default_index_prefix))
197 {
198 String prefix = file_name.substring(0, file_name.indexOf("."));
199 index_parts_set.add(prefix);
200 }
201 }
202 // we then turn the hashset into a nice list (in this case a deque)
203 ArrayDeque<String> index_parts = new ArrayDeque<String>();
204 Iterator<String> index_parts_iterator = index_parts_set.iterator();
205 while (index_parts_iterator.hasNext())
206 {
207 index_parts.add(index_parts_iterator.next());
208 }
209 return index_parts;
210 }
211 /** listIndexes(boolean) **/
212
213 /**
214 */
215 public void loadManifest(Path manifest_path)
216 {
217 logger.info("FileIndexer::loadManifest(" + manifest_path.toString() + ")");
218 try
219 {
220 BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile())));
221 String line = "";
222 while ((line = manifest_reader.readLine()) != null)
223 {
224 this.file_list.add(line);
225 }
226 manifest_reader.close();
227 manifest_reader = null;
228 }
229 catch (IOException e)
230 {
231 logger.error("Exception when reading manifest! " + e);
232 }
233 }
234 /** loadManifest(Path) **/
235
236 /** @function mergeIndexes
237 */
238 public boolean mergeIndexes(String prefix_one, String prefix_two)
239 {
240 logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");
241 String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data");
242 return this.mergeIndexes(prefix_one, prefix_two, prefix_default);
243 }
244 /** mergeIndexes(String, String) **/
245
246 /** @function mergeIndexes
247 */
248 public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
249 {
250 logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ", " + prefix_out + ")");
251 // init
252 String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
253 // use StructureMerger class
254 Index.setIndexLoadingProfileAsRetrieval(false);
255 Index index_one = Index.createIndex(index_path, prefix_one);
256 Index index_two = Index.createIndex(index_path, prefix_two);
257 Index index_out = Index.createNewIndex(index_path, prefix_out);
258 StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);
259 structure_merger.mergeStructures();
260 structure_merger = null;
261 CollectionStatistics collection_statistics = index_out.getCollectionStatistics();
262 logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
263 logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
264 logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
265 logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
266 collection_statistics = null;
267 try
268 {
269 index_one.close();
270 index_two.close();
271 index_out.close();
272 }
273 catch (IOException e)
274 {
275 logger.error("Exception while closing indexes: ", e);
276 return false;
277 }
278 index_one = null;
279 index_two = null;
280 index_out = null;
281 this.deleteIndex(prefix_one);
282 this.deleteIndex(prefix_two);
283 prefix_one = null;
284 prefix_two = null;
285 return true;
286 }
287 /** mergeIndexes(String, String, String) **/
288
289 /** @function renameIndex
290 */
291 public void renameIndex(String prefix_in)
292 {
293 logger.info("FileIndexer::renameIndex(" + prefix_in + ")");
294 String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
295 this.renameIndex(prefix_in, default_prefix);
296 }
297 /** renameIndex(String) **/
298
299 /** @function renameIndex
300 */
301 public void renameIndex(String prefix_in, String prefix_out)
302 {
303 logger.info("FileIndexer::renameIndex(" + prefix_in + ", " + prefix_out + ")");
304 prefix_in = prefix_in + ".";
305 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
306 File files[] = index_path.listFiles();
307 for (int i = 0; i < files.length; i++)
308 {
309 File a_file = files[i];
310 String file_name = a_file.getName();
311 if (file_name.startsWith(prefix_in))
312 {
313 String suffix = file_name.substring(file_name.indexOf("."));
314 Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
315 try
316 {
317 Files.move(a_file.toPath(), target_path);
318 }
319 catch (IOException e)
320 {
321 System.err.println("Error! Failed to rename file: " + e);
322 }
323 target_path = null;
324 suffix = null;
325 }
326 file_name = null;
327 a_file = null;
328 }
329 files = null;
330 index_path = null;
331 }
332 /** renameIndex(String, String) **/
333
334 /** @function runIndex
335 */
336 public void runIndex()
337 {
338 logger.info("FileIndexer::runIndex()");
339 if (this.file_list == null || this.file_list.size() == 0)
340 {
341 logger.error("No files specified to index. Aborting indexing process.");
342 return;
343 }
344
345 try
346 {
347 // ensure the index directory exists
348 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
349 if (!index_path.exists() && !index_path.mkdirs())
350 {
351 logger.error("Could not create the index folders at: "+ index_path);
352 logger.error("Aborting indexing process");
353 return;
354 }
355 this.deleteIndex(this.index_prefix);
356 // create the appropriate indexer
357 Indexer indexer;
358 final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
359 logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
360 if (use_single_pass)
361 {
362 indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
363 }
364 else
365 {
366 indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
367 }
368 // create the simple file collection object and hand it to the indexer
369 // for indexing
370 SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
371 indexer.index(new Collection[] { sfc });
372 Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
373 // output some indexing statistics
374 if (the_index != null)
375 {
376 CollectionStatistics collection_statistics = the_index.getCollectionStatistics();
377 logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
378 logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
379 logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
380 logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
381 collection_statistics = null;
382 }
383 else
384 {
385 logger.warn("Nothing indexed!");
386 }
387 }
388 catch(Exception e)
389 {
390 logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
391 }
392 logger.info("FileIndexer::runIndex() - Complete!");
393 }
394 /** runIndex(List<String>) **/
395
396 /** @function main
397 */
398 public static void main(String[] args)
399 {
400 System.out.println("================================= FileIndexer =================================");
401
402 // 0. Initialization
403 final int PREPARE = 0;
404 final int INDEX = 1;
405 final int MERGE = 2;
406 int index_mode = PREPARE;
407 Path a_path = null;
408 String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
409 int batch_size = 0; // No limit
410 int max_files = 0; // No limit
411 String divider = "===============================================================================";
412
413 // 1. Parse arguments
414 // - first argument is the mode (prepare|index|merge)
415 if (args.length < 1)
416 {
417 System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>] [-maxfiles <int>]");
418 System.out.println(" <FileIndexer> -index -path <path> [-prefix <str>]");
419 System.out.println(" <FileIndexer> -merge");
420 System.out.println("where: paths *must* be absolute");
421 System.out.println(" when preparing 'path' is to the collection directory");
422 System.out.println(" when indexing 'path' is to a manifest file (generated by prepare)");
423 System.out.println(" prefix is 'data' by default");
424 System.out.println(" batch_size is unlimited by default");
425 System.exit(0);
426 }
427 if (args[0].equals("-index"))
428 {
429 index_mode = INDEX;
430 }
431 else if (args[0].equals("-merge"))
432 {
433 index_mode = MERGE;
434 }
435 // - all other arguments
436 for (int argc = 1; (argc + 1) < args.length; argc += 2)
437 {
438 String key = args[argc];
439 String value = args[argc + 1];
440 if (key.equals("-batchsize"))
441 {
442 batch_size = Integer.parseInt(value);
443 }
444 else if (key.equals("-maxfiles"))
445 {
446 max_files = Integer.parseInt(value);
447 }
448 else if (key.equals("-path"))
449 {
450 a_path = Paths.get(value);
451 }
452 else if (key.equals("-prefix"))
453 {
454 index_prefix = value;
455 }
456 else
457 {
458 System.err.println("Warning! Unknown argument: " + key);
459 }
460 }
461 // - check arguments
462 if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
463 {
464 System.err.println("Error! Required argument -path not set or invalid.");
465 System.err.println("");
466 System.exit(0);
467 }
468
469 // 2. Create the file indexer
470 FileIndexer file_indexer = new FileIndexer(index_prefix);
471
472 switch(index_mode)
473 {
474 case PREPARE:
475 // 3a. If we are in preparation mode, this is where we go through the
476 // files in the search path, adding those that are indexable by
477 // Terrier's current configuration into one of more manifest files
478 // based upon the value of batch_size.
479 System.out.println("Mode: Preparation");
480 System.out.println("Collection Path: " + a_path.toString());
481 if (batch_size != 0)
482 {
483 System.out.println("Batch Size: " + batch_size);
484 }
485 if (max_files != 0)
486 {
487 System.out.println("Max Files: " + max_files);
488 }
489 System.out.println(divider);
490 // - we're going to perform a breadth-first recursive search for files
491 // using the canIndex() function of the indexer to determine if this
492 // is (or would be) a file to index
493 ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
494 search_paths.add(a_path);
495 int file_count = 0;
496 int total_file_count = 0; // Used in conjunction with max_files
497 int manifest_count = 0;
498 boolean have_max_files = false;
499 PrintWriter manifest_writer = null;
500 while (!search_paths.isEmpty() && !have_max_files)
501 {
502 Path search_path = search_paths.remove();
503 File files_raw[] = search_path.toFile().listFiles();
504 // randomize file order
505 List<File> files = Arrays.asList(files_raw);
506 Collections.shuffle(files);
507 // iterate through files filling manifests
508 for (int i = 0; i < files.size() && !have_max_files; i++)
509 {
510 Path file_path = files.get(i).toPath();
511 String file_name = file_path.getFileName().toString();
512 // skip . and ..
513 if (file_name.equals(".") || file_name.equals(".."))
514 {
515 // do nothing
516 }
517 // if this is a directory, add it to the queue of paths to search
518 else if (Files.isDirectory(file_path))
519 {
520 search_paths.add(file_path);
521 }
522 else
523 {
524 // would we expect to index this file?
525 if (file_indexer.canIndex(file_name))
526 {
527 // Ensure we opened a file handle for writing this manifest
528 if (manifest_writer == null)
529 {
530 Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
531 try
532 {
533 manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
534 }
535 catch (IOException e)
536 {
537 System.err.println("Error when opening manifest for writing! " + e);
538 }
539 System.out.println("* Writing to manifest: " + manifest_path.toString());
540 }
541 manifest_writer.println(file_path.toString());
542 file_count++;
543 total_file_count++;
544 // if we have a batch size, and we've found enough files...
545 if (manifest_writer != null && batch_size > 0 && file_count >= batch_size)
546 {
547 manifest_writer.close();
548 manifest_writer = null;
549 manifest_count++;
550 file_count = 0;
551 }
552 // if we have a max files limit and we've done enough files, then
553 // break out of for and while loops
554 if (max_files > 0 && file_count >= max_files)
555 {
556 have_max_files = true;
557 }
558 }
559 }
560 }
561 }
562 if (manifest_writer != null)
563 {
564 manifest_writer.close();
565 manifest_writer = null;
566 }
567 search_paths.clear();
568 search_paths = null;
569 break;
570
571 case INDEX:
572 // 3b. Load the manifest specified by the search path and index files
573 System.out.println("Mode: Index");
574 System.out.println("Manifest Path: " + a_path.toString());
575 System.out.println("Prefix: " + index_prefix);
576 System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
577 System.out.println(divider);
578 file_indexer.loadManifest(a_path);
579 file_indexer.runIndex();
580 break;
581
582 case MERGE:
583 System.out.println("Mode: Merge");
584 System.out.println(divider);
585 // 3c. User has requested a merging process.
586 logger.info("[" + (System.currentTimeMillis()/1000) + "] Merging started");
587 // Look in the default index path and locate all of the indexes to merge
588 // (all those not prefixed 'data')
589 ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
590 // We merge the indexes two at a time, creating new intermediate indexes,
591 // while there are still more than two indexes left in the queue. We
592 // delete indexes that we have already merged.
593 int intermediate_file_counter = 0;
594 while (index_parts.size() > 2)
595 {
596 String index_part_one_prefix = index_parts.remove();
597 String index_part_two_prefix = index_parts.remove();
598 String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
599 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");
600 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix);
601 index_part_one_prefix = null;
602 index_part_two_prefix = null;
603 // - add the intermediate index to the list of indexes to be merged
604 index_parts.add(intermediate_index_prefix);
605 // - and increment the intermediate file count for the next iteration
606 intermediate_file_counter++;
607 }
608 // If there are two left we merge them into the final index
609 if (index_parts.size() == 2)
610 {
611 String index_part_one_prefix = index_parts.remove();
612 String index_part_two_prefix = index_parts.remove();
613 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");
614 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix);
615 index_part_one_prefix = null;
616 index_part_two_prefix = null;
617 }
618 // Otherwise we just rename the files that are there into the final
619 // index
620 else if (index_parts.size() == 1)
621 {
622 String index_part_prefix = index_parts.remove();
623 logger.info("Renaming index \"" + index_part_prefix + "\" into final index");
624 file_indexer.renameIndex(index_part_prefix);
625 index_part_prefix = null;
626 }
627 // no index parts detected - was merge run by mistake?
628 else
629 {
630 logger.warn("No index parts detected - merge has no effect");
631 }
632 // clean up
633 index_parts.clear();
634 index_parts = null;
635 logger.info("[" + (System.currentTimeMillis()/1000) + "] The merging is complete!");
636 break;
637
638 default:
639 logger.warn("Unknown FileIndexer mode requested");
640 }
641
642 // 4. Finish indexing
643 file_indexer.close();
644 file_indexer = null;
645
646 // 5. Complete!
647 System.out.println(divider);
648 System.out.println("Complete!");
649 System.out.println(divider);
650
651 System.out.println("");
652 }
653 /** main() **/
654}
Note: See TracBrowser for help on using the repository browser.