source: gs2-extensions/parallel-building/trunk/src/opt/Terrier/FileIndexer.java@ 26187

Last change on this file since 26187 was 26187, checked in by jmt12, 12 years ago

Adding the rest of parallel processing support for Terrier into SVN. You've got the new simple file indexer java source code, and a nice wrapper script to make launching parallel Terrier ingests easy as.

File size: 22.7 KB
Line 
1package org.terrier.applications;
2
3import java.io.BufferedReader;
4import java.io.BufferedOutputStream;
5import java.io.File;
6import java.io.FileInputStream;
7import java.io.FileOutputStream;
8import java.io.InputStreamReader;
9import java.io.IOException;
10import java.io.PrintWriter;
11import java.nio.file.Files;
12import java.nio.file.Path;
13import java.nio.file.Paths;
14import java.util.ArrayDeque;
15import java.util.ArrayList;
16import java.util.List;
17import java.util.HashSet;
18import java.util.Iterator;
19
20import org.apache.log4j.Logger;
21
22import org.terrier.indexing.BlockIndexer;
23import org.terrier.indexing.BlockSinglePassIndexer;
24import org.terrier.indexing.Collection;
25import org.terrier.indexing.Indexer;
26import org.terrier.indexing.SimpleFileCollection;
27import org.terrier.structures.CollectionStatistics;
28import org.terrier.structures.Index;
29import org.terrier.structures.merging.StructureMerger;
30import org.terrier.utility.ApplicationSetup;
31
32/** @class FileIndexer
33 *
34 * A simple indexer, based on DesktopTerrier, that is intended to build a
35 * searchable index from the contents of a directory in the file system. This
36 * indexer has been created with parallel processing in mind, in that you can
37 * split a large or computationally complex directory into several batches
38 * (the contents of which are recorded by manifest files) and then process
39 * each batch independently (and in parallel on a multicore machine or a
40 * cluster). Once this is complete you can then merge each pair of indexes
41 * together - eventually generating a final index encompassing all of the
42 * batches parts.
43 *
44 * Start by preparing the collection for indexing by locating indexable files
45 * and recording them in one or more manifest files, i.e.:
46 *
47 * ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
48 * -path /data/mycollection/ -batchsize 10
49 *
50 * You then build the uniquely named index (in this case with a prefix of
51 * '000') for each manifest, i.e.:
52 *
53 * ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
54 * -path /terrier/var/manifest-000.spec -prefix 000
55 *
56 * If there is more than one index (for example, with prefixes '000' and '001'), you
57 * can then merge them (into a single index with the prefix 'data') using this
58 * command:
59 *
60 * ./bin/anyclass.sh \
61 * org.terrier.applications.structures.merging.BlockStructureMerger \
62 * /terrier/var/index/ 000 /terrier/var/index/ 001 \
63 * /terrier/var/index/ data
64 *
65 */
66public class FileIndexer
67{
68 private String index_prefix;
69
70 private HashSet<String> supported_extensions;
71
72 private List<String> file_list;
73
74 protected static final Logger logger = Logger.getLogger(FileIndexer.class);
75
76 /** @function FileIndexer
77 * Default constructor
78 */
79 public FileIndexer(String index_prefix)
80 {
81 logger.info("FileIndexer::FileIndexer(" + index_prefix + ")");
82 this.index_prefix = index_prefix;
83 this.supported_extensions = new HashSet<String>();
84 this.file_list = new ArrayList<String>();
85
86 //setting properties for the application
87 if ((ApplicationSetup.getProperty("indexer.meta.forward.keys", null)) == null)
88 {
89 ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
90 ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
91 }
92 ApplicationSetup.setProperty("indexing.max.tokens", "10000");
93 ApplicationSetup.setProperty("invertedfile.processterms","25000");
94 ApplicationSetup.setProperty("ignore.low.idf.terms","false");
95 ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");
96
97 // we need to know what extensions are supported by the indexer
98 String extension_parsers_raw = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument");
99 // parse up the string, and store the extensions (only) in the HashSet
100 if (extension_parsers_raw.length() > 0)
101 {
102 String[] extension_parsers = extension_parsers_raw.split("\\s*,\\s*");
103 // for each parser, of the form: <ext>:<documentclass>
104 for (int i = 0; i < extension_parsers.length; i++)
105 {
106 String extension_parser = extension_parsers[i];
107 String[] mapping = extension_parser.split(":");
108 // a well-formed mapping has an extension and a document class
109 if (mapping.length == 2)
110 {
111 // we store just the extension, leaving the actual mapping up to
112 // SimpleFileCollection
113 supported_extensions.add(mapping[0]);
114 }
115 else
116 {
117 logger.warn("Malformed extension parser mapping: " + extension_parser);
118 }
119 }
120 }
121 else
122 {
123 logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
124 }
125 }
126 /** FileIndexer() **/
127
128 /** @function canIndex
129 * Given a path to a file, determine if Terrier's current configuration
130 * allows that file to be indexed
131 */
132 public boolean canIndex(String file_path)
133 {
134 logger.info("FileIndexer::canIndex(" + file_path.toString() + ")");
135 // we may have specified a default Document class for all files encountered
136 if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
137 {
138 return true;
139 }
140 // otherwise, check through the list of supported file extensions to see
141 // if one matches this file
142 // - get the file's extension
143 String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1);
144 // - see if it exists in the array of supported extensions
145 return this.supported_extensions.contains(file_extension);
146 }
147 /** canIndex(String) **/
148
149 /** @function close
150 */
151 public void close()
152 {
153 logger.info("FileIndexer::close()");
154 this.file_list.clear();
155 this.file_list = null;
156 this.supported_extensions.clear();
157 this.supported_extensions = null;
158 }
159 /** close() **/
160
161 /** @function deleteIndex
162 */
163 public void deleteIndex(String prefix)
164 {
165 logger.info("FileIndexer::deleteIndex(" + prefix + ")");
166 // and that all old files for this index are removed
167 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
168 File files[] = index_path.listFiles();
169 for (int i = 0; i < files.length; i++)
170 {
171 String file_name = files[i].getName();
172 if (file_name.startsWith(prefix))
173 {
174 files[i].delete();
175 }
176 }
177 }
178 /** deleteIndex(String) **/
179
180 /** @function listIndexes
181 */
182 public ArrayDeque<String> listIndexes(boolean include_default)
183 {
184 logger.info("FileIndexer::listIndexes(" + include_default + ")");
185 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
186 String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
187 // we start by populating a set structure to ensure each prefix only occurs
188 // once
189 HashSet<String> index_parts_set = new HashSet<String>();
190 File files[] = index_path.listFiles();
191 for (int i = 0; i < files.length; i++)
192 {
193 String file_name = files[i].getName();
194 if (include_default || !file_name.startsWith(default_index_prefix))
195 {
196 String prefix = file_name.substring(0, file_name.indexOf("."));
197 index_parts_set.add(prefix);
198 }
199 }
200 // we then turn the hashset into a nice list (in this case a deque)
201 ArrayDeque<String> index_parts = new ArrayDeque<String>();
202 Iterator<String> index_parts_iterator = index_parts_set.iterator();
203 while (index_parts_iterator.hasNext())
204 {
205 index_parts.add(index_parts_iterator.next());
206 }
207 return index_parts;
208 }
209 /** listIndexes(boolean) **/
210
211 /**
212 */
213 public void loadManifest(Path manifest_path)
214 {
215 logger.info("FileIndexer::loadManifest(" + manifest_path.toString() + ")");
216 try
217 {
218 BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile())));
219 String line = "";
220 while ((line = manifest_reader.readLine()) != null)
221 {
222 this.file_list.add(line);
223 }
224 manifest_reader.close();
225 manifest_reader = null;
226 }
227 catch (IOException e)
228 {
229 logger.error("Exception when reading manifest! " + e);
230 }
231 }
232 /** loadManifest(Path) **/
233
234 /** @function mergeIndexes
235 */
236 public boolean mergeIndexes(String prefix_one, String prefix_two)
237 {
238 logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");
239 String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data");
240 return this.mergeIndexes(prefix_one, prefix_two, prefix_default);
241 }
242 /** mergeIndexes(String, String) **/
243
244 /** @function mergeIndexes
245 */
246 public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
247 {
248 logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ", " + prefix_out + ")");
249 // init
250 String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
251 // use StructureMerger class
252 Index.setIndexLoadingProfileAsRetrieval(false);
253 Index index_one = Index.createIndex(index_path, prefix_one);
254 Index index_two = Index.createIndex(index_path, prefix_two);
255 Index index_out = Index.createNewIndex(index_path, prefix_out);
256 StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);
257 structure_merger.mergeStructures();
258 structure_merger = null;
259 CollectionStatistics collection_statistics = index_out.getCollectionStatistics();
260 logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
261 logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
262 logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
263 logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
264 collection_statistics = null;
265 try
266 {
267 index_one.close();
268 index_two.close();
269 index_out.close();
270 }
271 catch (IOException e)
272 {
273 logger.error("Exception while closing indexes: ", e);
274 return false;
275 }
276 index_one = null;
277 index_two = null;
278 index_out = null;
279 this.deleteIndex(prefix_one);
280 this.deleteIndex(prefix_two);
281 prefix_one = null;
282 prefix_two = null;
283 return true;
284 }
285 /** mergeIndexes(String, String, String) **/
286
287 /** @function renameIndex
288 */
289 public void renameIndex(String prefix_in)
290 {
291 logger.info("FileIndexer::renameIndex(" + prefix_in + ")");
292 String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
293 this.renameIndex(prefix_in, default_prefix);
294 }
295 /** renameIndex(String) **/
296
297 /** @function renameIndex
298 */
299 public void renameIndex(String prefix_in, String prefix_out)
300 {
301 logger.info("FileIndexer::renameIndex(" + prefix_in + ", " + prefix_out + ")");
302 prefix_in = prefix_in + ".";
303 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
304 File files[] = index_path.listFiles();
305 for (int i = 0; i < files.length; i++)
306 {
307 File a_file = files[i];
308 String file_name = a_file.getName();
309 if (file_name.startsWith(prefix_in))
310 {
311 String suffix = file_name.substring(file_name.indexOf("."));
312 Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
313 try
314 {
315 Files.move(a_file.toPath(), target_path);
316 }
317 catch (IOException e)
318 {
319 System.err.println("Error! Failed to rename file: " + e);
320 }
321 target_path = null;
322 suffix = null;
323 }
324 file_name = null;
325 a_file = null;
326 }
327 files = null;
328 index_path = null;
329 }
330 /** renameIndex(String, String) **/
331
332 /** @function runIndex
333 */
334 public void runIndex()
335 {
336 logger.info("FileIndexer::runIndex()");
337 if (this.file_list == null || this.file_list.size() == 0)
338 {
339 logger.error("No files specified to index. Aborting indexing process.");
340 return;
341 }
342
343 try
344 {
345 // ensure the index directory exists
346 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
347 if (!index_path.exists() && !index_path.mkdirs())
348 {
349 logger.error("Could not create the index folders at: "+ index_path);
350 logger.error("Aborting indexing process");
351 return;
352 }
353 this.deleteIndex(this.index_prefix);
354 // create the appropriate indexer
355 Indexer indexer;
356 final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
357 logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
358 if (use_single_pass)
359 {
360 indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
361 }
362 else
363 {
364 indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
365 }
366 // create the simple file collection object and hand it to the indexer
367 // for indexing
368 SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
369 indexer.index(new Collection[] { sfc });
370 Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
371 // output some indexing statistics
372 if (the_index != null)
373 {
374 CollectionStatistics collection_statistics = the_index.getCollectionStatistics();
375 logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
376 logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
377 logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
378 logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
379 collection_statistics = null;
380 }
381 else
382 {
383 logger.warn("Nothing indexed!");
384 }
385 }
386 catch(Exception e)
387 {
388 logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
389 }
390 logger.info("FileIndexer::runIndex() - Complete!");
391 }
392 /** runIndex() **/
393
394 /** @function main
395 */
396 public static void main(String[] args)
397 {
398 System.out.println("================================= FileIndexer =================================");
399
400 // 0. Initialization
401 final int PREPARE = 0;
402 final int INDEX = 1;
403 final int MERGE = 2;
404 int index_mode = PREPARE;
405 Path a_path = null;
406 String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
407 int batch_size = 0; // No limit
408 String divider = "===============================================================================";
409
410 // 1. Parse arguments
411 // - first argument is the mode (prepare|index|merge)
412 if (args.length < 1)
413 {
414 System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>]");
415 System.out.println(" <FileIndexer> -index -path <path> [-prefix <str>]");
416 System.out.println(" <FileIndexer> -merge");
417 System.out.println("where: paths *must* be absolute");
418 System.out.println(" when preparing 'path' is to the collection directory");
419 System.out.println(" when indexing 'path' is to a manifest file (generated by prepare)");
420 System.out.println(" prefix is 'data' by default");
421 System.out.println(" batch_size is unlimited by default");
422 System.exit(0);
423 }
424 if (args[0].equals("-index"))
425 {
426 index_mode = INDEX;
427 }
428 else if (args[0].equals("-merge"))
429 {
430 index_mode = MERGE;
431 }
432 // - all other arguments
433 for (int argc = 1; (argc + 1) < args.length; argc += 2)
434 {
435 String key = args[argc];
436 String value = args[argc + 1];
437 if (key.equals("-batchsize"))
438 {
439 batch_size = Integer.parseInt(value);
440 }
441 else if (key.equals("-path"))
442 {
443 a_path = Paths.get(value);
444 }
445 else if (key.equals("-prefix"))
446 {
447 index_prefix = value;
448 }
449 else
450 {
451 System.err.println("Warning! Unknown argument: " + key);
452 }
453 }
454 // - check arguments
455 if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
456 {
457 System.err.println("Error! Required argument -path not set or invalid.");
458 System.err.println("");
459 System.exit(0);
460 }
461
462 // 2. Create the file indexer
463 FileIndexer file_indexer = new FileIndexer(index_prefix);
464
465 switch(index_mode)
466 {
467 case PREPARE:
468 // 3a. If we are in preparation mode, this is where we go through the
469 // files in the search path, adding those that are indexable by
470 // Terrier's current configuration into one of more manifest files
471 // based upon the value of batch_size.
472 System.out.println("Mode: Preparation");
473 System.out.println("Collection Path: " + a_path.toString());
474 if (batch_size != 0)
475 {
476 System.out.println("Batch Size: " + batch_size);
477 }
478 System.out.println(divider);
479 // - we're going to perform a breadth-first recursive search for files
480 // using the canIndex() function of the indexer to determine if this
481 // is (or would be) a file to index
482 ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
483 search_paths.add(a_path);
484 int file_count = 0;
485 int manifest_count = 0;
486 PrintWriter manifest_writer = null;
487 while (!search_paths.isEmpty())
488 {
489 Path search_path = search_paths.remove();
490 File files[] = search_path.toFile().listFiles();
491 for (int i = 0; i < files.length; i++)
492 {
493 Path file_path = files[i].toPath();
494 String file_name = file_path.getFileName().toString();
495 // skip . and ..
496 if (file_name.equals(".") || file_name.equals(".."))
497 {
498 // do nothing
499 }
500 // if this is a directory, add it to the queue of paths to search
501 else if (Files.isDirectory(file_path))
502 {
503 search_paths.add(file_path);
504 }
505 else
506 {
507 // would we expect to index this file?
508 if (file_indexer.canIndex(file_name))
509 {
510 // Ensure we opened a file handle for writing this manifest
511 if (manifest_writer == null)
512 {
513 Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
514 try
515 {
516 manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
517 }
518 catch (IOException e)
519 {
520 System.err.println("Error when opening manifest for writing! " + e);
521 }
522 System.out.println("* Writing to manifest: " + manifest_path.toString());
523 }
524 manifest_writer.println(file_path.toString());
525 file_count++;
526 // if we have a file limit, and we've found enough files...
527 if (manifest_writer != null && batch_size > 0 && file_count >= batch_size)
528 {
529 manifest_writer.close();
530 manifest_writer = null;
531 manifest_count++;
532 }
533 }
534 }
535 }
536 }
537 if (manifest_writer != null)
538 {
539 manifest_writer.close();
540 manifest_writer = null;
541 }
542 search_paths.clear();
543 search_paths = null;
544 break;
545
546 case INDEX:
547 // 3b. Load the manifest specified by the search path and index files
548 System.out.println("Mode: Index");
549 System.out.println("Manifest Path: " + a_path.toString());
550 System.out.println("Prefix: " + index_prefix);
551 System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
552 System.out.println(divider);
553 file_indexer.loadManifest(a_path);
554 file_indexer.runIndex();
555 break;
556
557 case MERGE:
558 System.out.println("Mode: Merge");
559 System.out.println(divider);
560 // 3c. User has requested a merging process.
561 logger.info("[" + (System.currentTimeMillis()/1000) + "] Merging started");
562 // Look in the default index path and locate all of the indexes to merge
563 // (all those not prefixed 'data')
564 ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
565 // We merge the indexes two at a time, creating new intermediate indexes,
566 // while there are still more than two indexes left in the queue. We
567 // delete indexes that we have already merged.
568 int intermediate_file_counter = 0;
569 while (index_parts.size() > 2)
570 {
571 String index_part_one_prefix = index_parts.remove();
572 String index_part_two_prefix = index_parts.remove();
573 String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
574 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");
575 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix);
576 index_part_one_prefix = null;
577 index_part_two_prefix = null;
578 // - add the intermediate index to the list of indexes to be merged
579 index_parts.add(intermediate_index_prefix);
580 // - and increment the intermediate file count for the next iteration
581 intermediate_file_counter++;
582 }
583 // If there are two left we merge them into the final index
584 if (index_parts.size() == 2)
585 {
586 String index_part_one_prefix = index_parts.remove();
587 String index_part_two_prefix = index_parts.remove();
588 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");
589 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix);
590 index_part_one_prefix = null;
591 index_part_two_prefix = null;
592 }
593 // Otherwise we just rename the files that are there into the final
594 // index
595 else if (index_parts.size() == 1)
596 {
597 String index_part_prefix = index_parts.remove();
598 logger.info("Renaming index \"" + index_part_prefix + "\" into final index");
599 file_indexer.renameIndex(index_part_prefix);
600 index_part_prefix = null;
601 }
602 // no index parts detected - was merge run by mistake?
603 else
604 {
605 logger.warn("No index parts detected - merge has no effect");
606 }
607 // clean up
608 index_parts.clear();
609 index_parts = null;
610 logger.info("[" + (System.currentTimeMillis()/1000) + "] The merging is complete!");
611 break;
612
613 default:
614 logger.warn("Unknown FileIndexer mode requested");
615 }
616
617 // 4. Finish indexing
618 file_indexer.close();
619 file_indexer = null;
620
621 // 5. Complete!
622 System.out.println(divider);
623 System.out.println("Complete!");
624 System.out.println(divider);
625
626 System.out.println("");
627 }
628 /** main() **/
629}
Note: See TracBrowser for help on using the repository browser.