source: gs2-extensions/parallel-building/trunk/src/opt/Terrier/FileIndexer.java@ 26194

Last change on this file since 26194 was 26194, checked in by jmt12, 12 years ago

Fixed a bug where I wasn't resetting the number of files handled when moving to a new manifest file (so the first manifest had the correct batch size, but all subsequent ones contained only 1 file)

File size: 22.7 KB
Line 
1package org.terrier.applications;
2
3import java.io.BufferedReader;
4import java.io.BufferedOutputStream;
5import java.io.File;
6import java.io.FileInputStream;
7import java.io.FileOutputStream;
8import java.io.InputStreamReader;
9import java.io.IOException;
10import java.io.PrintWriter;
11import java.nio.file.Files;
12import java.nio.file.Path;
13import java.nio.file.Paths;
14import java.util.ArrayDeque;
15import java.util.ArrayList;
16import java.util.List;
17import java.util.HashSet;
18import java.util.Iterator;
19
20import org.apache.log4j.Logger;
21
22import org.terrier.indexing.BlockIndexer;
23import org.terrier.indexing.BlockSinglePassIndexer;
24import org.terrier.indexing.Collection;
25import org.terrier.indexing.Indexer;
26import org.terrier.indexing.SimpleFileCollection;
27import org.terrier.structures.CollectionStatistics;
28import org.terrier.structures.Index;
29import org.terrier.structures.merging.StructureMerger;
30import org.terrier.utility.ApplicationSetup;
31
32/** @class FileIndexer
33 *
34 * A simple indexer, based on DesktopTerrier, that is intended to build a
35 * searchable index from the contents of a directory in the file system. This
36 * indexer has been created with parallel processing in mind, in that you can
37 * split a large or computationally complex directory into several batches
38 * (the contents of which are recorded by manifest files) and then process
39 * each batch independently (and in parallel on a multicore machine or a
40 * cluster). Once this is complete you can then merge each pair of indexes
41 * together - eventually generating a final index encompassing all of the
42 * batches.
43 *
44 * Start by preparing the collection for indexing by locating indexable files
45 * and recording them in one or more manifest files, i.e.:
46 *
47 * ./bin/anyclass.sh org.terrier.applications.FileIndexer -prepare \
48 * -path /data/mycollection/ -batchsize 10
49 *
50 * You then build the uniquely named index (in this case with a prefix of
51 * '000') for each manifest, i.e.:
52 *
53 * ./bin/anyclass.sh org.terrier.applications.FileIndexer -index \
54 * -path /terrier/var/manifest-000.spec -prefix 000
55 *
56 * If there is more than one index (for example, with prefixes '000' and '001'),
57 * you can then merge them (into a single index with the prefix 'data') using
58 * this command:
59 *
60 * ./bin/anyclass.sh \
61 * org.terrier.applications.structures.merging.BlockStructureMerger \
62 * /terrier/var/index/ 000 /terrier/var/index/ 001 \
63 * /terrier/var/index/ data
64 *
65 */
66public class FileIndexer
67{
68 private String index_prefix;
69
70 private HashSet<String> supported_extensions;
71
72 private List<String> file_list;
73
74 protected static final Logger logger = Logger.getLogger(FileIndexer.class);
75
76 /** @function FileIndexer
77 * Default constructor
78 */
79 public FileIndexer(String index_prefix)
80 {
81 logger.info("FileIndexer::FileIndexer(" + index_prefix + ")");
82 this.index_prefix = index_prefix;
83 this.supported_extensions = new HashSet<String>();
84 this.file_list = new ArrayList<String>();
85
86 //setting properties for the application
87 if ((ApplicationSetup.getProperty("indexer.meta.forward.keys", null)) == null)
88 {
89 ApplicationSetup.setProperty("indexer.meta.forward.keys","docno,filename");
90 ApplicationSetup.setProperty("indexer.meta.forward.keylens","26,2048");
91 }
92 ApplicationSetup.setProperty("indexing.max.tokens", "10000");
93 ApplicationSetup.setProperty("invertedfile.processterms","25000");
94 ApplicationSetup.setProperty("ignore.low.idf.terms","false");
95 ApplicationSetup.setProperty("matching.dsms", "BooleanFallback");
96
97 // we need to know what extensions are supported by the indexer
98 String extension_parsers_raw = ApplicationSetup.getProperty("indexing.simplefilecollection.extensionsparsers","txt:FileDocument");
99 // parse up the string, and store the extensions (only) in the HashSet
100 if (extension_parsers_raw.length() > 0)
101 {
102 String[] extension_parsers = extension_parsers_raw.split("\\s*,\\s*");
103 // for each parser, of the form: <ext>:<documentclass>
104 for (int i = 0; i < extension_parsers.length; i++)
105 {
106 String extension_parser = extension_parsers[i];
107 String[] mapping = extension_parser.split(":");
108 // a well-formed mapping has an extension and a document class
109 if (mapping.length == 2)
110 {
111 // we store just the extension, leaving the actual mapping up to
112 // SimpleFileCollection
113 supported_extensions.add(mapping[0]);
114 }
115 else
116 {
117 logger.warn("Malformed extension parser mapping: " + extension_parser);
118 }
119 }
120 }
121 else
122 {
123 logger.error("No extension parsers defined in Terrier's properties - SimpleFileCollection unusable");
124 }
125 }
126 /** FileIndexer() **/
127
128 /** @function canIndex
129 * Given a path to a file, determine if Terrier's current configuration
130 * allows that file to be indexed
131 */
132 public boolean canIndex(String file_path)
133 {
134 logger.info("FileIndexer::canIndex(" + file_path.toString() + ")");
135 // we may have specified a default Document class for all files encountered
136 if (!ApplicationSetup.getProperty("indexing.simplefilecollection.defaultparser","").equals(""))
137 {
138 return true;
139 }
140 // otherwise, check through the list of supported file extensions to see
141 // if one matches this file
142 // - get the file's extension
143 String file_extension = file_path.substring(file_path.lastIndexOf(".") + 1);
144 // - see if it exists in the array of supported extensions
145 return this.supported_extensions.contains(file_extension);
146 }
147 /** canIndex(String) **/
148
149 /** @function close
150 */
151 public void close()
152 {
153 logger.info("FileIndexer::close()");
154 this.file_list.clear();
155 this.file_list = null;
156 this.supported_extensions.clear();
157 this.supported_extensions = null;
158 }
159 /** close() **/
160
161 /** @function deleteIndex
162 */
163 public void deleteIndex(String prefix)
164 {
165 logger.info("FileIndexer::deleteIndex(" + prefix + ")");
166 // and that all old files for this index are removed
167 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
168 File files[] = index_path.listFiles();
169 for (int i = 0; i < files.length; i++)
170 {
171 String file_name = files[i].getName();
172 if (file_name.startsWith(prefix))
173 {
174 files[i].delete();
175 }
176 }
177 }
178 /** deleteIndex(String) **/
179
180 /** @function listIndexes
181 */
182 public ArrayDeque<String> listIndexes(boolean include_default)
183 {
184 logger.info("FileIndexer::listIndexes(" + include_default + ")");
185 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
186 String default_index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
187 // we start by populating a set structure to ensure each prefix only occurs
188 // once
189 HashSet<String> index_parts_set = new HashSet<String>();
190 File files[] = index_path.listFiles();
191 for (int i = 0; i < files.length; i++)
192 {
193 String file_name = files[i].getName();
194 if (include_default || !file_name.startsWith(default_index_prefix))
195 {
196 String prefix = file_name.substring(0, file_name.indexOf("."));
197 index_parts_set.add(prefix);
198 }
199 }
200 // we then turn the hashset into a nice list (in this case a deque)
201 ArrayDeque<String> index_parts = new ArrayDeque<String>();
202 Iterator<String> index_parts_iterator = index_parts_set.iterator();
203 while (index_parts_iterator.hasNext())
204 {
205 index_parts.add(index_parts_iterator.next());
206 }
207 return index_parts;
208 }
209 /** listIndexes(boolean) **/
210
211 /**
212 */
213 public void loadManifest(Path manifest_path)
214 {
215 logger.info("FileIndexer::loadManifest(" + manifest_path.toString() + ")");
216 try
217 {
218 BufferedReader manifest_reader = new BufferedReader(new InputStreamReader(new FileInputStream(manifest_path.toFile())));
219 String line = "";
220 while ((line = manifest_reader.readLine()) != null)
221 {
222 this.file_list.add(line);
223 }
224 manifest_reader.close();
225 manifest_reader = null;
226 }
227 catch (IOException e)
228 {
229 logger.error("Exception when reading manifest! " + e);
230 }
231 }
232 /** loadManifest(Path) **/
233
234 /** @function mergeIndexes
235 */
236 public boolean mergeIndexes(String prefix_one, String prefix_two)
237 {
238 logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ")");
239 String prefix_default = ApplicationSetup.getProperty("terrier.index.prefix", "data");
240 return this.mergeIndexes(prefix_one, prefix_two, prefix_default);
241 }
242 /** mergeIndexes(String, String) **/
243
244 /** @function mergeIndexes
245 */
246 public boolean mergeIndexes(String prefix_one, String prefix_two, String prefix_out)
247 {
248 logger.info("FileIndexer::mergeIndexes(" + prefix_one + ", " + prefix_two + ", " + prefix_out + ")");
249 // init
250 String index_path = ApplicationSetup.TERRIER_INDEX_PATH;
251 // use StructureMerger class
252 Index.setIndexLoadingProfileAsRetrieval(false);
253 Index index_one = Index.createIndex(index_path, prefix_one);
254 Index index_two = Index.createIndex(index_path, prefix_two);
255 Index index_out = Index.createNewIndex(index_path, prefix_out);
256 StructureMerger structure_merger = new StructureMerger(index_one, index_two, index_out);
257 structure_merger.mergeStructures();
258 structure_merger = null;
259 CollectionStatistics collection_statistics = index_out.getCollectionStatistics();
260 logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
261 logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
262 logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
263 logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
264 collection_statistics = null;
265 try
266 {
267 index_one.close();
268 index_two.close();
269 index_out.close();
270 }
271 catch (IOException e)
272 {
273 logger.error("Exception while closing indexes: ", e);
274 return false;
275 }
276 index_one = null;
277 index_two = null;
278 index_out = null;
279 this.deleteIndex(prefix_one);
280 this.deleteIndex(prefix_two);
281 prefix_one = null;
282 prefix_two = null;
283 return true;
284 }
285 /** mergeIndexes(String, String, String) **/
286
287 /** @function renameIndex
288 */
289 public void renameIndex(String prefix_in)
290 {
291 logger.info("FileIndexer::renameIndex(" + prefix_in + ")");
292 String default_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
293 this.renameIndex(prefix_in, default_prefix);
294 }
295 /** renameIndex(String) **/
296
297 /** @function renameIndex
298 */
299 public void renameIndex(String prefix_in, String prefix_out)
300 {
301 logger.info("FileIndexer::renameIndex(" + prefix_in + ", " + prefix_out + ")");
302 prefix_in = prefix_in + ".";
303 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
304 File files[] = index_path.listFiles();
305 for (int i = 0; i < files.length; i++)
306 {
307 File a_file = files[i];
308 String file_name = a_file.getName();
309 if (file_name.startsWith(prefix_in))
310 {
311 String suffix = file_name.substring(file_name.indexOf("."));
312 Path target_path = Paths.get(ApplicationSetup.TERRIER_INDEX_PATH, prefix_out + suffix);
313 try
314 {
315 Files.move(a_file.toPath(), target_path);
316 }
317 catch (IOException e)
318 {
319 System.err.println("Error! Failed to rename file: " + e);
320 }
321 target_path = null;
322 suffix = null;
323 }
324 file_name = null;
325 a_file = null;
326 }
327 files = null;
328 index_path = null;
329 }
330 /** renameIndex(String, String) **/
331
332 /** @function runIndex
333 */
334 public void runIndex()
335 {
336 logger.info("FileIndexer::runIndex()");
337 if (this.file_list == null || this.file_list.size() == 0)
338 {
339 logger.error("No files specified to index. Aborting indexing process.");
340 return;
341 }
342
343 try
344 {
345 // ensure the index directory exists
346 File index_path = new File(ApplicationSetup.TERRIER_INDEX_PATH);
347 if (!index_path.exists() && !index_path.mkdirs())
348 {
349 logger.error("Could not create the index folders at: "+ index_path);
350 logger.error("Aborting indexing process");
351 return;
352 }
353 this.deleteIndex(this.index_prefix);
354 // create the appropriate indexer
355 Indexer indexer;
356 final boolean use_single_pass = Boolean.parseBoolean(ApplicationSetup.getProperty("desktop.indexing.singlepass", "false"));
357 logger.warn("BLOCK_INDEXING hardcoded default for SimpleFileCollection");
358 if (use_single_pass)
359 {
360 indexer = new BlockSinglePassIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
361 }
362 else
363 {
364 indexer = new BlockIndexer(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
365 }
366 // create the simple file collection object and hand it to the indexer
367 // for indexing
368 SimpleFileCollection sfc = new SimpleFileCollection(this.file_list, false);
369 indexer.index(new Collection[] { sfc });
370 Index the_index = Index.createIndex(ApplicationSetup.TERRIER_INDEX_PATH, this.index_prefix);
371 // output some indexing statistics
372 if (the_index != null)
373 {
374 CollectionStatistics collection_statistics = the_index.getCollectionStatistics();
375 logger.info("Number of Documents: " + collection_statistics.getNumberOfDocuments());
376 logger.info("Number of Tokens: " + collection_statistics.getNumberOfTokens());
377 logger.info("Number of Unique Terms: " + collection_statistics.getNumberOfUniqueTerms());
378 logger.info("Number of Pointers: " + collection_statistics.getNumberOfPointers());
379 collection_statistics = null;
380 }
381 else
382 {
383 logger.warn("Nothing indexed!");
384 }
385 }
386 catch(Exception e)
387 {
388 logger.error("An unexpected exception occured while indexing. Indexing has been aborted.",e);
389 }
390 logger.info("FileIndexer::runIndex() - Complete!");
391 }
392 /** runIndex(List<String>) **/
393
394 /** @function main
395 */
396 public static void main(String[] args)
397 {
398 System.out.println("================================= FileIndexer =================================");
399
400 // 0. Initialization
401 final int PREPARE = 0;
402 final int INDEX = 1;
403 final int MERGE = 2;
404 int index_mode = PREPARE;
405 Path a_path = null;
406 String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
407 int batch_size = 0; // No limit
408 String divider = "===============================================================================";
409
410 // 1. Parse arguments
411 // - first argument is the mode (prepare|index|merge)
412 if (args.length < 1)
413 {
414 System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>]");
415 System.out.println(" <FileIndexer> -index -path <path> [-prefix <str>]");
416 System.out.println(" <FileIndexer> -merge");
417 System.out.println("where: paths *must* be absolute");
418 System.out.println(" when preparing 'path' is to the collection directory");
419 System.out.println(" when indexing 'path' is to a manifest file (generated by prepare)");
420 System.out.println(" prefix is 'data' by default");
421 System.out.println(" batch_size is unlimited by default");
422 System.exit(0);
423 }
424 if (args[0].equals("-index"))
425 {
426 index_mode = INDEX;
427 }
428 else if (args[0].equals("-merge"))
429 {
430 index_mode = MERGE;
431 }
432 // - all other arguments
433 for (int argc = 1; (argc + 1) < args.length; argc += 2)
434 {
435 String key = args[argc];
436 String value = args[argc + 1];
437 if (key.equals("-batchsize"))
438 {
439 batch_size = Integer.parseInt(value);
440 }
441 else if (key.equals("-path"))
442 {
443 a_path = Paths.get(value);
444 }
445 else if (key.equals("-prefix"))
446 {
447 index_prefix = value;
448 }
449 else
450 {
451 System.err.println("Warning! Unknown argument: " + key);
452 }
453 }
454 // - check arguments
455 if (index_mode != MERGE && (a_path == null || !Files.exists(a_path)))
456 {
457 System.err.println("Error! Required argument -path not set or invalid.");
458 System.err.println("");
459 System.exit(0);
460 }
461
462 // 2. Create the file indexer
463 FileIndexer file_indexer = new FileIndexer(index_prefix);
464
465 switch(index_mode)
466 {
467 case PREPARE:
468 // 3a. If we are in preparation mode, this is where we go through the
469 // files in the search path, adding those that are indexable by
470 // Terrier's current configuration into one of more manifest files
471 // based upon the value of batch_size.
472 System.out.println("Mode: Preparation");
473 System.out.println("Collection Path: " + a_path.toString());
474 if (batch_size != 0)
475 {
476 System.out.println("Batch Size: " + batch_size);
477 }
478 System.out.println(divider);
479 // - we're going to perform a breadth-first recursive search for files
480 // using the canIndex() function of the indexer to determine if this
481 // is (or would be) a file to index
482 ArrayDeque<Path> search_paths = new ArrayDeque<Path>();
483 search_paths.add(a_path);
484 int file_count = 0;
485 int manifest_count = 0;
486 PrintWriter manifest_writer = null;
487 while (!search_paths.isEmpty())
488 {
489 Path search_path = search_paths.remove();
490 File files[] = search_path.toFile().listFiles();
491 for (int i = 0; i < files.length; i++)
492 {
493 Path file_path = files[i].toPath();
494 String file_name = file_path.getFileName().toString();
495 // skip . and ..
496 if (file_name.equals(".") || file_name.equals(".."))
497 {
498 // do nothing
499 }
500 // if this is a directory, add it to the queue of paths to search
501 else if (Files.isDirectory(file_path))
502 {
503 search_paths.add(file_path);
504 }
505 else
506 {
507 // would we expect to index this file?
508 if (file_indexer.canIndex(file_name))
509 {
510 // Ensure we opened a file handle for writing this manifest
511 if (manifest_writer == null)
512 {
513 Path manifest_path = Paths.get(ApplicationSetup.TERRIER_VAR, "manifest-" + String.format("%03d", manifest_count) + ".spec");
514 try
515 {
516 manifest_writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(manifest_path.toFile())));
517 }
518 catch (IOException e)
519 {
520 System.err.println("Error when opening manifest for writing! " + e);
521 }
522 System.out.println("* Writing to manifest: " + manifest_path.toString());
523 }
524 manifest_writer.println(file_path.toString());
525 file_count++;
526 // if we have a file limit, and we've found enough files...
527 if (manifest_writer != null && batch_size > 0 && file_count >= batch_size)
528 {
529 manifest_writer.close();
530 manifest_writer = null;
531 manifest_count++;
532 file_count = 0;
533 }
534 }
535 }
536 }
537 }
538 if (manifest_writer != null)
539 {
540 manifest_writer.close();
541 manifest_writer = null;
542 }
543 search_paths.clear();
544 search_paths = null;
545 break;
546
547 case INDEX:
548 // 3b. Load the manifest specified by the search path and index files
549 System.out.println("Mode: Index");
550 System.out.println("Manifest Path: " + a_path.toString());
551 System.out.println("Prefix: " + index_prefix);
552 System.out.println("Indexing: " + ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno,filename"));
553 System.out.println(divider);
554 file_indexer.loadManifest(a_path);
555 file_indexer.runIndex();
556 break;
557
558 case MERGE:
559 System.out.println("Mode: Merge");
560 System.out.println(divider);
561 // 3c. User has requested a merging process.
562 logger.info("[" + (System.currentTimeMillis()/1000) + "] Merging started");
563 // Look in the default index path and locate all of the indexes to merge
564 // (all those not prefixed 'data')
565 ArrayDeque<String> index_parts = file_indexer.listIndexes(false);
566 // We merge the indexes two at a time, creating new intermediate indexes,
567 // while there are still more than two indexes left in the queue. We
568 // delete indexes that we have already merged.
569 int intermediate_file_counter = 0;
570 while (index_parts.size() > 2)
571 {
572 String index_part_one_prefix = index_parts.remove();
573 String index_part_two_prefix = index_parts.remove();
574 String intermediate_index_prefix = "temp" + String.format("%03d", intermediate_file_counter);
575 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into index \"" + intermediate_index_prefix + "\"");
576 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix, intermediate_index_prefix);
577 index_part_one_prefix = null;
578 index_part_two_prefix = null;
579 // - add the intermediate index to the list of indexes to be merged
580 index_parts.add(intermediate_index_prefix);
581 // - and increment the intermediate file count for the next iteration
582 intermediate_file_counter++;
583 }
584 // If there are two left we merge them into the final index
585 if (index_parts.size() == 2)
586 {
587 String index_part_one_prefix = index_parts.remove();
588 String index_part_two_prefix = index_parts.remove();
589 logger.info("Merging index parts \"" + index_part_one_prefix + "\" and \"" + index_part_two_prefix + "\" into final index");
590 file_indexer.mergeIndexes(index_part_one_prefix, index_part_two_prefix);
591 index_part_one_prefix = null;
592 index_part_two_prefix = null;
593 }
594 // Otherwise we just rename the files that are there into the final
595 // index
596 else if (index_parts.size() == 1)
597 {
598 String index_part_prefix = index_parts.remove();
599 logger.info("Renaming index \"" + index_part_prefix + "\" into final index");
600 file_indexer.renameIndex(index_part_prefix);
601 index_part_prefix = null;
602 }
603 // no index parts detected - was merge run by mistake?
604 else
605 {
606 logger.warn("No index parts detected - merge has no effect");
607 }
608 // clean up
609 index_parts.clear();
610 index_parts = null;
611 logger.info("[" + (System.currentTimeMillis()/1000) + "] The merging is complete!");
612 break;
613
614 default:
615 logger.warn("Unknown FileIndexer mode requested");
616 }
617
618 // 4. Finish indexing
619 file_indexer.close();
620 file_indexer = null;
621
622 // 5. Complete!
623 System.out.println(divider);
624 System.out.println("Complete!");
625 System.out.println(divider);
626
627 System.out.println("");
628 }
629 /** main() **/
630}
Note: See TracBrowser for help on using the repository browser.