Changeset 26237
- Timestamp: 2012-09-24T12:11:36+12:00 (12 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/parallel-building/trunk/src/opt/Terrier/FileIndexer.java
r26194 r26237 14 14 import java.util.ArrayDeque; 15 15 import java.util.ArrayList; 16 import java.util.Arrays; 17 import java.util.Collections; 16 18 import java.util.List; 17 19 import java.util.HashSet; … … 406 408 String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data"); 407 409 int batch_size = 0; // No limit 410 int max_files = 0; // No limit 408 411 String divider = "==============================================================================="; 409 412 … … 412 415 if (args.length < 1) 413 416 { 414 System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>] ");417 System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>] [-maxfiles <int>]"); 415 418 System.out.println(" <FileIndexer> -index -path <path> [-prefix <str>]"); 416 419 System.out.println(" <FileIndexer> -merge"); … … 439 442 batch_size = Integer.parseInt(value); 440 443 } 444 else if (key.equals("-maxfiles")) 445 { 446 max_files = Integer.parseInt(value); 447 } 441 448 else if (key.equals("-path")) 442 449 { … … 476 483 System.out.println("Batch Size: " + batch_size); 477 484 } 485 if (max_files != 0) 486 { 487 System.out.println("Max Files: " + max_files); 488 } 478 489 System.out.println(divider); 479 490 // - we're going to perform a breadth-first recursive search for files … … 483 494 search_paths.add(a_path); 484 495 int file_count = 0; 496 int total_file_count = 0; // Used in conjunction with max_files 485 497 int manifest_count = 0; 498 boolean have_max_files = false; 486 499 PrintWriter manifest_writer = null; 487 while (!search_paths.isEmpty() )500 while (!search_paths.isEmpty() && !have_max_files) 488 501 { 489 502 Path search_path = search_paths.remove(); 490 File files[] = search_path.toFile().listFiles(); 491 for (int i = 0; i < files.length; i++) 503 File files_raw[] = search_path.toFile().listFiles(); 504 // randomize file order 505 List<File> files = Arrays.asList(files_raw); 506 Collections.shuffle(files); 
507 // iterate through files filling manifests 508 for (int i = 0; i < files.size() && !have_max_files; i++) 492 509 { 493 Path file_path = files [i].toPath();510 Path file_path = files.get(i).toPath(); 494 511 String file_name = file_path.getFileName().toString(); 495 512 // skip . and .. … … 524 541 manifest_writer.println(file_path.toString()); 525 542 file_count++; 526 // if we have a file limit, and we've found enough files... 543 total_file_count++; 544 // if we have a batch size, and we've found enough files... 527 545 if (manifest_writer != null && batch_size > 0 && file_count >= batch_size) 528 546 { … … 531 549 manifest_count++; 532 550 file_count = 0; 551 } 552 // if we have a max files limit and we've done enough files, then 553 // break out of for and while loops 554 if (max_files > 0 && file_count >= max_files) 555 { 556 have_max_files = true; 533 557 } 534 558 }
Note: See TracChangeset for help on using the changeset viewer.