Changeset 26237

Show
Ignore:
Timestamp:
24.09.2012 12:11:36 (7 years ago)
Author:
jmt12
Message:

Allows a maximum file count to be specified via the -maxfiles option, limiting the number of files written to manifests and hence processed by Terrier.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/opt/Terrier/FileIndexer.java

    r26194 r26237  
    1414import java.util.ArrayDeque; 
    1515import java.util.ArrayList; 
     16import java.util.Arrays; 
     17import java.util.Collections; 
    1618import java.util.List; 
    1719import java.util.HashSet; 
     
    406408    String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data"); 
    407409    int batch_size = 0; // No limit 
     410    int max_files = 0; // No limit 
    408411    String divider = "==============================================================================="; 
    409412 
     
    412415    if (args.length < 1) 
    413416    { 
    414       System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>]"); 
     417      System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>] [-maxfiles <int>]"); 
    415418      System.out.println("       <FileIndexer> -index -path <path> [-prefix <str>]"); 
    416419      System.out.println("       <FileIndexer> -merge"); 
     
    439442        batch_size = Integer.parseInt(value); 
    440443      } 
     444      else if (key.equals("-maxfiles")) 
     445      { 
     446        max_files = Integer.parseInt(value); 
     447      } 
    441448      else if (key.equals("-path")) 
    442449      { 
     
    476483        System.out.println("Batch Size: " + batch_size); 
    477484      } 
     485      if (max_files != 0) 
     486      { 
     487        System.out.println("Max Files: " + max_files); 
     488      } 
    478489      System.out.println(divider); 
    479490      //    - we're going to perform a breadth-first recursive search for files 
     
    483494      search_paths.add(a_path); 
    484495      int file_count = 0; 
     496      int total_file_count = 0; // Used in conjunction with max_files 
    485497      int manifest_count = 0; 
     498      boolean have_max_files = false; 
    486499      PrintWriter manifest_writer = null; 
    487       while (!search_paths.isEmpty()) 
     500      while (!search_paths.isEmpty() && !have_max_files) 
    488501      { 
    489502        Path search_path = search_paths.remove(); 
    490         File files[] = search_path.toFile().listFiles(); 
    491         for (int i = 0; i < files.length; i++) 
     503        File files_raw[] = search_path.toFile().listFiles(); 
     504        // randomize file order 
     505        List<File> files = Arrays.asList(files_raw); 
     506        Collections.shuffle(files); 
     507        // iterate through files filling manifests 
     508        for (int i = 0; i < files.size() && !have_max_files; i++) 
    492509        { 
    493           Path file_path = files[i].toPath(); 
     510          Path file_path = files.get(i).toPath(); 
    494511          String file_name = file_path.getFileName().toString(); 
    495512          // skip . and .. 
     
    524541              manifest_writer.println(file_path.toString()); 
    525542              file_count++; 
    526               // if we have a file limit, and we've found enough files... 
     543              total_file_count++; 
     544              // if we have a batch size, and we've found enough files... 
    527545              if (manifest_writer != null && batch_size > 0 && file_count >= batch_size) 
    528546              { 
     
    531549                manifest_count++; 
    532550                file_count = 0; 
     551              } 
     552              // if we have a max files limit and we've done enough files, then 
     553              // break out of for and while loops 
     554              if (max_files > 0 && file_count >= max_files) 
     555              { 
     556                have_max_files = true; 
    533557              } 
    534558            }