Changeset 26237


Ignore:
Timestamp:
09/24/12 12:11:36 (8 years ago)
Author:
jmt12
Message:

Allows the specification of a -maxfiles option, limiting the number of files written to manifests and hence processed by Terrier.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs2-extensions/parallel-building/trunk/src/opt/Terrier/FileIndexer.java

    r26194 r26237  
    1414import java.util.ArrayDeque;
    1515import java.util.ArrayList;
     16import java.util.Arrays;
     17import java.util.Collections;
    1618import java.util.List;
    1719import java.util.HashSet;
     
    406408    String index_prefix = ApplicationSetup.getProperty("terrier.index.prefix", "data");
    407409    int batch_size = 0; // No limit
     410    int max_files = 0; // No limit
    408411    String divider = "===============================================================================";
    409412
     
    412415    if (args.length < 1)
    413416    {
    414       System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>]");
     417      System.out.println("Usage: <FileIndexer> -prepare -path <path> [-batchsize <int>] [-maxfiles <int>]");
    415418      System.out.println("       <FileIndexer> -index -path <path> [-prefix <str>]");
    416419      System.out.println("       <FileIndexer> -merge");
     
    439442        batch_size = Integer.parseInt(value);
    440443      }
     444      else if (key.equals("-maxfiles"))
     445      {
     446        max_files = Integer.parseInt(value);
     447      }
    441448      else if (key.equals("-path"))
    442449      {
     
    476483        System.out.println("Batch Size: " + batch_size);
    477484      }
     485      if (max_files != 0)
     486      {
     487        System.out.println("Max Files: " + max_files);
     488      }
    478489      System.out.println(divider);
    479490      //    - we're going to perform a breadth-first recursive search for files
     
    483494      search_paths.add(a_path);
    484495      int file_count = 0;
     496      int total_file_count = 0; // Used in conjunction with max_files
    485497      int manifest_count = 0;
     498      boolean have_max_files = false;
    486499      PrintWriter manifest_writer = null;
    487       while (!search_paths.isEmpty())
     500      while (!search_paths.isEmpty() && !have_max_files)
    488501      {
    489502        Path search_path = search_paths.remove();
    490         File files[] = search_path.toFile().listFiles();
    491         for (int i = 0; i < files.length; i++)
     503        File files_raw[] = search_path.toFile().listFiles();
     504        // randomize file order
     505        List<File> files = Arrays.asList(files_raw);
     506        Collections.shuffle(files);
     507        // iterate through files filling manifests
     508        for (int i = 0; i < files.size() && !have_max_files; i++)
    492509        {
    493           Path file_path = files[i].toPath();
     510          Path file_path = files.get(i).toPath();
    494511          String file_name = file_path.getFileName().toString();
    495512          // skip . and ..
     
    524541              manifest_writer.println(file_path.toString());
    525542              file_count++;
    526               // if we have a file limit, and we've found enough files...
     543              total_file_count++;
     544              // if we have a batch size, and we've found enough files...
    527545              if (manifest_writer != null && batch_size > 0 && file_count >= batch_size)
    528546              {
     
    531549                manifest_count++;
    532550                file_count = 0;
     551              }
     552              // if we have a max files limit and we've done enough files, then
     553              // break out of for and while loops
     554              if (max_files > 0 && file_count >= max_files)
     555              {
     556                have_max_files = true;
    533557              }
    534558            }
Note: See TracChangeset for help on using the changeset viewer.