Changeset 31095 for other-projects

Show
Ignore:
Timestamp:
10.11.2016 18:58:06 (3 years ago)
Author:
davidb
Message:

Introduced num-partitions property

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/ef-solr.properties

    r31094 r31095  
    77wcsa-ef-ingest.strict-file-io = false 
    88 
     9# For guidance on the number of partitions to use, see the "Parallelized collections" section of: 
     10#   https://spark.apache.org/docs/2.0.1/programming-guide.html 
     11# which suggests 2-4 * num_cores 
     12# 
     13# For a more detailed discussion see: 
     14#   http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/ 
     15     
     16# wcsa-ef-ingest.num-partitions = 12 
     17wcsa-ef-ingest.num-partitions = 120 
     18 
     19 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31091 r31095  
    2828    //   http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/ 
    2929     
    30     public static final int NUM_CORES = 10*12; 
    31     public static final int NUM_PARTITIONS = 2*NUM_CORES; // default would appear to be 2 
     30    protected static final int DEFAULT_NUM_CORES = 6; 
     31    protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;  
    3232     
    3333    protected String _input_dir; 
     
    9898        } 
    9999                 
    100         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache(); 
     100        int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 
     101         
     102        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache(); 
    101103 
    102104        long num_volumes = json_list_data.count(); 
     
    144146        } 
    145147                 
    146         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache(); 
     148        int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 
     149        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache(); 
    147150 
    148151        long num_volumes = json_list_data.count();