Changeset 31095 for other-projects


Ignore:
Timestamp:
2016-11-10T18:58:06+13:00 (7 years ago)
Author:
davidb
Message:

Introduced num-partitions property

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/ef-solr.properties

    r31094 r31095  
    7 7 wcsa-ef-ingest.strict-file-io = false
    8 8
     9# For a guide on the number of partitions to use, see the "Parallelized collections" section of:
     10#   https://spark.apache.org/docs/2.0.1/programming-guide.html
     11# which suggests 2-4 partitions per CPU core (i.e. 2-4 * num_cores)
     12#
     13# For a more detailed discussion see:
     14#   http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
     15   
     16# wcsa-ef-ingest.num-partitions = 12
     17wcsa-ef-ingest.num-partitions = 120
     18
     19
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31091 r31095  
    28 28    //   http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
    29 29   
    30     public static final int NUM_CORES = 10*12;
    31     public static final int NUM_PARTITIONS = 2*NUM_CORES; // default would appear to be 2
     30    protected static final int DEFAULT_NUM_CORES = 6;
     31    protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
    32 32   
    33 33    protected String _input_dir;
     
    98 98        }
    99 99               
    100         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
     100        int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
     101       
     102        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
    101 103
    102 104        long num_volumes = json_list_data.count();
     
    144 146        }
    145 147               
    146         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
     148        int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
     149        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
    147 150
    148 151        long num_volumes = json_list_data.count();
Note: See TracChangeset for help on using the changeset viewer.