Changeset 31095 for other-projects/hathitrust
- Timestamp:
- 2016-11-10T18:58:06+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/ef-solr.properties
r31094 r31095 7 7 wcsa-ef-ingest.strict-file-io = false 8 8 9 # For guide on number of partitions to use, see "Parallelized collections" section of: 10 # https://spark.apache.org/docs/2.0.1/programming-guide.html 11 # which suggests 2-4 * num_cores 12 # 13 # For a more detailed discussion see: 14 # http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/ 15 16 # wcsa-ef-ingest.num-partitions = 12 17 wcsa-ef-ingest.num-partitions = 120 18 19 -
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
r31091 r31095 28 28 // http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/ 29 29 30 public static final int NUM_CORES = 10*12; 31 public static final int NUM_PARTITIONS = 2*NUM_CORES; // default would appear to be 2 30 protected static final int DEFAULT_NUM_CORES = 6; 31 protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES; 32 32 33 33 protected String _input_dir; … … 98 98 } 99 99 100 JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache(); 100 int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 101 102 JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache(); 101 103 102 104 long num_volumes = json_list_data.count(); … … 144 146 } 145 147 146 JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache(); 148 int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 149 JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache(); 147 150 148 151 long num_volumes = json_list_data.count();
Note:
See TracChangeset
for help on using the changeset viewer.