Changeset 31270 for other-projects/hathitrust/wcsa
- Timestamp: 2016-12-28T10:36:17+13:00 (7 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForLangCount.java
r31264 r31270
27 27   // http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
28 28
29    - protected static final int DEFAULT_NUM_CORES = 6;
30    - protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
   29 + //protected static final int DEFAULT_NUM_CORES = 6;
   30 + //protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
   31 + protected static final int DEFAULT_FILES_PER_PARTITION = 3000;
31 32
32 33   protected String _input_dir;
 …  …
53 54   public void execLangCount()
54 55   {
55    - String spark_app_name = generateSparkAppName("Per Page");
   56 + String spark_app_name = generateSparkAppName("Per Volume");
56 57
57 58   SparkConf conf = new SparkConf().setAppName(spark_app_name);
 …  …
67 68   }
68 69
69    - int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
70    - JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
   70 + //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
   71 + int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
   72 +
   73 +
   74 + JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,files_per_partition).cache();
71 75   json_list_data.setName("JSON-file-list");
72 76
 …  …
74 78   double per_vol = 100.0/(double)num_volumes;
75 79
   80 + int num_partitions = (int)(num_volumes/files_per_partition)+1;
   81 +
   82 + JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions);
   83 +
76 84   DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
77 85
 …  …
82 90   per_vol_progress_accum,per_vol,
83 91   strict_file_io);
84    - JavaRDD<String> lang_list = json_list_data .flatMap(paged_solr_langfreq_flatmap);
   92 + JavaRDD<String> lang_list = json_list_data_rp.flatMap(paged_solr_langfreq_flatmap);
85 93   lang_list.setName("lang-stream");
86 94
Note: See TracChangeset for help on using the changeset viewer.