Changeset 31271 for other-projects/hathitrust/wcsa/extracted-features-solr
- Timestamp: 2016-12-28T14:04:19+13:00 (7 years ago)
- Location: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumePOSStreamFlatmap.java
r31258  r31271
  41      41      JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
  42      42
  43           -  ArrayList<String> all_word_list = new ArrayList<String>();
          43   +  ArrayList<String> all_pos_list = new ArrayList<String>();
  44      44
  45      45      if (extracted_feature_record != null) {
   …       …
  62      62      }
  63      63
  64           -  if (_verbosity >= 2) {
          64   +  if (_verbosity >= 3) {
  65      65      System.out.print(" Pages: ");
  66      66      }
   …       …
  70      70      String page_id = volume_id + "." + formatted_i;
  71      71
  72           -  if (_verbosity >= 2) {
          72   +  if (_verbosity >= 3) {
  73      73      if (i>0) {
  74      74      System.out.print(", ");
   …       …
  78      78
  79      79      if (i==(ef_page_count-1)) {
  80           -  if (_verbosity >= 2) {
          80   +  if (_verbosity >= 3) {
  81      81      System.out.println();
  82      82      }
   …       …
  87      87      if (ef_page != null) {
  88      88
  89           -  ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountPOSLabels(volume_id, page_id, ef_page);
  90           -  all_word_list.addAll(page_word_list);
          89   +  ArrayList<String> page_pos_list = SolrDocJSON.generateTokenPosCountPOSLabels(volume_id, page_id, ef_page);
          90   +  all_pos_list.addAll(page_pos_list);
  91      91      }
  92      92      else {
   …       …
 109     109      _progress_accum.add(_progress_step);
 110     110
 111           -  return all_word_list.iterator();
         111   +  return all_pos_list.iterator();
 112     112      }
 113     113
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForLangCount.java
r31270  r31271
  69      69
  70      70      //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
  71           -  int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
          71   +  int files_per_partition = Integer.getInteger("wcsa-ef-ingest.files-per-partition", DEFAULT_FILES_PER_PARTITION);
  72      72
  73           -
  74           -  JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,files_per_partition).cache();
          73   +  JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
  75      74      json_list_data.setName("JSON-file-list");
  76      75
   …       …
  81      80
  82      81      JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions);
          82   +  json_list_data_rp.setName("JSON-file-list--repartitioned");
  83      83
  84      84      DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForPOSCount.java
r31264  r31271
  27      27      // http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
  28      28
  29           -  protected static final int DEFAULT_NUM_CORES = 6;
  30           -  protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
          29   +  //protected static final int DEFAULT_NUM_CORES = 6;
          30   +  //protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
          31   +  protected static final int DEFAULT_FILES_PER_PARTITION = 3000;
  31      32
  32      33      protected String _input_dir;
   …       …
  53      54      public void execPOSCount()
  54      55      {
  55           -  String spark_app_name = generateSparkAppName("Per Page");
          56   +  String spark_app_name = generateSparkAppName("Per Volume");
  56      57
  57      58      SparkConf conf = new SparkConf().setAppName(spark_app_name);
   …       …
  67      68      }
  68      69
  69           -  int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
  70           -  JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
          70   +  //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
          71   +  int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
          72   +
          73   +
          74   +  JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
  71      75      json_list_data.setName("JSON-file-list");
  72      76
   …       …
  74      78      double per_vol = 100.0/(double)num_volumes;
  75      79
          80   +  int num_partitions = (int)(num_volumes/files_per_partition)+1;
          81   +  JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions);
          82   +  json_list_data_rp.setName("JSON-file-list--repartitioned");
          83   +
  76      84      DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
  77      85
  78      86      boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
  79           -  //boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
  80      87
  81      88      PerVolumePOSStreamFlatmap paged_solr_posfreq_flatmap
   …       …
  83      90      per_vol_progress_accum,per_vol,
  84      91      strict_file_io);
  85           -  JavaRDD<String> pos_list = json_list_data.flatMap(paged_solr_posfreq_flatmap);
          92   +  JavaRDD<String> pos_list = json_list_data_rp.flatMap(paged_solr_posfreq_flatmap);
  86      93      pos_list.setName("pos-stream");
  87      94
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
r31269  r31271
 100     100
 101     101      //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
 102           -  int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
         102   +  int files_per_partition = Integer.getInteger("wcsa-ef-ingest.files-per-partition", DEFAULT_FILES_PER_PARTITION);
 103     103
 104     104      JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
Note: See TracChangeset for help on using the changeset viewer.