Changeset 31368
- Timestamp:
  2017-01-30T10:02:27+13:00 (7 years ago)
- Files:
  1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForCatalogLangCount.java
Diff r31365 → r31368 (reconstructed from the changeset viewer; Trac's old/new line-number gutters removed):

@@ imports @@
  import org.apache.commons.cli.*;
  import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.mapred.SequenceFileOutputFormat;
  import org.apache.spark.api.java.*;
  import org.apache.spark.api.java.function.Function2;

@@ execCatalogLangCountSparkDirect() — app name no longer mentions downsampling @@
  public void execCatalogLangCountSparkDirect()
  {
-     SparkConf conf = new SparkConf().setAppName("Spark-Direct + Per Volume : Downsample");
+     SparkConf conf = new SparkConf().setAppName("Spark-Direct + Per Volume");
      JavaSparkContext jsc = new JavaSparkContext(conf);

@@ sampleDown() renamed to sampleDown10000() @@
- public void sampleDown()
- {
-     String spark_app_name = generateSparkAppName("Spark Cluster + Per Volume ");
+ public void sampleDown10000()
+ {
+     String spark_app_name = generateSparkAppName("Spark Cluster + Per Volume: Downsample 10000");

      SparkConf conf = new SparkConf().setAppName(spark_app_name);
      ...
      jsc.close();
  }

@@ new method sampleDown100() added @@
+ public void sampleDown100()
+ {
+     String spark_app_name = generateSparkAppName("Spark Cluster + Per Volume: Downsample 100");
+
+     SparkConf conf = new SparkConf().setAppName(spark_app_name);
+     JavaSparkContext jsc = new JavaSparkContext(conf);
+     jsc.hadoopConfiguration().set("io.compression.codec.bzip2.library", "java-builtin");
+
+     String packed_sequence_path = "hdfs:///user/capitanu/data/packed-ef";
+
+     JavaPairRDD<Text, Text> input_pair_rdd = jsc.sequenceFile(packed_sequence_path, Text.class, Text.class);
+
+     JavaPairRDD<Text, Text> json_text_sample_rdd = input_pair_rdd.sample(false, 0.01, 42);
+
+     JavaPairRDD<Text, Text> json_text_sample_repart_rdd = json_text_sample_rdd.repartition(120);
+
+     String output_directory = "packed-full-ef-100";
+     //json_text_sample_repart_rdd.saveAsTextFile(output_directory);
+     //json_text_sample_repart_rdd.saveAsSequenceFile(output_directory);
+     json_text_sample_repart_rdd.saveAsHadoopFile(output_directory, Text.class, Text.class, SequenceFileOutputFormat.class);
+
+     jsc.close();
+ }

@@ main driver now calls the new 1% downsample @@
      //prep_for_lang.execCatalogLangCount();
-     prep_for_lang.sampleDown();
+     prep_for_lang.sampleDown100();
Note:
See TracChangeset
for help on using the changeset viewer.