Changeset 31365 for other-projects/hathitrust/wcsa
- Timestamp:
- 2017-01-29T21:51:30+13:00 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForCatalogLangCount.java
Diff r31364 → r31365 (old line numbers on the left, new on the right):

@@ execCatalogLangCountSparkDirect @@
 46  46      public void execCatalogLangCountSparkDirect()
 47  47      {
 48   -  -       String spark_app_name = generateSparkAppName("Spark-Direct + Per Volume");
 49   -  -
 50   -  -       SparkConf conf = new SparkConf().setAppName(spark_app_name);
  -  48  +       SparkConf conf = new SparkConf().setAppName("Spark-Direct + Per Volume: Downsample");
 51  49          JavaSparkContext jsc = new JavaSparkContext(conf);
 52  50
 … …

@@ new method sampleDown added before execCatalogLangCount @@
110 108      }
111 109
  - 110  +   public void sampleDown()
  - 111  +   {
  - 112  +       String spark_app_name = generateSparkAppName("Spark Cluster + Per Volume");
  - 113  +
  - 114  +       SparkConf conf = new SparkConf().setAppName(spark_app_name);
  - 115  +       JavaSparkContext jsc = new JavaSparkContext(conf);
  - 116  +       jsc.hadoopConfiguration().set("io.compression.codec.bzip2.library", "java-builtin");
  - 117  +
  - 118  +       String packed_sequence_path = "hdfs:///user/capitanu/data/packed-ef";
  - 119  +
  - 120  +       JavaPairRDD<Text, Text> input_pair_rdd = jsc.sequenceFile(packed_sequence_path, Text.class, Text.class);
  - 121  +
  - 122  +       JavaPairRDD<Text, Text> json_text_sample_rdd = input_pair_rdd.sample(false,0.0001,42);
  - 123  +
  - 124  +       String output_directory = "packed-ef-10000";
  - 125  +       json_text_sample_rdd.saveAsTextFile(output_directory);
  - 126  +
  - 127  +
  - 128  +
  - 129  +   }
112 130      public void execCatalogLangCount()
113 131      {
114   -  -
115 132
116 133          String spark_app_name = generateSparkAppName("YARN Cluster + Per Volume");
117 134
 … …

@@ driver switched from execCatalogLangCount() to sampleDown() @@
237 254              = new ProcessForCatalogLangCount(input_dir,json_list_filename,verbosity);
238 255
239   -  -       prep_for_lang.execCatalogLangCount();
  - 256  +       //prep_for_lang.execCatalogLangCount();
  - 257  +       prep_for_lang.sampleDown();
240 258
241 259      }
Note: See TracChangeset for help on using the changeset viewer.