Changeset 31251 for other-projects/hathitrust
- Timestamp:
- 2016-12-19T15:13:52+13:00 (7 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForWhitelist.java
r31250 r31251 60 60 JavaSparkContext jsc = new JavaSparkContext(conf); 61 61 62 /*63 if (_verbosity >= 2) {64 System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());65 System.out.println("Default Parallelism: " + jsc.defaultParallelism());66 }67 */68 69 62 int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 70 63 JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache(); 71 64 json_list_data.setName("JSON-file-list"); 65 72 66 long num_volumes = json_list_data.count(); 73 67 double per_vol = 100.0/(double)num_volumes; … … 77 71 DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent"); 78 72 79 //String strict_file_io_str = System.getProperty("wcsa-ef-ingest.strict-file-io","true");80 73 boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io"); 81 74 … … 84 77 per_vol_progress_accum,per_vol, 85 78 strict_file_io); 86 JavaRDD<String> words = json_list_data.flatMap(paged_solr_wordfreq_flatmap); // .cache() *****87 79 JavaRDD<String> words = json_list_data.flatMap(paged_solr_wordfreq_flatmap); 80 words.setName("tokenized-words"); 88 81 89 82 JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() { 90 83 public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } 91 84 }); 92 85 pairs.setName("single-word-count"); 86 93 87 JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() { 94 88 public Integer call(Integer a, Integer b) { return a + b; } 95 89 }); 96 97 //counts.map(lambda (x,y): (y,x)); 98 99 100 JavaPairRDD<Integer, String> swappedPair = counts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() { 90 counts.setName("word-frequency"); 91 92 JavaPairRDD<Integer, String> swapped_pair = counts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() { 101 
93 @Override 102 94 public Tuple2<Integer, String> call(Tuple2<String, Integer> item) throws Exception { … … 105 97 106 98 }); 107 108 //JavaPairRDD<Integer, String> sorted_swapped_pair = swappedPair.sortByKey(false,num_partitions); 109 JavaPairRDD<Integer, String> sorted_swapped_pair = swappedPair.sortByKey(false,1); 110 99 swapped_pair.setName("frequency-word-swap"); 100 101 JavaPairRDD<Integer, String> sorted_swapped_pair = swapped_pair.sortByKey(false,num_partitions); 102 103 sorted_swapped_pair.setName("descending-sorted-frequency-word"); 104 111 105 JavaPairRDD<String, Integer> sorted_swaped_back_pair = sorted_swapped_pair.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() { 112 106 @Override … … 114 108 return item.swap(); 115 109 } 116 117 110 }); 118 119 /*120 121 JavaPairRDD< String, Integer> sorted_counts122 = counts.map (item -> item.swap()) // interchanges position of entries in each tuple123 .sortByKey(true, 1) // 1st arg configures ascending sort, 2nd arg configures one task124 .map(item -> item.swap());125 111 sorted_swaped_back_pair.setName("descending-word-frequency"); 112 113 /* 114 JavaPairRDD<Integer, String> counts_swapped_pair 115 = counts.mapToPair(item -> item.swap()); 116 JavaPairRDD<Integer, String> counts_swapped_pair_sorted 117 = counts_swapped_pair.sortByKey(true, 1); 118 JavaPairRDD<String, Integer> count_sorted = counts_swapped_pair_sorted.mapToPair(item -> item.swap()); 126 119 */ 127 120 128 121 129 //sorted_counts.saveAsTextFile(_json_list_filename + ".out");130 122 String filename_root = _json_list_filename.replaceAll(".*/","").replaceAll("\\..*$",""); 131 123 String output_directory = "whitelist-" + filename_root + "-out";
Note:
See TracChangeset
for help on using the changeset viewer.