Changeset 31264


Ignore:
Timestamp:
12/21/16 13:47:56 (4 years ago)
Author:
davidb
Message:

Switching to 'long' in counts to allow higher number representation

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForLangCount.java

    r31261 r31264  
    86 86   
    87 87       
    88         JavaPairRDD<String, Integer> lang_pairs = lang_list.mapToPair(s -> new Tuple2<String, Integer>(s, 1));
     88        JavaPairRDD<String, Long> lang_pairs = lang_list.mapToPair(s -> new Tuple2<String, Long>(s, 1L));
    89 89        lang_pairs.setName("single-lang-count");
    90 90       
    91         JavaPairRDD<String, Integer> lang_counts = lang_pairs.reduceByKey((a, b) -> a + b);
     91        JavaPairRDD<String, Long> lang_counts = lang_pairs.reduceByKey((a, b) -> a + b);
    92 92        lang_counts.setName("lang-frequency");
    93 93       
    94         JavaPairRDD<Integer, String> lang_counts_swapped_pair
     94        JavaPairRDD<Long, String> lang_counts_swapped_pair
    95 95            = lang_counts.mapToPair(item -> item.swap());
    96 96        lang_counts_swapped_pair.setName("frequency-lang-swap");
    97 97       
    98         JavaPairRDD<Integer, String> lang_counts_swapped_pair_sorted
     98        JavaPairRDD<Long, String> lang_counts_swapped_pair_sorted
    99 99            = lang_counts_swapped_pair.sortByKey(false, num_partitions);
    100 100        lang_counts_swapped_pair_sorted.setName("descending-sorted-frequency-lang");
    101 101       
    102         JavaPairRDD<String, Integer> lang_count_sorted
     102        JavaPairRDD<String, Long> lang_count_sorted
    103 103            = lang_counts_swapped_pair_sorted.mapToPair(item -> item.swap());
    104 104        lang_count_sorted.setName("descending-lang-frequency");
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForPOSCount.java

    r31263 r31264  
    98 98        */
    99 99       
    100         JavaPairRDD<String, Long> pos_pairs = pos_list.mapToPair(s -> new Tuple2<String, Long>(s, (long)1));
     100        JavaPairRDD<String, Long> pos_pairs = pos_list.mapToPair(s -> new Tuple2<String, Long>(s, 1L));
    101 101        pos_pairs.setName("single-pos-count");
    102 102       
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForWhitelist.java

    r31259 r31264  
    91 91        words.setName("tokenized-words");
    92 92       
    93         JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
    94             public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }
     93        JavaPairRDD<String, Long> pairs = words.mapToPair(new PairFunction<String, String, Long>() {
     94            public Tuple2<String, Long> call(String s) { return new Tuple2<String, Long>(s, 1L); }
    95 95        });
    96 96        pairs.setName("single-word-count");
    97 97       
    98         JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    99             public Integer call(Integer a, Integer b) { return a + b; }
     98        JavaPairRDD<String, Long> counts = pairs.reduceByKey(new Function2<Long, Long, Long>() {
     99            public Long call(Long a, Long b) { return a + b; }
    100 100        });
    101 101        counts.setName("word-frequency");
    102 102       
    103 103        /*
    104         JavaPairRDD<Integer, String> swapped_pair = counts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
     104        JavaPairRDD<Long, String> swapped_pair = counts.mapToPair(new PairFunction<Tuple2<String, Long>, Long, String>() {
    105 105               @Override
    106                public Tuple2<Integer, String> call(Tuple2<String, Integer> item) throws Exception {
     106               public Tuple2<Long, String> call(Tuple2<String, Long> item) throws Exception {
    107 107                   return item.swap();
    108 108               }
     
    111 111        swapped_pair.setName("frequency-word-swap");
    112 112       
    113         JavaPairRDD<Integer, String> sorted_swapped_pair = swapped_pair.sortByKey(false,num_partitions);
     113        JavaPairRDD<Long, String> sorted_swapped_pair = swapped_pair.sortByKey(false,num_partitions);
    114 114        sorted_swapped_pair.setName("descending-sorted-frequency-word");
    115 115       
    116         JavaPairRDD<String, Integer> sorted_swaped_back_pair = sorted_swapped_pair.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
     116        JavaPairRDD<String, Long> sorted_swaped_back_pair = sorted_swapped_pair.mapToPair(new PairFunction<Tuple2<Long, String>, String, Long>() {
    117 117               @Override
    118                public Tuple2<String, Integer> call(Tuple2<Integer, String> item) throws Exception {
     118               public Tuple2<String, Long> call(Tuple2<Long, String> item) throws Exception {
    119 119                   return item.swap();
    120 120               }
     
    124 124       
    125 125
    126         JavaPairRDD<Integer, String> counts_swapped_pair
     126        JavaPairRDD<Long, String> counts_swapped_pair
    127 127            = counts.mapToPair(item -> item.swap());
    128 128        counts_swapped_pair.setName("frequency-word-swap");
    129 129       
    130         JavaPairRDD<Integer, String> counts_swapped_pair_sorted
     130        JavaPairRDD<Long, String> counts_swapped_pair_sorted
    131 131            = counts_swapped_pair.sortByKey(false, num_partitions);
    132 132        counts_swapped_pair_sorted.setName("descending-sorted-frequency-word");
    133 133       
    134         JavaPairRDD<String, Integer> count_sorted
     134        JavaPairRDD<String, Long> count_sorted
    135 135            = counts_swapped_pair_sorted.mapToPair(item -> item.swap());
    136 136        count_sorted.setName("descending-word-frequency");
Note: See TracChangeset for help on using the changeset viewer.