Changeset 31264

Show
Ignore:
Timestamp:
21.12.2016 13:47:56 (3 years ago)
Author:
davidb
Message:

Switching the counts from 'Integer' to 'Long' so that larger count values can be represented

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForLangCount.java

    r31261 r31264  
    8686     
    8787         
    88         JavaPairRDD<String, Integer> lang_pairs = lang_list.mapToPair(s -> new Tuple2<String, Integer>(s, 1)); 
     88        JavaPairRDD<String, Long> lang_pairs = lang_list.mapToPair(s -> new Tuple2<String, Long>(s, 1L)); 
    8989        lang_pairs.setName("single-lang-count"); 
    9090         
    91         JavaPairRDD<String, Integer> lang_counts = lang_pairs.reduceByKey((a, b) -> a + b); 
     91        JavaPairRDD<String, Long> lang_counts = lang_pairs.reduceByKey((a, b) -> a + b); 
    9292        lang_counts.setName("lang-frequency"); 
    9393         
    94         JavaPairRDD<Integer, String> lang_counts_swapped_pair 
     94        JavaPairRDD<Long, String> lang_counts_swapped_pair 
    9595            = lang_counts.mapToPair(item -> item.swap()); 
    9696        lang_counts_swapped_pair.setName("frequency-lang-swap"); 
    9797         
    98         JavaPairRDD<Integer, String> lang_counts_swapped_pair_sorted  
     98        JavaPairRDD<Long, String> lang_counts_swapped_pair_sorted  
    9999            = lang_counts_swapped_pair.sortByKey(false, num_partitions); 
    100100        lang_counts_swapped_pair_sorted.setName("descending-sorted-frequency-lang"); 
    101101         
    102         JavaPairRDD<String, Integer> lang_count_sorted  
     102        JavaPairRDD<String, Long> lang_count_sorted  
    103103            = lang_counts_swapped_pair_sorted.mapToPair(item -> item.swap()); 
    104104        lang_count_sorted.setName("descending-lang-frequency"); 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForPOSCount.java

    r31263 r31264  
    9898        */ 
    9999         
    100         JavaPairRDD<String, Long> pos_pairs = pos_list.mapToPair(s -> new Tuple2<String, Long>(s, (long)1)); 
     100        JavaPairRDD<String, Long> pos_pairs = pos_list.mapToPair(s -> new Tuple2<String, Long>(s, 1L)); 
    101101        pos_pairs.setName("single-pos-count"); 
    102102         
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForWhitelist.java

    r31259 r31264  
    9191        words.setName("tokenized-words"); 
    9292         
    93         JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() { 
    94             public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); } 
     93        JavaPairRDD<String, Long> pairs = words.mapToPair(new PairFunction<String, String, Long>() { 
     94            public Tuple2<String, Long> call(String s) { return new Tuple2<String, Long>(s, 1L); } 
    9595        }); 
    9696        pairs.setName("single-word-count"); 
    9797         
    98         JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() { 
    99             public Integer call(Integer a, Integer b) { return a + b; } 
     98        JavaPairRDD<String, Long> counts = pairs.reduceByKey(new Function2<Long, Long, Long>() { 
     99            public Long call(Long a, Long b) { return a + b; } 
    100100        }); 
    101101        counts.setName("word-frequency"); 
    102102         
    103103        /* 
    104         JavaPairRDD<Integer, String> swapped_pair = counts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() { 
     104        JavaPairRDD<Long, String> swapped_pair = counts.mapToPair(new PairFunction<Tuple2<String, Long>, Long, String>() { 
    105105               @Override 
    106                public Tuple2<Integer, String> call(Tuple2<String, Integer> item) throws Exception { 
     106               public Tuple2<Long, String> call(Tuple2<String, Long> item) throws Exception { 
    107107                   return item.swap(); 
    108108               } 
     
    111111        swapped_pair.setName("frequency-word-swap"); 
    112112         
    113         JavaPairRDD<Integer, String> sorted_swapped_pair = swapped_pair.sortByKey(false,num_partitions); 
     113        JavaPairRDD<Long, String> sorted_swapped_pair = swapped_pair.sortByKey(false,num_partitions); 
    114114        sorted_swapped_pair.setName("descending-sorted-frequency-word"); 
    115115         
    116         JavaPairRDD<String, Integer> sorted_swaped_back_pair = sorted_swapped_pair.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() { 
     116        JavaPairRDD<String, Long> sorted_swaped_back_pair = sorted_swapped_pair.mapToPair(new PairFunction<Tuple2<Long, String>, String, Long>() { 
    117117               @Override 
    118                public Tuple2<String, Integer> call(Tuple2<Integer, String> item) throws Exception { 
     118               public Tuple2<String, Long> call(Tuple2<Long, String> item) throws Exception { 
    119119                   return item.swap(); 
    120120               } 
     
    124124         
    125125 
    126         JavaPairRDD<Integer, String> counts_swapped_pair 
     126        JavaPairRDD<Long, String> counts_swapped_pair 
    127127            = counts.mapToPair(item -> item.swap()); 
    128128        counts_swapped_pair.setName("frequency-word-swap"); 
    129129         
    130         JavaPairRDD<Integer, String> counts_swapped_pair_sorted  
     130        JavaPairRDD<Long, String> counts_swapped_pair_sorted  
    131131            = counts_swapped_pair.sortByKey(false, num_partitions); 
    132132        counts_swapped_pair_sorted.setName("descending-sorted-frequency-word"); 
    133133         
    134         JavaPairRDD<String, Integer> count_sorted  
     134        JavaPairRDD<String, Long> count_sorted  
    135135            = counts_swapped_pair_sorted.mapToPair(item -> item.swap()); 
    136136        count_sorted.setName("descending-word-frequency");