Changeset 31362


Ignore:
Timestamp:
01/27/17 16:38:08 (4 years ago)
Author:
davidb
Message:

Use Spark sample() to produce a smaller test set when reading from Sequence files

File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForCatalogLangCount.java

    r31361 r31362  
    122122        String packed_sequence_path = "hdfs:///user/capitanu/data/packed-ef";
    123123
    124         JavaPairRDD<String, Text> inputRdd = jsc.sequenceFile(packed_sequence_path, String.class, Text.class);
    125         JavaRDD<Text> jsonTextRdd = inputRdd.map(Tuple2::_2);
    126 
     124        JavaPairRDD<Text, Text> input_pair_rdd = jsc.sequenceFile(packed_sequence_path, Text.class, Text.class);
     125        //JavaRDD<Text> jsonTextRdd = input_pair_rdd.map(Tuple2::_2);
     126        JavaRDD<Text> json_text_rdd = input_pair_rdd.map(item -> item._2);
     127       
     128        JavaRDD<Text> json_text_sample_rdd = json_text_rdd.sample(false,0.0001);
     129       
    127130        /*
    128131        jsonTextRdd.map(
     
    135138        PerVolumeCatalogLangSequenceFileMap volume_catalog_langfreq_map
    136139            = new PerVolumeCatalogLangSequenceFileMap(_input_dir,_verbosity,strict_file_io);
    137         JavaRDD<String> catalog_lang_list = jsonTextRdd.map(volume_catalog_langfreq_map);
     140        JavaRDD<String> catalog_lang_list = json_text_sample_rdd.map(volume_catalog_langfreq_map);
    138141        //catalog_lang_list.persist(StorageLevel.MEMORY_AND_DISK());
    139142        catalog_lang_list.setName("catalog-lang-stream");
Note: See TracChangeset for help on using the changeset viewer.