Changeset 31271

Show
Ignore:
Timestamp:
28.12.2016 14:04:19 (2 years ago)
Author:
davidb
Message:

Updating of POS code to new files-per-partition parameter, plus some other related tweaks

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumePOSStreamFlatmap.java

    r31258 r31271  
    4141        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in); 
    4242         
    43         ArrayList<String> all_word_list = new ArrayList<String>(); 
     43        ArrayList<String> all_pos_list = new ArrayList<String>(); 
    4444         
    4545        if (extracted_feature_record != null) { 
     
    6262            } 
    6363     
    64             if (_verbosity >= 2) { 
     64            if (_verbosity >= 3) { 
    6565                System.out.print("  Pages: "); 
    6666            } 
     
    7070                String page_id = volume_id + "." + formatted_i; 
    7171 
    72                 if (_verbosity >= 2) { 
     72                if (_verbosity >= 3) { 
    7373                    if (i>0) { 
    7474                        System.out.print(", "); 
     
    7878 
    7979                if (i==(ef_page_count-1)) { 
    80                     if (_verbosity >= 2) { 
     80                    if (_verbosity >= 3) { 
    8181                        System.out.println(); 
    8282                    } 
     
    8787                if (ef_page != null) { 
    8888                     
    89                     ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountPOSLabels(volume_id, page_id, ef_page);                  
    90                     all_word_list.addAll(page_word_list); 
     89                    ArrayList<String> page_pos_list = SolrDocJSON.generateTokenPosCountPOSLabels(volume_id, page_id, ef_page);                   
     90                    all_pos_list.addAll(page_pos_list); 
    9191                } 
    9292                else { 
     
    109109        _progress_accum.add(_progress_step); 
    110110         
    111         return all_word_list.iterator(); 
     111        return all_pos_list.iterator(); 
    112112    } 
    113113     
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForLangCount.java

    r31270 r31271  
    6969         
    7070        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 
    71         int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION); 
     71        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.files-per-partition", DEFAULT_FILES_PER_PARTITION); 
    7272         
    73          
    74         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,files_per_partition).cache(); 
     73        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache(); 
    7574        json_list_data.setName("JSON-file-list"); 
    7675         
     
    8180         
    8281        JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions); 
     82        json_list_data_rp.setName("JSON-file-list--repartitioned"); 
    8383         
    8484        DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent"); 
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForPOSCount.java

    r31264 r31271  
    2727    //   http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/ 
    2828     
    29     protected static final int DEFAULT_NUM_CORES = 6; 
    30     protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;  
     29    //protected static final int DEFAULT_NUM_CORES = 6; 
     30    //protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;  
     31    protected static final int DEFAULT_FILES_PER_PARTITION = 3000; 
    3132     
    3233    protected String _input_dir; 
     
    5354    public void execPOSCount() 
    5455    {    
    55         String spark_app_name = generateSparkAppName("Per Page");        
     56        String spark_app_name = generateSparkAppName("Per Volume");      
    5657         
    5758        SparkConf conf = new SparkConf().setAppName(spark_app_name); 
     
    6768        } 
    6869         
    69         int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 
    70         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache(); 
     70        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 
     71        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION); 
     72         
     73         
     74        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache(); 
    7175        json_list_data.setName("JSON-file-list"); 
    7276         
     
    7478        double per_vol = 100.0/(double)num_volumes; 
    7579 
     80        int num_partitions = (int)(num_volumes/files_per_partition)+1; 
     81        JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions); 
     82        json_list_data_rp.setName("JSON-file-list--repartitioned"); 
     83         
    7684        DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent"); 
    7785         
    7886        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io"); 
    79         //boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize"); 
    8087         
    8188        PerVolumePOSStreamFlatmap paged_solr_posfreq_flatmap  
     
    8390                                     per_vol_progress_accum,per_vol, 
    8491                                     strict_file_io); 
    85         JavaRDD<String> pos_list = json_list_data.flatMap(paged_solr_posfreq_flatmap);  
     92        JavaRDD<String> pos_list = json_list_data_rp.flatMap(paged_solr_posfreq_flatmap);  
    8693        pos_list.setName("pos-stream"); 
    8794         
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31269 r31271  
    100100             
    101101        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS); 
    102         int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION); 
     102        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.files-per-partition", DEFAULT_FILES_PER_PARTITION); 
    103103         
    104104        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();