Ignore:
Timestamp:
2016-12-28T14:04:19+13:00 (7 years ago)
Author:
davidb
Message:

Updating of POS code to new files-per-partition parameter, plus some other related tweaks

Location:
other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumePOSStreamFlatmap.java

    r31258 r31271  
    4141        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
    4242       
    43         ArrayList<String> all_word_list = new ArrayList<String>();
     43        ArrayList<String> all_pos_list = new ArrayList<String>();
    4444       
    4545        if (extracted_feature_record != null) {
     
    6262            }
    6363   
    64             if (_verbosity >= 2) {
     64            if (_verbosity >= 3) {
    6565                System.out.print("  Pages: ");
    6666            }
     
    7070                String page_id = volume_id + "." + formatted_i;
    7171
    72                 if (_verbosity >= 2) {
     72                if (_verbosity >= 3) {
    7373                    if (i>0) {
    7474                        System.out.print(", ");
     
    7878
    7979                if (i==(ef_page_count-1)) {
    80                     if (_verbosity >= 2) {
     80                    if (_verbosity >= 3) {
    8181                        System.out.println();
    8282                    }
     
    8787                if (ef_page != null) {
    8888                   
    89                     ArrayList<String> page_word_list = SolrDocJSON.generateTokenPosCountPOSLabels(volume_id, page_id, ef_page);                 
    90                     all_word_list.addAll(page_word_list);
     89                    ArrayList<String> page_pos_list = SolrDocJSON.generateTokenPosCountPOSLabels(volume_id, page_id, ef_page);                 
     90                    all_pos_list.addAll(page_pos_list);
    9191                }
    9292                else {
     
    109109        _progress_accum.add(_progress_step);
    110110       
    111         return all_word_list.iterator();
     111        return all_pos_list.iterator();
    112112    }
    113113   
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForLangCount.java

    r31270 r31271  
    6969       
    7070        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
    71         int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
     71        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.files-per-partition", DEFAULT_FILES_PER_PARTITION);
    7272       
    73        
    74         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,files_per_partition).cache();
     73        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
    7574        json_list_data.setName("JSON-file-list");
    7675       
     
    8180       
    8281        JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions);
     82        json_list_data_rp.setName("JSON-file-list--repartitioned");
    8383       
    8484        DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForPOSCount.java

    r31264 r31271  
    2727    //   http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
    2828   
    29     protected static final int DEFAULT_NUM_CORES = 6;
    30     protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
     29    //protected static final int DEFAULT_NUM_CORES = 6;
     30    //protected static final int DEFAULT_NUM_PARTITIONS = 3*DEFAULT_NUM_CORES;
     31    protected static final int DEFAULT_FILES_PER_PARTITION = 3000;
    3132   
    3233    protected String _input_dir;
     
    5354    public void execPOSCount()
    5455    {   
    55         String spark_app_name = generateSparkAppName("Per Page");       
     56        String spark_app_name = generateSparkAppName("Per Volume");     
    5657       
    5758        SparkConf conf = new SparkConf().setAppName(spark_app_name);
     
    6768        }
    6869       
    69         int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
    70         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,num_partitions).cache();
     70        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
     71        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
     72       
     73       
     74        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
    7175        json_list_data.setName("JSON-file-list");
    7276       
     
    7478        double per_vol = 100.0/(double)num_volumes;
    7579
     80        int num_partitions = (int)(num_volumes/files_per_partition)+1;
     81        JavaRDD<String> json_list_data_rp = json_list_data.repartition(num_partitions);
     82        json_list_data_rp.setName("JSON-file-list--repartitioned");
     83       
    7684        DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
    7785       
    7886        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
    79         //boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
    8087       
    8188        PerVolumePOSStreamFlatmap paged_solr_posfreq_flatmap
     
    8390                                     per_vol_progress_accum,per_vol,
    8491                                     strict_file_io);
    85         JavaRDD<String> pos_list = json_list_data.flatMap(paged_solr_posfreq_flatmap);
     92        JavaRDD<String> pos_list = json_list_data_rp.flatMap(paged_solr_posfreq_flatmap);
    8693        pos_list.setName("pos-stream");
    8794       
  • other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31269 r31271  
    100100           
    101101        //int num_partitions = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_NUM_PARTITIONS);
    102         int files_per_partition = Integer.getInteger("wcsa-ef-ingest.num-partitions", DEFAULT_FILES_PER_PARTITION);
     102        int files_per_partition = Integer.getInteger("wcsa-ef-ingest.files-per-partition", DEFAULT_FILES_PER_PARTITION);
    103103       
    104104        JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename).cache();
Note: See TracChangeset for help on using the changeset viewer.