Timestamp:
2016-10-29T15:45:38+13:00
Author:
davidb
Message:

Introduction of a Spark accumulator to measure progress. The output of the POST is now read in and its status checked.

File:
1 edited

  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/PrepareForIngest.java

--- PrepareForIngest.java (r30979)
+++ PrepareForIngest.java (r30984)
@@ -5,4 +5,5 @@
 
 import org.apache.spark.api.java.*;
+import org.apache.spark.util.DoubleAccumulator;
 import org.apache.spark.SparkConf;
 
@@ -46,5 +47,16 @@
         JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
 
-        PagedJSON paged_json = new PagedJSON(_input_dir, _solr_url,_output_dir,_verbosity);
+        long num_volumes = json_list_data.count();
+        double per_vol = 100.0/(double)num_volumes;
+
+        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("ProgressPercent");
+
+        //sc.parallelize(Arrays.asList(1, 2, 3, 4)).foreach(x -> accum.add(x));
+        // ...
+        // 10/09/29 18:41:08 INFO SparkContext: Tasks finished in 0.317106 s
+
+        //accum.value();
+
+        PagedJSON paged_json = new PagedJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
         JavaRDD<String> json_ids = json_list_data.flatMap(paged_json).cache();
 
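The hunk above only shows the driver-side setup: the DoubleAccumulator is registered on the SparkContext, each volume is worth per_vol = 100/num_volumes percent of the job, and both values are passed into PagedJSON. How PagedJSON advances the accumulator is not part of this changeset, so the following is only a sketch of the worker-side increment; everything except DoubleAccumulator and FlatMapFunction (class, field, and variable names) is assumed for illustration.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.util.DoubleAccumulator;

// Sketch only: a FlatMapFunction that bumps the shared accumulator once per
// processed volume, mirroring the progress_accum/per_vol pair passed to PagedJSON.
public class PagedJSONSketch implements FlatMapFunction<String, String> {
    private final DoubleAccumulator _progress_accum;
    private final double _per_vol;

    public PagedJSONSketch(DoubleAccumulator progress_accum, double per_vol) {
        _progress_accum = progress_accum;
        _per_vol = per_vol;
    }

    @Override
    public Iterator<String> call(String json_filename) {
        List<String> page_ids = new ArrayList<>();
        // ... open json_filename, extract the per-page records, POST them to Solr,
        //     and collect the resulting page ids into page_ids (not shown) ...

        // Each volume accounts for per_vol = 100/num_volumes percent of the job,
        // so the driver can read progress_accum.value() to report overall progress.
        _progress_accum.add(_per_vol);

        return page_ids.iterator();
    }
}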
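The changeset message also says the output of the POST is read in and its status checked; that code is not in the diff shown here, so the following is only a sketch of the general pattern, assuming java.net.HttpURLConnection and a Solr-style update URL. The class, method, and variable names are illustrative, not the project's actual code.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class SolrPostSketch {
    // POST a JSON document and read the response back, checking its HTTP status
    // instead of discarding it (sketch only; names are assumptions).
    public static String postAndCheck(String solr_url, String json_doc) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(solr_url).openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json");

        try (OutputStream os = conn.getOutputStream()) {
            os.write(json_doc.getBytes(StandardCharsets.UTF_8));
        }

        // Read the output of the POST back in ...
        int status = conn.getResponseCode();
        InputStream in = (status < HttpURLConnection.HTTP_BAD_REQUEST)
                ? conn.getInputStream() : conn.getErrorStream();
        StringBuilder response = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                response.append(line);
            }
        }

        // ... and check the status so a failed ingest is not silently ignored.
        if (status != HttpURLConnection.HTTP_OK) {
            System.err.println("Solr POST failed (HTTP " + status + "): " + response);
        }
        return response.toString();
    }
}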