Show
Ignore:
Timestamp:
31.10.2016 15:40:36 (3 years ago)
Author:
davidb
Message:

Further RDD flatMap/map restructuring and refactoring, for per-page

Location:
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures
Files:
2 added
1 removed
2 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

    r31007 r31011  
    1 1 package org.hathitrust.extractedfeatures; 
    2 2 
    3 import java.io.BufferedReader; 
    4 import java.io.BufferedWriter; 
    5 import java.io.IOException; 
    6 import java.io.InputStreamReader; 
    7 import java.io.OutputStream; 
    8 import java.net.HttpURLConnection; 
    9 import java.net.URL; 
    10 import java.util.ArrayList; 
    11 import java.util.Set; 
    12  
    13 import org.apache.commons.compress.compressors.CompressorException; 
    14 import org.apache.spark.api.java.function.FlatMapFunction; 
    15 3 import org.apache.spark.api.java.function.VoidFunction; 
    16 4 import org.apache.spark.util.DoubleAccumulator; 
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31010 r31011  
    119 119        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent"); 
    120 120         
    121         PerPageJSONFlatmap paged_json = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol); 
    122         JavaRDD<JSONObject> json_per_page_ids = json_list_data.flatMap(paged_json).cache(); 
    123          
    124         PerPageJSONForeach paged_json_foreach = new PerPageJSONForeach(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol); 
    125         json_per_page_ids.foreach(paged_json_foreach); 
     121        PerPageJSONFlatmap paged_solr_json_flatmap = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol); 
     122        JavaRDD<JSONObject> per_page_jsonobjects = json_list_data.flatMap(paged_solr_json_flatmap).cache(); 
     123         
     124        PerPageJSONMap paged_json_id_map = new PerPageJSONMap(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol); 
     125        JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map); 
    126 126 
    127 127 /* 
     
    133 133 */ 
    134 134         
    135         long num_page_ids = json_per_page_ids.count(); 
     135        long num_page_ids = per_page_ids.count(); // trigger lazy eval of: flatmap:per-vol -> map:per-page 
    136 136         
    137 137        System.out.println("");