Changeset 31011


Ignore:
Timestamp:
2016-10-31T15:40:36+13:00 (5 years ago)
Author:
davidb
Message:

Further RDD flatMap/map restructuring and refactoring, for per-page

Location:
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures
Files:
2 added
1 deleted
2 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

    r31007 r31011  
    1 1 package org.hathitrust.extractedfeatures;
    2 2
    3 import java.io.BufferedReader;
    4 import java.io.BufferedWriter;
    5 import java.io.IOException;
    6 import java.io.InputStreamReader;
    7 import java.io.OutputStream;
    8 import java.net.HttpURLConnection;
    9 import java.net.URL;
    10 import java.util.ArrayList;
    11 import java.util.Set;
    12 
    13 import org.apache.commons.compress.compressors.CompressorException;
    14 import org.apache.spark.api.java.function.FlatMapFunction;
    15 3 import org.apache.spark.api.java.function.VoidFunction;
    16 4 import org.apache.spark.util.DoubleAccumulator;
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31010 r31011  
    119 119         DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
    120 120        
    121         PerPageJSONFlatmap paged_json = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
    122         JavaRDD<JSONObject> json_per_page_ids = json_list_data.flatMap(paged_json).cache();
    123        
    124         PerPageJSONForeach paged_json_foreach = new PerPageJSONForeach(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
    125         json_per_page_ids.foreach(paged_json_foreach);
     121        PerPageJSONFlatmap paged_solr_json_flatmap = new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
     122        JavaRDD<JSONObject> per_page_jsonobjects = json_list_data.flatMap(paged_solr_json_flatmap).cache();
     123       
     124        PerPageJSONMap paged_json_id_map = new PerPageJSONMap(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
     125        JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map);
    126 126
    127 127 /*
    … (unchanged lines 128-132 not shown)
    133 133 */
    134 134        
    135         long num_page_ids = json_per_page_ids.count();
     135        long num_page_ids = per_page_ids.count(); // trigger lazy eval of: flatmap:per-vol -> map:per-page
    136 136        
    137 137         System.out.println("");
Note: See TracChangeset for help on using the changeset viewer.