Changeset 31002 for other-projects

Show
Ignore:
Timestamp:
31.10.2016 00:07:39 (3 years ago)
Author:
davidb
Message:

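Separate the flatMap and foreach calls in PagedJSON: PagedJSON now implements only FlatMapFunction, the foreach step is handled by a separate PagedJSONForeach object, and the fields shared with PerVolumeJSON move into the BasePerJSON superclass.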

Location:
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures
Files:
2 added
3 modified

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PagedJSON.java

    r31001 r31002  
    11package org.hathitrust.extractedfeatures; 
    22 
    3 import java.io.BufferedReader; 
    4 import java.io.BufferedWriter; 
    5 import java.io.IOException; 
    6 import java.io.InputStreamReader; 
    7 import java.io.OutputStream; 
    8 import java.net.HttpURLConnection; 
    9 import java.net.URL; 
    103import java.util.ArrayList; 
    114import java.util.Iterator; 
    12 import java.util.Set; 
    13  
    14 import org.apache.commons.compress.compressors.CompressorException; 
    155import org.apache.spark.api.java.function.FlatMapFunction; 
    166import org.apache.spark.api.java.function.VoidFunction; 
     
    2919 
    3020 
    31 class PagedJSON implements FlatMapFunction<String, JSONObject>, VoidFunction<JSONObject>  
     21class PagedJSON extends BasePerJSON implements FlatMapFunction<String, JSONObject> 
    3222//public class PagedJSON implements VoidFunction<String>  
    3323{ 
    3424    private static final long serialVersionUID = 1L; 
    35  
    36     protected String _input_dir; 
    37     protected String _solr_url; 
    38     protected String _output_dir; 
    39     protected int    _verbosity; 
    40      
    41     DoubleAccumulator _progress_accum; 
    42     double            _progress_step; 
    4325     
    4426    public PagedJSON(String input_dir, String solr_url, String output_dir, int verbosity,  
    4527                     DoubleAccumulator progress_accum, double progress_step) 
    4628    { 
    47         _input_dir  = input_dir; 
    48         _solr_url   = solr_url; 
    49         _output_dir = output_dir; 
    50         _verbosity  = verbosity; 
    51          
    52         _progress_accum = progress_accum; 
    53         _progress_step  = progress_step; 
     29        super(input_dir,solr_url,output_dir,verbosity,progress_accum,progress_step); 
    5430    } 
    55      
    5631     
    5732     
     
    125100    } 
    126101     
    127     public void call(JSONObject solr_add_doc_json)  
    128     {  
    129         String output_json_bz2 = solr_add_doc_json.getString("filename_json_bz2"); 
    130         solr_add_doc_json.remove("filename_json_bz2"); 
    131          
    132         boolean random_test = (Math.random()>0.999); // every 1000 
    133          
    134         if ((_verbosity >=2) && (random_test)) { 
    135             System.out.println("=================="); 
    136             System.out.println("Sample output Solr add JSON [random test 1/1000]: " + solr_add_doc_json.toString()); 
    137             System.out.println("=================="); 
    138         } 
    139          
    140                      
    141         if (_solr_url != null) { 
    142             if ((_verbosity >=2) && (random_test)) { 
    143                 System.out.println("=================="); 
    144                 System.out.println("Posting to: " + _solr_url); 
    145                 System.out.println("=================="); 
    146             } 
    147             JSONSolrTransform.postSolrDoc(_solr_url, solr_add_doc_json); 
    148         } 
    149  
    150         if (_output_dir != null) { 
    151             if ((_verbosity >=2) && (random_test)) { 
    152                 System.out.println("=================="); 
    153                 System.out.println("Saving to: " + _output_dir); 
    154                 System.out.println("=================="); 
    155             } 
    156             JSONSolrTransform.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2); 
    157         } 
    158     } 
    159102     
    160103} 
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

    r31001 r31002  
    2828 
    2929 
    30 public class PerVolumeJSON implements VoidFunction<String>  
     30public class PerVolumeJSON extends BasePerJSON implements VoidFunction<String>  
    3131{ 
    3232    private static final long serialVersionUID = 1L; 
    33  
    34     protected String _input_dir; 
    35     protected String _solr_url; 
    36     protected String _output_dir; 
    37     protected int    _verbosity; 
    38      
    39     DoubleAccumulator _progress_accum; 
    40     double            _progress_step; 
    4133     
    4234    public PerVolumeJSON(String input_dir, String solr_url, String output_dir, int verbosity,  
    4335                         DoubleAccumulator progress_accum, double progress_step) 
    4436    { 
    45         _input_dir  = input_dir; 
    46         _solr_url   = solr_url; 
    47         _output_dir = output_dir; 
    48         _verbosity  = verbosity; 
    49          
    50         _progress_accum = progress_accum; 
    51         _progress_step  = progress_step; 
     37        super(input_dir,solr_url,output_dir,verbosity,progress_accum,progress_step); 
    5238    } 
    5339         
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31001 r31002  
    100100        JavaRDD<JSONObject> json_per_page_ids = json_list_data.flatMap(paged_json).cache(); 
    101101         
    102         json_per_page_ids.foreach(paged_json); 
     102        PagedJSONForeach paged_json_foreach = new PagedJSONForeach(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol); 
     103        json_per_page_ids.foreach(paged_json_foreach); 
    103104 
    104105/*