Changeset 31002


Ignore:
Timestamp:
2016-10-31T00:07:39+13:00 (5 years ago)
Author:
davidb
Message:

Need to separate flatMap and foreach calls in PagedJSON

Location:
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures
Files:
2 added
3 edited

Legend:

Unmodified
Added
Removed
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PagedJSON.java

    r31001 r31002  
    1 1 package org.hathitrust.extractedfeatures;
    2 2
    3 import java.io.BufferedReader;
    4 import java.io.BufferedWriter;
    5 import java.io.IOException;
    6 import java.io.InputStreamReader;
    7 import java.io.OutputStream;
    8 import java.net.HttpURLConnection;
    9 import java.net.URL;
    10 3 import java.util.ArrayList;
    11 4 import java.util.Iterator;
    12 import java.util.Set;
    13 
    14 import org.apache.commons.compress.compressors.CompressorException;
    15 5 import org.apache.spark.api.java.function.FlatMapFunction;
    16 6 import org.apache.spark.api.java.function.VoidFunction;
     
    29 19
    30 20
    31 class PagedJSON implements FlatMapFunction<String, JSONObject>, VoidFunction<JSONObject>
     21 class PagedJSON extends BasePerJSON implements FlatMapFunction<String, JSONObject>
    32 22 //public class PagedJSON implements VoidFunction<String>
    33 23 {
    34 24     private static final long serialVersionUID = 1L;
    35 
    36     protected String _input_dir;
    37     protected String _solr_url;
    38     protected String _output_dir;
    39     protected int    _verbosity;
    40    
    41     DoubleAccumulator _progress_accum;
    42     double            _progress_step;
    43 25    
    44 26     public PagedJSON(String input_dir, String solr_url, String output_dir, int verbosity,
    45 27                      DoubleAccumulator progress_accum, double progress_step)
    46 28     {
    47         _input_dir  = input_dir;
    48         _solr_url   = solr_url;
    49         _output_dir = output_dir;
    50         _verbosity  = verbosity;
    51        
    52         _progress_accum = progress_accum;
    53         _progress_step  = progress_step;
     29        super(input_dir,solr_url,output_dir,verbosity,progress_accum,progress_step);
    54 30     }
    55    
    56 31    
    57 32    
     
    125 100     }
    126 101    
    127     public void call(JSONObject solr_add_doc_json)
    128     {
    129         String output_json_bz2 = solr_add_doc_json.getString("filename_json_bz2");
    130         solr_add_doc_json.remove("filename_json_bz2");
    131        
    132         boolean random_test = (Math.random()>0.999); // every 1000
    133        
    134         if ((_verbosity >=2) && (random_test)) {
    135             System.out.println("==================");
    136             System.out.println("Sample output Solr add JSON [random test 1/1000]: " + solr_add_doc_json.toString());
    137             System.out.println("==================");
    138         }
    139        
    140                    
    141         if (_solr_url != null) {
    142             if ((_verbosity >=2) && (random_test)) {
    143                 System.out.println("==================");
    144                 System.out.println("Posting to: " + _solr_url);
    145                 System.out.println("==================");
    146             }
    147             JSONSolrTransform.postSolrDoc(_solr_url, solr_add_doc_json);
    148         }
    149 
    150         if (_output_dir != null) {
    151             if ((_verbosity >=2) && (random_test)) {
    152                 System.out.println("==================");
    153                 System.out.println("Saving to: " + _output_dir);
    154                 System.out.println("==================");
    155             }
    156             JSONSolrTransform.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);
    157         }
    158     }
    159 102    
    160 103 }
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

    r31001 r31002  
    28 28
    29 29
    30 public class PerVolumeJSON implements VoidFunction<String>
     30 public class PerVolumeJSON extends BasePerJSON implements VoidFunction<String>
    31 31 {
    32 32     private static final long serialVersionUID = 1L;
    33 
    34     protected String _input_dir;
    35     protected String _solr_url;
    36     protected String _output_dir;
    37     protected int    _verbosity;
    38    
    39     DoubleAccumulator _progress_accum;
    40     double            _progress_step;
    41 33    
    42 34     public PerVolumeJSON(String input_dir, String solr_url, String output_dir, int verbosity,
    43 35                          DoubleAccumulator progress_accum, double progress_step)
    44 36     {
    45         _input_dir  = input_dir;
    46         _solr_url   = solr_url;
    47         _output_dir = output_dir;
    48         _verbosity  = verbosity;
    49        
    50         _progress_accum = progress_accum;
    51         _progress_step  = progress_step;
     37        super(input_dir,solr_url,output_dir,verbosity,progress_accum,progress_step);
    52 38     }
    53 39        
  • other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java

    r31001 r31002  
    100 100         JavaRDD<JSONObject> json_per_page_ids = json_list_data.flatMap(paged_json).cache();
    101 101        
    102         json_per_page_ids.foreach(paged_json);
     102        PagedJSONForeach paged_json_foreach = new PagedJSONForeach(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
     103        json_per_page_ids.foreach(paged_json_foreach);
    103 104
    104 105 /*
Note: See TracChangeset for help on using the changeset viewer.