Changeset 31002
- Timestamp:
- 2016-10-31T00:07:39+13:00 (7 years ago)
- Location:
- other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures
- Files:
- 2 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PagedJSON.java
r31001 r31002 1 1 package org.hathitrust.extractedfeatures; 2 2 3 import java.io.BufferedReader;4 import java.io.BufferedWriter;5 import java.io.IOException;6 import java.io.InputStreamReader;7 import java.io.OutputStream;8 import java.net.HttpURLConnection;9 import java.net.URL;10 3 import java.util.ArrayList; 11 4 import java.util.Iterator; 12 import java.util.Set;13 14 import org.apache.commons.compress.compressors.CompressorException;15 5 import org.apache.spark.api.java.function.FlatMapFunction; 16 6 import org.apache.spark.api.java.function.VoidFunction; … … 29 19 30 20 31 class PagedJSON implements FlatMapFunction<String, JSONObject>, VoidFunction<JSONObject>21 class PagedJSON extends BasePerJSON implements FlatMapFunction<String, JSONObject> 32 22 //public class PagedJSON implements VoidFunction<String> 33 23 { 34 24 private static final long serialVersionUID = 1L; 35 36 protected String _input_dir;37 protected String _solr_url;38 protected String _output_dir;39 protected int _verbosity;40 41 DoubleAccumulator _progress_accum;42 double _progress_step;43 25 44 26 public PagedJSON(String input_dir, String solr_url, String output_dir, int verbosity, 45 27 DoubleAccumulator progress_accum, double progress_step) 46 28 { 47 _input_dir = input_dir; 48 _solr_url = solr_url; 49 _output_dir = output_dir; 50 _verbosity = verbosity; 51 52 _progress_accum = progress_accum; 53 _progress_step = progress_step; 29 super(input_dir,solr_url,output_dir,verbosity,progress_accum,progress_step); 54 30 } 55 56 31 57 32 … … 125 100 } 126 101 127 public void call(JSONObject solr_add_doc_json)128 {129 String output_json_bz2 = solr_add_doc_json.getString("filename_json_bz2");130 solr_add_doc_json.remove("filename_json_bz2");131 132 boolean random_test = (Math.random()>0.999); // every 1000133 134 if ((_verbosity >=2) && (random_test)) {135 System.out.println("==================");136 System.out.println("Sample output Solr add JSON [random test 1/1000]: " + 
solr_add_doc_json.toString());137 System.out.println("==================");138 }139 140 141 if (_solr_url != null) {142 if ((_verbosity >=2) && (random_test)) {143 System.out.println("==================");144 System.out.println("Posting to: " + _solr_url);145 System.out.println("==================");146 }147 JSONSolrTransform.postSolrDoc(_solr_url, solr_add_doc_json);148 }149 150 if (_output_dir != null) {151 if ((_verbosity >=2) && (random_test)) {152 System.out.println("==================");153 System.out.println("Saving to: " + _output_dir);154 System.out.println("==================");155 }156 JSONSolrTransform.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);157 }158 }159 102 160 103 } -
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java
r31001 r31002 28 28 29 29 30 public class PerVolumeJSON implements VoidFunction<String>30 public class PerVolumeJSON extends BasePerJSON implements VoidFunction<String> 31 31 { 32 32 private static final long serialVersionUID = 1L; 33 34 protected String _input_dir;35 protected String _solr_url;36 protected String _output_dir;37 protected int _verbosity;38 39 DoubleAccumulator _progress_accum;40 double _progress_step;41 33 42 34 public PerVolumeJSON(String input_dir, String solr_url, String output_dir, int verbosity, 43 35 DoubleAccumulator progress_accum, double progress_step) 44 36 { 45 _input_dir = input_dir; 46 _solr_url = solr_url; 47 _output_dir = output_dir; 48 _verbosity = verbosity; 49 50 _progress_accum = progress_accum; 51 _progress_step = progress_step; 37 super(input_dir,solr_url,output_dir,verbosity,progress_accum,progress_step); 52 38 } 53 39 -
other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java
r31001 r31002 100 100 JavaRDD<JSONObject> json_per_page_ids = json_list_data.flatMap(paged_json).cache(); 101 101 102 json_per_page_ids.foreach(paged_json); 102 PagedJSONForeach paged_json_foreach = new PagedJSONForeach(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol); 103 json_per_page_ids.foreach(paged_json_foreach); 103 104 104 105 /*
Note: See TracChangeset for help on using the changeset viewer.