source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSONList.java@ 32106

Last change on this file since 32106 was 32106, checked in by davidb, 6 years ago

Rekindle ability to process a json-filelist.txt using Spark

  • Property svn:executable set to *
File size: 1.4 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import java.util.ArrayList;
5import java.util.HashMap;
6import java.util.Iterator;
7
8import org.apache.hadoop.io.Text;
9import org.apache.spark.api.java.function.FlatMapFunction;
10import org.apache.spark.api.java.function.Function;
11import org.apache.spark.api.java.function.VoidFunction;
12import org.apache.spark.util.DoubleAccumulator;
13import org.json.JSONArray;
14import org.json.JSONObject;
15
16public class PerVolumeJSONList implements Function<String,Integer>
17{
18 private static final long serialVersionUID = 1L;
19 protected PerVolumeUtil _per_vol_util;
20
21 public PerVolumeJSONList(String input_dir, String whitelist_filename, String langmap_directory,
22 ArrayList<String> solr_endpoints, String output_dir, int verbosity,
23 boolean icu_tokenize, boolean strict_file_io)
24 {
25 _per_vol_util = new PerVolumeUtil(input_dir, whitelist_filename, langmap_directory,
26 solr_endpoints, output_dir, verbosity,
27 icu_tokenize, strict_file_io);
28
29 }
30
31 public Integer call(String json_file_in) throws IOException
32 {
33 // Read in JSON file as Text
34 String full_json_file_in = _per_vol_util.getInputDir() + "/" + json_file_in;
35 String json_content_string = ClusterFileIO.readTextFile(full_json_file_in);
36
37 Text json_content_text = new Text(json_content_string);
38
39 return _per_vol_util.call(json_content_text);
40 }
41}
42
Note: See TracBrowser for help on using the repository browser.