package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.Function;

/**
 * Spark map function that processes one HathiTrust Extracted Features
 * per-volume JSON file, given its filename relative to the input directory.
 * The per-volume processing itself is delegated to PerVolumeUtil.
 */
public class PerVolumeJSONList implements Function<String, Integer>
{
    private static final long serialVersionUID = 1L;

    protected PerVolumeUtil _per_vol_util;

    public PerVolumeJSONList(String input_dir, String whitelist_filename, String langmap_directory,
                             ArrayList<String> solr_endpoints, String output_dir, int verbosity,
                             boolean icu_tokenize, boolean strict_file_io)
    {
        _per_vol_util = new PerVolumeUtil(input_dir, whitelist_filename, langmap_directory,
                                          solr_endpoints, output_dir, verbosity,
                                          icu_tokenize, strict_file_io);
    }

    @Override
    public Integer call(String json_file_in) throws IOException
    {
        // Resolve the filename against the input directory and read the
        // JSON file contents in as a Hadoop Text value
        String full_json_file_in = _per_vol_util.getInputDir() + "/" + json_file_in;
        String json_content_string = ClusterFileIO.readTextFile(full_json_file_in);

        Text json_content_text = new Text(json_content_string);

        // Hand the volume's JSON content over to the shared per-volume processor
        return _per_vol_util.call(json_content_text);
    }
}
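// Usage sketch (illustrative, not part of the original source): in a Spark
// driver, an instance of this class would typically be mapped over an RDD of
// per-volume JSON filenames. The names 'jsc' (a JavaSparkContext) and
// 'json_file_list' (a java.util.List<String> of filenames), along with the
// constructor arguments, are hypothetical placeholders.
//
//   PerVolumeJSONList per_vol_json
//       = new PerVolumeJSONList(input_dir, whitelist_filename, langmap_directory,
//                               solr_endpoints, output_dir, verbosity,
//                               icu_tokenize, strict_file_io);
//
//   JavaRDD<String> json_files_rdd = jsc.parallelize(json_file_list);
//   JavaRDD<Integer> per_volume_results = json_files_rdd.map(per_vol_json);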