package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.util.DoubleAccumulator;
import org.json.JSONArray;
import org.json.JSONObject;

16 | public class PerVolumeJSONList implements Function<String,Integer>
|
---|
17 | {
|
---|
18 | private static final long serialVersionUID = 1L;
|
---|
19 | protected PerVolumeUtil _per_vol_util;
|
---|
20 |
|
---|
21 | public PerVolumeJSONList(String input_dir, String whitelist_filename, String langmap_directory,
|
---|
22 | ArrayList<String> solr_endpoints, String output_dir, int verbosity,
|
---|
23 | boolean icu_tokenize, boolean strict_file_io)
|
---|
24 | {
|
---|
25 | _per_vol_util = new PerVolumeUtil(input_dir, whitelist_filename, langmap_directory,
|
---|
26 | solr_endpoints, output_dir, verbosity,
|
---|
27 | icu_tokenize, strict_file_io);
|
---|
28 |
|
---|
29 | }
|
---|
30 |
|
---|
31 | public Integer call(String json_file_in) throws IOException
|
---|
32 | {
|
---|
33 | // Read in JSON file as Text
|
---|
34 | String full_json_file_in = _per_vol_util.getInputDir() + "/" + json_file_in;
|
---|
35 | String json_content_string = ClusterFileIO.readTextFile(full_json_file_in);
|
---|
36 |
|
---|
37 | Text json_content_text = new Text(json_content_string);
|
---|
38 |
|
---|
39 | return _per_vol_util.call(json_content_text);
|
---|
40 | }
|
---|
41 | }