package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.util.DoubleAccumulator;
import org.json.JSONArray;
import org.json.JSONObject;

/*
class PagedJSON implements Function<String, Boolean> {

	private static final long serialVersionUID = 1L;

	public Boolean call(String s) { return s.contains("a"); }
}
*/


//public class PerVolumeJSON implements VoidFunction<String>
27 | public class PerVolumeJSON implements Function<Text,Integer>
|
---|
28 | {
|
---|
29 | private static final long serialVersionUID = 1L;
|
---|
30 | protected PerVolumeUtil _per_vol_util;
|
---|
31 |
|
---|
32 | public PerVolumeJSON(String input_dir, String whitelist_filename, String langmap_directory,
|
---|
33 | ArrayList<String> solr_endpoints, String output_dir, int verbosity,
|
---|
34 | boolean icu_tokenize, boolean strict_file_io)
|
---|
35 | {
|
---|
36 |
|
---|
37 | // Had issues with class not found in Spark when set up with inheritance
|
---|
38 | _per_vol_util = new PerVolumeUtil(input_dir, whitelist_filename, langmap_directory,
|
---|
39 | solr_endpoints, output_dir, verbosity,
|
---|
40 | icu_tokenize, strict_file_io);
|
---|
41 |
|
---|
42 | }
|
---|
43 |
|
---|
44 |
|
---|
45 | public Integer call(Text json_text) throws IOException
|
---|
46 |
|
---|
47 | {
|
---|
48 | return _per_vol_util.call(json_text);
|
---|
49 | }
|
---|
50 | }
|
---|
51 |
|
---|