source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java

Last change on this file was 32106, checked in by davidb, 6 years ago

Rekindle ability to process a json-filelist.txt using Spark

  • Property svn:executable set to *
File size: 1.5 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import java.util.ArrayList;
5import java.util.HashMap;
6import java.util.Iterator;
7
8import org.apache.hadoop.io.Text;
9import org.apache.spark.api.java.function.FlatMapFunction;
10import org.apache.spark.api.java.function.Function;
11import org.apache.spark.api.java.function.VoidFunction;
12import org.apache.spark.util.DoubleAccumulator;
13import org.json.JSONArray;
14import org.json.JSONObject;
15
16/*
17class PagedJSON implements Function<String, Boolean> {
18
19 private static final long serialVersionUID = 1L;
20
21 public Boolean call(String s) { return s.contains("a"); }
22}
23 */
24
25
26//public class PerVolumeJSON implements VoidFunction<String>
27public class PerVolumeJSON implements Function<Text,Integer>
28{
29 private static final long serialVersionUID = 1L;
30 protected PerVolumeUtil _per_vol_util;
31
32 public PerVolumeJSON(String input_dir, String whitelist_filename, String langmap_directory,
33 ArrayList<String> solr_endpoints, String output_dir, int verbosity,
34 boolean icu_tokenize, boolean strict_file_io)
35 {
36
37 // Had issues with class not found in Spark when set up with inheritance
38 _per_vol_util = new PerVolumeUtil(input_dir, whitelist_filename, langmap_directory,
39 solr_endpoints, output_dir, verbosity,
40 icu_tokenize, strict_file_io);
41
42 }
43
44
45 public Integer call(Text json_text) throws IOException
46
47 {
48 return _per_vol_util.call(json_text);
49 }
50}
51
Note: See TracBrowser for help on using the repository browser.