package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.Function;

/**
 * Spark map function that processes one HathiTrust Extracted Features
 * per-volume JSON file, given its filename relative to the input directory.
 * The per-volume processing itself is delegated to PerVolumeUtil.
 */
public class PerVolumeJSONList implements Function<String, Integer>
{
    private static final long serialVersionUID = 1L;

    protected PerVolumeUtil _per_vol_util;

    public PerVolumeJSONList(String input_dir, String whitelist_filename, String langmap_directory,
                             ArrayList<String> solr_endpoints, String output_dir, int verbosity,
                             boolean icu_tokenize, boolean strict_file_io)
    {
        _per_vol_util = new PerVolumeUtil(input_dir, whitelist_filename, langmap_directory,
                                          solr_endpoints, output_dir, verbosity,
                                          icu_tokenize, strict_file_io);
    }

    @Override
    public Integer call(String json_file_in) throws IOException
    {
        // Resolve the filename against the input directory and read the
        // JSON file contents in as a Hadoop Text value
        String full_json_file_in = _per_vol_util.getInputDir() + "/" + json_file_in;
        String json_content_string = ClusterFileIO.readTextFile(full_json_file_in);

        Text json_content_text = new Text(json_content_string);

        // Hand the volume's JSON content over to the shared per-volume processor
        return _per_vol_util.call(json_content_text);
    }
}
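// Usage sketch (illustrative, not part of the original source): in a Spark
// driver, an instance of this class would typically be mapped over an RDD of
// per-volume JSON filenames. The names 'jsc' (a JavaSparkContext) and
// 'json_file_list' (a java.util.List<String> of filenames), along with the
// constructor arguments, are hypothetical placeholders.
//
//   PerVolumeJSONList per_vol_json
//       = new PerVolumeJSONList(input_dir, whitelist_filename, langmap_directory,
//                               solr_endpoints, output_dir, verbosity,
//                               icu_tokenize, strict_file_io);
//
//   JavaRDD<String> json_files_rdd = jsc.parallelize(json_file_list);
//   JavaRDD<Integer> per_volume_results = json_files_rdd.map(per_vol_json);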