Context Navigation

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerPageJSONFlatmap.java@ 31045

Last change on this file since 31045 was 31045, checked in by davidb, 7 years ago
More careful treatment of what to do when a JSON file isn't there
Property svn:executable set to *
File size: 4.2 KB

Rev	Line
[31007]	1	package org.hathitrust.extractedfeatures;
	2
[31045]	3	import java.io.IOException;
[31007]	4	import java.util.ArrayList;
	5	import java.util.Iterator;
	6	import org.apache.spark.api.java.function.FlatMapFunction;
	7	import org.apache.spark.util.DoubleAccumulator;
	8	import org.json.JSONArray;
	9	import org.json.JSONObject;
	10
	11	/*
	12	class PagedJSON implements Function<String, Boolean> {
	13
	14	private static final long serialVersionUID = 1L;
	15
	16	public Boolean call(String s) { return s.contains("a"); }
	17	}
	18	*/
	19
	20
	21	class PerPageJSONFlatmap implements FlatMapFunction<String, JSONObject>
	22	//public class PagedJSON implements VoidFunction<String>
	23	{
	24	private static final long serialVersionUID = 1L;
	25
	26	protected String _input_dir;
	27	protected String _solr_url;
	28	protected String _output_dir;
	29	protected int _verbosity;
	30
	31	protected DoubleAccumulator _progress_accum;
	32	protected double _progress_step;
	33
[31045]	34	boolean _strict_file_io;
	35
[31007]	36	public PerPageJSONFlatmap(String input_dir, String solr_url, String output_dir, int verbosity,
[31045]	37	DoubleAccumulator progress_accum, double progress_step,
	38	boolean strict_file_io)
[31007]	39	{
	40	_input_dir = input_dir;
	41	_solr_url = solr_url;
	42	_output_dir = output_dir;
	43	_verbosity = verbosity;
	44
	45	_progress_accum = progress_accum;
	46	_progress_step = progress_step;
[31045]	47
	48	_strict_file_io = strict_file_io;
[31007]	49	}
	50
[31045]	51	public Iterator<JSONObject> call(String json_file_in) throws IOException
[31007]	52	//public void call(String json_file_in)
	53	{
[31045]	54	String full_json_file_in = _input_dir + "/" + json_file_in;
	55	JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
[31007]	56
[31045]	57	ArrayList<JSONObject> json_pages = new ArrayList<JSONObject>();
[31007]	58
[31045]	59	if (extracted_feature_record != null) {
	60	String volume_id = extracted_feature_record.getString("id");
	61
	62	//JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
	63	//String title= ef_metadata.getString("title");
	64
	65	JSONObject ef_features = extracted_feature_record.getJSONObject("features");
	66
	67
	68	int ef_page_count = ef_features.getInt("pageCount");
	69
	70	if (_verbosity >= 1) {
	71	System.out.println("Processing: " + json_file_in);
	72	System.out.println(" pageCount = " + ef_page_count);
[31030]	73	}
[31045]	74
	75	JSONArray ef_pages = ef_features.getJSONArray("pages");
	76	int ef_num_pages = ef_pages.length();
	77	if (ef_num_pages != ef_page_count) {
	78	System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
	79	+" does not match 'pageCount' metadata (" + ef_page_count + ")");
	80	}
	81
	82	// Make directory for page-level JSON output
	83	String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
	84	String page_json_dir = json_dir + "/pages";
	85	ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
	86
[31007]	87	if (_verbosity >= 2) {
[31045]	88	System.out.print(" Pages: ");
[31007]	89	}
[31045]	90
	91	for (int i = 0; i < ef_page_count; i++) {
	92	String formatted_i = String.format("page-%06d", i);
	93	String page_id = volume_id + "." + formatted_i;
	94
[31030]	95	if (_verbosity >= 2) {
[31045]	96	if (i>0) {
	97	System.out.print(", ");
	98	}
	99	System.out.print(page_id);
[31030]	100	}
[31045]	101
	102	String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
[31007]	103
[31045]	104	if (i==(ef_page_count-1)) {
	105	if (_verbosity >= 2) {
	106	System.out.println();
	107	}
	108	System.out.println("Sample output JSON page file: " + output_json_bz2);
	109	}
	110
	111	JSONObject ef_page = ef_pages.getJSONObject(i);
	112
	113	if (ef_page != null) {
	114	// Convert to Solr add form
	115	JSONObject solr_add_doc_json = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page);
	116	solr_add_doc_json.put("filename_json_bz2", output_json_bz2);
	117
	118	json_pages.add(solr_add_doc_json);
	119
	120
	121	}
	122	else {
	123	System.err.println("Skipping: " + page_id);
	124	}
	125
[31007]	126	}
[31045]	127	}
	128	else {
	129	// File did not exist, or could not be parsed
	130	String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
	131	if (_strict_file_io) {
	132	throw new IOException(mess);
	133	}
[31007]	134	else {
[31045]	135	System.err.println("Warning: " + mess);
	136	System.out.println("Warning: " + mess);
[31007]	137	}
	138	}
[31045]	139
[31007]	140	_progress_accum.add(_progress_step);
	141
	142	return json_pages.iterator();
	143	}
	144
	145
	146	}
	147

Note: See TracBrowser for help on using the repository browser.

Download in other formats: