package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.concurrent.ThreadLocalRandom;

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.util.DoubleAccumulator;
import org.json.JSONArray;
import org.json.JSONObject;
| 16 | /*
|
---|
| 17 | class PagedJSON implements Function<String, Boolean> {
|
---|
| 18 |
|
---|
| 19 | private static final long serialVersionUID = 1L;
|
---|
| 20 |
|
---|
| 21 | public Boolean call(String s) { return s.contains("a"); }
|
---|
| 22 | }
|
---|
| 23 | */
|
---|
| 24 |
|
---|
| 25 |
|
---|
[31266] | 26 | //public class PerVolumeJSON implements VoidFunction<String>
|
---|
[31372] | 27 | public class PerVolumeJSON implements Function<Text,Integer>
|
---|
[31001] | 28 | {
|
---|
| 29 | private static final long serialVersionUID = 1L;
|
---|
[31005] | 30 | protected String _input_dir;
|
---|
[31225] | 31 | protected String _whitelist_filename;
|
---|
[31375] | 32 | protected String _langmap_directory;
|
---|
[31225] | 33 |
|
---|
[31451] | 34 | protected final ArrayList<String> _solr_endpoints;
|
---|
| 35 | protected final int _solr_endpoints_len;
|
---|
| 36 |
|
---|
| 37 | //protected String _solr_url;
|
---|
[31005] | 38 | protected String _output_dir;
|
---|
[31225] | 39 |
|
---|
[31005] | 40 | protected int _verbosity;
|
---|
[31001] | 41 |
|
---|
[31225] | 42 | protected WhitelistBloomFilter _whitelist_bloomfilter;
|
---|
[31375] | 43 | protected UniversalPOSLangMap _universal_langmap;
|
---|
[31372] | 44 |
|
---|
[31375] | 45 | boolean _icu_tokenize;
|
---|
| 46 | boolean _strict_file_io;
|
---|
| 47 |
|
---|
[31450] | 48 | public PerVolumeJSON(String input_dir, String whitelist_filename, String langmap_directory,
|
---|
[31451] | 49 | ArrayList<String> solr_endpoints, String output_dir, int verbosity,
|
---|
[31252] | 50 | boolean icu_tokenize, boolean strict_file_io)
|
---|
[31001] | 51 | {
|
---|
[31450] | 52 | System.out.println("*** PerVolumeJSON Constructor, langmap_directory = " + langmap_directory);
|
---|
| 53 |
|
---|
[31005] | 54 | _input_dir = input_dir;
|
---|
[31225] | 55 | _whitelist_filename = whitelist_filename;
|
---|
[31450] | 56 | _langmap_directory = langmap_directory;
|
---|
[31220] | 57 |
|
---|
[31451] | 58 | _solr_endpoints = solr_endpoints;
|
---|
| 59 | _solr_endpoints_len = solr_endpoints.size();
|
---|
| 60 |
|
---|
| 61 | //_solr_url = solr_url;
|
---|
[31005] | 62 | _output_dir = output_dir;
|
---|
| 63 | _verbosity = verbosity;
|
---|
| 64 |
|
---|
[31252] | 65 | _icu_tokenize = icu_tokenize;
|
---|
| 66 | _strict_file_io = strict_file_io;
|
---|
| 67 |
|
---|
[31225] | 68 | _whitelist_bloomfilter = null;
|
---|
[31375] | 69 | _universal_langmap = null;
|
---|
[31001] | 70 | }
|
---|
[31005] | 71 |
|
---|
[31372] | 72 |
|
---|
| 73 | public Integer call(Text json_text) throws IOException
|
---|
| 74 |
|
---|
| 75 | {
|
---|
[31500] | 76 | if (_whitelist_filename != null) {
|
---|
| 77 |
|
---|
| 78 | synchronized (_whitelist_filename) {
|
---|
| 79 | if (_whitelist_bloomfilter == null) {
|
---|
| 80 |
|
---|
| 81 | _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
|
---|
| 82 | }
|
---|
| 83 | }
|
---|
[31372] | 84 | }
|
---|
[31375] | 85 |
|
---|
[31500] | 86 | if (_langmap_directory != null) {
|
---|
| 87 |
|
---|
| 88 | synchronized (_langmap_directory) {
|
---|
| 89 | if (_universal_langmap == null) {
|
---|
| 90 | _universal_langmap = new UniversalPOSLangMap(_langmap_directory);
|
---|
| 91 | }
|
---|
| 92 | }
|
---|
[31375] | 93 | }
|
---|
[31500] | 94 |
|
---|
[31372] | 95 | int ef_num_pages = 0;
|
---|
| 96 |
|
---|
[31451] | 97 | String solr_url = null;
|
---|
| 98 | if (_solr_endpoints_len > 0) {
|
---|
| 99 | int random_choice = (int)(_solr_endpoints_len * Math.random());
|
---|
| 100 | solr_url = _solr_endpoints.get(random_choice);
|
---|
| 101 | }
|
---|
| 102 |
|
---|
[31372] | 103 | try {
|
---|
| 104 |
|
---|
| 105 |
|
---|
| 106 | JSONObject extracted_feature_record = new JSONObject(json_text.toString());
|
---|
| 107 |
|
---|
| 108 | if (extracted_feature_record != null) {
|
---|
| 109 | String volume_id = extracted_feature_record.getString("id");
|
---|
| 110 |
|
---|
[31505] | 111 | JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
|
---|
[31372] | 112 | //String title= ef_metadata.getString("title");
|
---|
| 113 |
|
---|
[31505] | 114 | //
|
---|
| 115 | // Top-level metadata Solr doc
|
---|
| 116 | //
|
---|
| 117 | JSONObject solr_add_metadata_doc_json = SolrDocJSON.generateToplevelMetadataSolrDocJSON(volume_id,ef_metadata);
|
---|
| 118 | if (solr_add_metadata_doc_json != null) {
|
---|
| 119 |
|
---|
| 120 | if ((_verbosity >=2)) {
|
---|
| 121 | System.out.println("==================");
|
---|
| 122 | System.out.println("Metadata JSON: " + solr_add_metadata_doc_json.toString());
|
---|
| 123 | System.out.println("==================");
|
---|
| 124 | }
|
---|
| 125 |
|
---|
| 126 | if (solr_url != null) {
|
---|
| 127 |
|
---|
| 128 | if ((_verbosity >=2) ) {
|
---|
| 129 | System.out.println("==================");
|
---|
| 130 | System.out.println("Posting to: " + solr_url);
|
---|
| 131 | System.out.println("==================");
|
---|
| 132 | }
|
---|
| 133 | SolrDocJSON.postSolrDoc(solr_url, solr_add_metadata_doc_json, volume_id, "top-level-metadata");
|
---|
| 134 | }
|
---|
| 135 | }
|
---|
[31675] | 136 |
|
---|
[31505] | 137 | //
|
---|
| 138 | // Now move on to POS extracted features per-page
|
---|
| 139 | //
|
---|
[31675] | 140 | boolean index_pages = true;
|
---|
| 141 | if (index_pages) {
|
---|
| 142 |
|
---|
| 143 | JSONObject ef_features = extracted_feature_record.getJSONObject("features");
|
---|
[31372] | 144 |
|
---|
[31675] | 145 | int ef_page_count = ef_features.getInt("pageCount");
|
---|
[31372] | 146 |
|
---|
[31675] | 147 | if (_verbosity >= 1) {
|
---|
| 148 | System.out.println("Processing: " + volume_id);
|
---|
| 149 | System.out.println(" pageCount = " + ef_page_count);
|
---|
| 150 | }
|
---|
[31372] | 151 |
|
---|
[31675] | 152 | JSONArray ef_pages = ef_features.getJSONArray("pages");
|
---|
| 153 | ef_num_pages = ef_pages.length();
|
---|
[31372] | 154 |
|
---|
| 155 |
|
---|
[31675] | 156 | for (int i = 0; i < ef_page_count; i++) {
|
---|
| 157 | String formatted_i = String.format("page-%06d", i);
|
---|
| 158 | String page_id = volume_id + "." + formatted_i;
|
---|
[31372] | 159 |
|
---|
[31675] | 160 | if (_verbosity >= 2) {
|
---|
| 161 | System.out.println(" Page: " + page_id);
|
---|
| 162 | }
|
---|
[31372] | 163 |
|
---|
| 164 |
|
---|
[31675] | 165 | JSONObject ef_page = ef_pages.getJSONObject(i);
|
---|
[31372] | 166 |
|
---|
[31675] | 167 | if (ef_page != null) {
|
---|
| 168 | // Convert to Solr add form
|
---|
| 169 | JSONObject solr_add_doc_json
|
---|
[31783] | 170 | = SolrDocJSON.generateSolrDocJSON(volume_id, page_id,
|
---|
| 171 | ef_metadata, ef_page,
|
---|
| 172 | _whitelist_bloomfilter, _universal_langmap, _icu_tokenize);
|
---|
[31372] | 173 |
|
---|
| 174 |
|
---|
[31675] | 175 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 176 | System.out.println("==================");
|
---|
| 177 | System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
|
---|
| 178 | System.out.println("==================");
|
---|
| 179 | }
|
---|
[31372] | 180 |
|
---|
| 181 |
|
---|
[31675] | 182 | if (solr_url != null) {
|
---|
| 183 | SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json,
|
---|
[31500] | 184 | volume_id, page_id);
|
---|
[31675] | 185 | }
|
---|
[31372] | 186 | }
|
---|
[31675] | 187 | else {
|
---|
| 188 | System.err.println("Skipping: " + page_id);
|
---|
| 189 | }
|
---|
| 190 |
|
---|
[31372] | 191 | }
|
---|
| 192 | }
|
---|
[31784] | 193 | else {
|
---|
| 194 | System.err.println("Skipping per-page POS text indexing");
|
---|
| 195 | }
|
---|
| 196 |
|
---|
[31372] | 197 | }
|
---|
| 198 | }
|
---|
| 199 | catch (Exception e) {
|
---|
| 200 | if (_strict_file_io) {
|
---|
| 201 | throw e;
|
---|
| 202 | }
|
---|
| 203 | else {
|
---|
| 204 | e.printStackTrace();
|
---|
| 205 | }
|
---|
| 206 | }
|
---|
| 207 |
|
---|
| 208 | return ef_num_pages;
|
---|
| 209 |
|
---|
| 210 | }
|
---|
| 211 |
|
---|
| 212 | /*
|
---|
[31266] | 213 | //public void call(String json_file_in) throws IOException
|
---|
[31372] | 214 | public Integer call(String json_file_in) throws IOException
|
---|
[31266] | 215 |
|
---|
[31001] | 216 | {
|
---|
[31226] | 217 | if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
|
---|
[31225] | 218 | _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
|
---|
| 219 | }
|
---|
| 220 |
|
---|
[31372] | 221 | int ef_num_pages = 0;
|
---|
| 222 |
|
---|
[31278] | 223 | ArrayList<String> ids = new ArrayList<String>(); // want it to be non-null so can return valid iterator
|
---|
[31266] | 224 |
|
---|
[31252] | 225 | String full_json_file_in = _input_dir + "/" + json_file_in;
|
---|
| 226 | JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
|
---|
[31001] | 227 |
|
---|
[31252] | 228 | if (extracted_feature_record != null) {
|
---|
| 229 | String volume_id = extracted_feature_record.getString("id");
|
---|
| 230 |
|
---|
| 231 | //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
|
---|
| 232 | //String title= ef_metadata.getString("title");
|
---|
| 233 |
|
---|
| 234 | JSONObject ef_features = extracted_feature_record.getJSONObject("features");
|
---|
| 235 |
|
---|
| 236 | int ef_page_count = ef_features.getInt("pageCount");
|
---|
| 237 |
|
---|
| 238 | if (_verbosity >= 1) {
|
---|
| 239 | System.out.println("Processing: " + json_file_in);
|
---|
| 240 | System.out.println(" pageCount = " + ef_page_count);
|
---|
[31001] | 241 | }
|
---|
[31252] | 242 |
|
---|
| 243 | JSONArray ef_pages = ef_features.getJSONArray("pages");
|
---|
[31372] | 244 | ef_num_pages = ef_pages.length();
|
---|
[31252] | 245 |
|
---|
| 246 | // Make directory for page-level JSON output
|
---|
| 247 | String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
|
---|
| 248 | String page_json_dir = json_dir + "/pages";
|
---|
| 249 |
|
---|
| 250 | if (_output_dir != null) {
|
---|
| 251 | ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
|
---|
[31001] | 252 | }
|
---|
[31278] | 253 |
|
---|
[31266] | 254 | ids = new ArrayList<String>(ef_num_pages);
|
---|
[31252] | 255 | for (int i = 0; i < ef_page_count; i++) {
|
---|
| 256 | String formatted_i = String.format("page-%06d", i);
|
---|
| 257 | String page_id = volume_id + "." + formatted_i;
|
---|
[31001] | 258 |
|
---|
[31252] | 259 | if (_verbosity >= 2) {
|
---|
| 260 | System.out.println(" Page: " + page_id);
|
---|
[31001] | 261 | }
|
---|
[31252] | 262 |
|
---|
| 263 | String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
|
---|
[31266] | 264 | ids.add(page_id);
|
---|
[31252] | 265 |
|
---|
[31269] | 266 | if (_verbosity >=2) {
|
---|
| 267 | if (i==0) {
|
---|
| 268 | System.out.println("Sample output JSON page file [i=0]: " + output_json_bz2);
|
---|
| 269 | }
|
---|
[31001] | 270 | }
|
---|
[31252] | 271 | JSONObject ef_page = ef_pages.getJSONObject(i);
|
---|
| 272 |
|
---|
| 273 | if (ef_page != null) {
|
---|
| 274 | // Convert to Solr add form
|
---|
| 275 | JSONObject solr_add_doc_json
|
---|
| 276 | = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);
|
---|
| 277 |
|
---|
| 278 |
|
---|
[31001] | 279 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 280 | System.out.println("==================");
|
---|
[31252] | 281 | System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
|
---|
[31001] | 282 | System.out.println("==================");
|
---|
| 283 | }
|
---|
[31252] | 284 |
|
---|
| 285 |
|
---|
| 286 | if (_solr_url != null) {
|
---|
| 287 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 288 | System.out.println("==================");
|
---|
| 289 | System.out.println("Posting to: " + _solr_url);
|
---|
| 290 | System.out.println("==================");
|
---|
| 291 | }
|
---|
| 292 | SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
|
---|
| 293 | }
|
---|
| 294 |
|
---|
| 295 | if (_output_dir != null) {
|
---|
| 296 | if ((_verbosity >=2) && (i==20)) {
|
---|
| 297 | System.out.println("==================");
|
---|
| 298 | System.out.println("Saving to: " + _output_dir);
|
---|
| 299 | System.out.println("==================");
|
---|
| 300 | }
|
---|
| 301 | SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);
|
---|
| 302 | }
|
---|
[31001] | 303 | }
|
---|
[31252] | 304 | else {
|
---|
| 305 | System.err.println("Skipping: " + page_id);
|
---|
| 306 | }
|
---|
| 307 |
|
---|
[31001] | 308 | }
|
---|
[31252] | 309 | }
|
---|
| 310 | else {
|
---|
| 311 | // File did not exist, or could not be parsed
|
---|
| 312 | String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
|
---|
| 313 | if (_strict_file_io) {
|
---|
| 314 | throw new IOException(mess);
|
---|
| 315 | }
|
---|
[31001] | 316 | else {
|
---|
[31252] | 317 | System.err.println("Warning: " + mess);
|
---|
| 318 | System.out.println("Warning: " + mess);
|
---|
[31001] | 319 | }
|
---|
| 320 | }
|
---|
| 321 |
|
---|
[31372] | 322 | return ef_num_pages;
|
---|
| 323 |
|
---|
[31001] | 324 | }
|
---|
[31372] | 325 | */
|
---|
[31001] | 326 | }
|
---|
| 327 |
|
---|