Context Navigation

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java@ 31026

Last change on this file since 31026 was 31026, checked in by davidb, 7 years ago
Corrected flag setting
Property svn:executable set to `*`
File size: 9.2 KB

Line
1	package org.hathitrust.extractedfeatures;
2
3	import java.io.BufferedInputStream;
4	import java.io.FileInputStream;
5	import java.io.FileNotFoundException;
6	import java.io.IOException;
7	import java.io.Serializable;
8	import org.apache.commons.cli.*;
9
10	import org.apache.spark.api.java.*;
11	import org.apache.spark.util.DoubleAccumulator;
12	import org.hathitrust.extractedfeatures.PerPageJSONFlatmap;
13	import org.json.JSONObject;
14	import org.apache.spark.SparkConf;
15
16	public class ProcessForSolrIngest implements Serializable
17	{
18	private static final long serialVersionUID = 1L;
19
20	// Following details on number of partitions to use given in
21	// "Parallelized collections" section of:
22	// https://spark.apache.org/docs/2.0.1/programming-guide.html
23	//
24	// For a more detailed discussion see:
25	// http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
26
27	public static final int NUM_CORES = 6;
28	public static final int NUM_PARTITIONS = 2*NUM_CORES; // default would appear to be 2
29
30	protected String _input_dir;
31	protected String _json_list_filename;
32	protected String _solr_url;
33	protected String _output_dir;
34
35	protected int _verbosity;
36
37	public ProcessForSolrIngest(String input_dir, String json_list_filename,
38	String solr_url, String output_dir, int verbosity)
39	{
40	_input_dir = input_dir;
41	_json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir;
42
43	_solr_url = solr_url;
44	_output_dir = output_dir;
45	_verbosity = verbosity;
46	}
47
48	protected String generateSparkAppName(String exec_mode)
49	{
50	String spark_app_name = "[" + exec_mode + "] Extract Features: Process for Solr Ingest";
51	spark_app_name += " [" + _json_list_filename + "]";
52
53	if (_solr_url != null) {
54	spark_app_name += " solr_url=" + _solr_url;
55	}
56
57	if (_output_dir != null) {
58	spark_app_name += " output_dir=" + _output_dir;
59	}
60
61	return spark_app_name;
62	}
63
64	public void execPerVolume()
65	{
66	String spark_app_name = generateSparkAppName("Per Volume");
67
68	SparkConf conf = new SparkConf().setAppName(spark_app_name);
69	JavaSparkContext jsc = new JavaSparkContext(conf);
70
71	if (_verbosity >= 2) {
72	System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
73	System.out.println("Default Parallelism: " + jsc.defaultParallelism());
74	}
75
76	JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
77
78	long num_volumes = json_list_data.count();
79	double per_vol = 100.0/(double)num_volumes;
80
81	DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
82
83	System.err.println();
84	System.err.println();
85	System.err.println();
86	System.err.println("****##### _input_dir = " + _input_dir);
87	System.err.println();
88	System.err.println();
89	System.err.println();
90
91	PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
92
93	json_list_data.foreach(per_vol_json);
94
95	long num_ids = num_volumes;
96
97	System.out.println("");
98	System.out.println("############");
99	System.out.println("# Number of volume ids: " + num_ids);
100	System.out.println("############");
101	System.out.println("");
102
103	jsc.close();
104	}
105
106	public void execPerPage()
107	{
108	String spark_app_name = generateSparkAppName("Per Page");
109
110	SparkConf conf = new SparkConf().setAppName(spark_app_name);
111	JavaSparkContext jsc = new JavaSparkContext(conf);
112
113	if (_verbosity >= 2) {
114	System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
115	System.out.println("Default Parallelism: " + jsc.defaultParallelism());
116	}
117
118	JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
119
120	long num_volumes = json_list_data.count();
121	double per_vol = 100.0/(double)num_volumes;
122
123	DoubleAccumulator per_vol_progress_accum = jsc.sc().doubleAccumulator("Per Volume Progress Percent");
124
125	PerPageJSONFlatmap paged_solr_json_flatmap
126	= new PerPageJSONFlatmap(_input_dir,_solr_url,_output_dir,_verbosity, per_vol_progress_accum,per_vol);
127	JavaRDD<JSONObject> per_page_jsonobjects = json_list_data.flatMap(paged_solr_json_flatmap).cache();
128
129	//long num_page_ids = per_page_jsonobjects.count(); // trigger lazy eval of: flatmap:per-vol
130
131	DoubleAccumulator per_page_progress_accum = jsc.sc().doubleAccumulator("Pages Processed");
132
133	PerPageJSONMap paged_json_id_map
134	= new PerPageJSONMap(_input_dir,_solr_url,_output_dir,_verbosity, per_page_progress_accum,1.0);
135	JavaRDD<String> per_page_ids = per_page_jsonobjects.map(paged_json_id_map);
136
137	/*
138	System.out.println("");
139	System.out.println("############");
140	System.out.println("# Progress Accumulator: " + progress_accum.value());
141	System.out.println("############");
142	System.out.println("");
143	*/
144
145	long num_page_ids = per_page_ids.count(); // trigger lazy eval of: flatmap:per-vol -> map:per-page
146
147	System.out.println("");
148	System.out.println("############");
149	System.out.println("# Number of page ids: " + num_page_ids);
150	System.out.println("############");
151	System.out.println("");
152
153	/*
154	if (_output_dir != null) {
155	String rdd_save_file = "rdd-solr-json-page-files";
156	json_ids.saveAsTextFile(rdd_save_file);
157	System.out.println("############");
158	System.out.println("# Saved RDD of Solr JSON page files, top-level, as:");
159	System.out.println("# " + rdd_save_file);
160	System.out.println("############");
161	System.out.println("");
162	}
163	*/
164
165	jsc.close();
166	}
167
168
169
170
171	public static void print_usage(HelpFormatter formatter, Options options)
172	{
173	formatter.printHelp("RUN.bash [options] input-dir json-filelist.txt", options);
174	}
175
176	public static void main(String[] args) {
177	Options options = new Options();
178
179	Option verbosity_opt = new Option("v", "verbosity", true,
180	"Set to control the level of debugging output [0=none, 1=some, 2=lots]");
181	verbosity_opt.setRequired(false);
182	options.addOption(verbosity_opt);
183
184	Option properties_opt = new Option("p", "properties", true,
185	"Read in the specified Java properties file");
186	properties_opt.setRequired(false);
187	options.addOption(properties_opt);
188
189	Option output_dir_opt = new Option("o", "output-dir", true,
190	"If specified, save BZipped Solr JSON files to this directory");
191	output_dir_opt.setRequired(false);
192	options.addOption(output_dir_opt);
193
194	Option solr_url_opt = new Option("u", "solr-url", true,
195	"If specified, the URL to post the Solr JSON data to");
196	solr_url_opt.setRequired(false);
197	options.addOption(solr_url_opt);
198
199	Option read_only_opt = new Option("r", "read-only", false,
200	"Used to initiate a run where the files are all read in, but nothing is ingested/saved");
201	read_only_opt.setRequired(false);
202	options.addOption(read_only_opt);
203
204	// Need to work with CLI v1.2 as this is the JAR that is bundled with Hadoop/Spark
205	CommandLineParser parser = new GnuParser();
206	//CommandLineParser parser = new DefaultParser(); // if working with CLI v1.3 and above
207
208	HelpFormatter formatter = new HelpFormatter();
209	CommandLine cmd = null;
210
211	try {
212	cmd = parser.parse(options, args);
213	}
214	catch (ParseException e) {
215	System.err.println(e.getMessage());
216	print_usage(formatter,options);
217	System.exit(1);
218	}
219
220
221	String verbosity_str = cmd.getOptionValue("verbosity","0");
222	int verbosity = Integer.parseInt(verbosity_str);
223
224	String property_filename = cmd.getOptionValue("properties",null);
225
226	String output_dir = cmd.getOptionValue("output-dir",null);
227	String solr_url = cmd.getOptionValue("solr-url",null);
228	boolean read_only = cmd.hasOption("read-only");
229
230	String[] filtered_args = cmd.getArgs();
231
232	if (filtered_args.length != 2) {
233	print_usage(formatter,options);
234	System.exit(1);
235	}
236
237	if (property_filename != null) {
238	try {
239	FileInputStream fis = new FileInputStream(property_filename);
240	BufferedInputStream bis = new BufferedInputStream(fis);
241
242	System.getProperties().load(bis);
243	}
244	catch (FileNotFoundException e) {
245	// TODO Auto-generated catch block
246	e.printStackTrace();
247	System.err.println("File not found: '" + property_filename + "'. Skipping property file read");
248	}
249	catch (IOException e) {
250	System.err.println("IO Exception for: '" + property_filename + "'. Malformed syntax? Skipping property file read");
251	}
252	}
253
254	if (!read_only && ((output_dir == null) && (solr_url==null))) {
255	System.err.println("Need to specify either --solr-url or --output-dir otherwise generated files are not ingested/saved");
256	print_usage(formatter,options);
257	System.exit(1);
258	}
259	if (read_only) {
260	// For this case, need to ensure solr-url and output-dir are null
261	output_dir = null;
262	solr_url = null;
263	}
264
265	String input_dir = filtered_args[0];
266	String json_list_filename = filtered_args[1];
267
268	ProcessForSolrIngest prep_for_ingest
269	= new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity);
270
271	String process_json_mode = System.getProperty("solr-ingest.process-json-mode","per-page");
272	if (process_json_mode.equals("per-volume")) {
273	prep_for_ingest.execPerVolume();
274	}
275	else {
276	prep_for_ingest.execPerPage();
277	}
278	}
279	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: