Context Navigation

source: other-projects/hathitrust/solr-extracted-features/trunk/src/main/java/org/hathitrust/extractedfeatures/ProcessForSolrIngest.java@ 31006

Last change on this file since 31006 was 31006, checked in by davidb, 7 years ago
Further reversal of Base class. Switch to PerPage
Property svn:executable set to `*`
File size: 7.4 KB

Rev	Line
[30998]	1	package org.hathitrust.extractedfeatures;
	2
	3	import java.io.Serializable;
	4	import org.apache.commons.cli.*;
	5
	6	import org.apache.spark.api.java.*;
	7	import org.apache.spark.util.DoubleAccumulator;
	8	import org.hathitrust.extractedfeatures.PagedJSON;
[31001]	9	import org.json.JSONObject;
[30998]	10	import org.apache.spark.SparkConf;
	11
	12	public class ProcessForSolrIngest implements Serializable
	13	{
	14	private static final long serialVersionUID = 1L;
	15
	16	// Following details on number of partitions to use given in
	17	// "Parallelized collections" section of:
	18	// https://spark.apache.org/docs/2.0.1/programming-guide.html
	19	//
	20	// For a more detailed discussion see:
	21	// http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
	22
	23	public static final int NUM_CORES = 6;
	24	public static final int NUM_PARTITIONS = 2*NUM_CORES; // default would appear to be 2
	25
	26	protected String _input_dir;
	27	protected String _json_list_filename;
	28	protected String _solr_url;
	29	protected String _output_dir;
	30
	31	protected int _verbosity;
	32
	33	public ProcessForSolrIngest(String input_dir, String json_list_filename,
	34	String solr_url, String output_dir, int verbosity)
	35	{
	36	_input_dir = input_dir;
	37	_json_list_filename = (json_list_filename != null) ? json_list_filename : input_dir;
	38
	39	_solr_url = solr_url;
	40	_output_dir = output_dir;
	41	_verbosity = verbosity;
	42	}
	43
[31001]	44	public void execPerVolume()
[30998]	45	{
[31001]	46	String spark_app_name = "[Per Volume] Extract Features: Process for Solr Ingest";
[30998]	47	spark_app_name += " [" + _json_list_filename + "]";
	48
	49	SparkConf conf = new SparkConf().setAppName(spark_app_name);
	50	JavaSparkContext jsc = new JavaSparkContext(conf);
	51
	52	if (_verbosity >= 2) {
	53	System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
	54	System.out.println("Default Parallelism: " + jsc.defaultParallelism());
	55	}
	56
	57	JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
	58
	59	long num_volumes = json_list_data.count();
	60	double per_vol = 100.0/(double)num_volumes;
	61
	62	DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
	63
[31004]	64	System.err.println();
	65	System.err.println();
	66	System.err.println();
	67	System.err.println("****##### _input_dir = " + _input_dir);
	68	System.err.println();
	69	System.err.println();
	70	System.err.println();
	71
[31001]	72	PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
	73
	74	json_list_data.foreach(per_vol_json);
	75
	76	long num_ids = num_volumes;
	77
	78	System.out.println("");
	79	System.out.println("############");
	80	System.out.println("# Number of volume ids: " + num_ids);
	81	System.out.println("############");
	82	System.out.println("");
	83
	84	jsc.close();
	85	}
	86
	87	public void execPerPage()
	88	{
	89	String spark_app_name = "[Per Page] Extract Features: Process for Solr Ingest";
	90	spark_app_name += " [" + _json_list_filename + "]";
	91
	92	SparkConf conf = new SparkConf().setAppName(spark_app_name);
	93	JavaSparkContext jsc = new JavaSparkContext(conf);
	94
	95	if (_verbosity >= 2) {
	96	System.out.println("Default Minimum Partions: " + jsc.defaultMinPartitions());
	97	System.out.println("Default Parallelism: " + jsc.defaultParallelism());
	98	}
	99
	100	JavaRDD<String> json_list_data = jsc.textFile(_json_list_filename,NUM_PARTITIONS).cache();
	101
	102	long num_volumes = json_list_data.count();
	103	double per_vol = 100.0/(double)num_volumes;
	104
	105	DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress Percent");
	106
[30998]	107	PagedJSON paged_json = new PagedJSON(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
[31001]	108	JavaRDD<JSONObject> json_per_page_ids = json_list_data.flatMap(paged_json).cache();
	109
[31002]	110	PagedJSONForeach paged_json_foreach = new PagedJSONForeach(_input_dir,_solr_url,_output_dir,_verbosity, progress_accum,per_vol);
	111	json_per_page_ids.foreach(paged_json_foreach);
[30998]	112
	113	/*
	114	System.out.println("");
	115	System.out.println("############");
	116	System.out.println("# Progress Accumulator: " + progress_accum.value());
	117	System.out.println("############");
	118	System.out.println("");
	119	*/
	120
[31001]	121	long num_page_ids = json_per_page_ids.count();
[30998]	122
	123	System.out.println("");
	124	System.out.println("############");
[31001]	125	System.out.println("# Number of page ids: " + num_page_ids);
[30998]	126	System.out.println("############");
	127	System.out.println("");
	128
	129	/*
	130	if (_output_dir != null) {
	131	String rdd_save_file = "rdd-solr-json-page-files";
	132	json_ids.saveAsTextFile(rdd_save_file);
	133	System.out.println("############");
	134	System.out.println("# Saved RDD of Solr JSON page files, top-level, as:");
	135	System.out.println("# " + rdd_save_file);
	136	System.out.println("############");
	137	System.out.println("");
	138	}
	139	*/
	140
	141	jsc.close();
	142	}
	143
[31001]	144
	145
	146
[30998]	147	public static void print_usage(HelpFormatter formatter, Options options)
	148	{
	149	formatter.printHelp("RUN.bash [options] input-dir json-filelist.txt", options);
	150	}
	151
	152	public static void main(String[] args) {
	153	Options options = new Options();
	154
	155	Option verbosity_opt = new Option("v", "verbosity", true,
	156	"Set to control the level of debugging output [0=none, 1=some, 2=lots]");
	157	verbosity_opt.setRequired(false);
	158	options.addOption(verbosity_opt);
	159
	160	Option output_dir_opt = new Option("o", "output-dir", true,
	161	"If specified, save BZipped Solr JSON files to this directory");
	162	output_dir_opt.setRequired(false);
	163	options.addOption(output_dir_opt);
	164
	165	Option solr_url_opt = new Option("u", "solr-url", true,
	166	"If specified, the URL to post the Solr JSON data to");
	167	solr_url_opt.setRequired(false);
	168	options.addOption(solr_url_opt);
	169
	170	Option read_only_opt = new Option("r", "read-only", false,
	171	"Used to initiate a run where the files are all read in, but nothing is ingested/saved");
	172	read_only_opt.setRequired(false);
	173	options.addOption(read_only_opt);
	174
	175	// Need to work with CLI v1.2 as this is the JAR that is bundled with Hadoop/Spark
	176	CommandLineParser parser = new GnuParser();
	177	//CommandLineParser parser = new DefaultParser(); // if working with CLI v1.3 and above
	178
	179	HelpFormatter formatter = new HelpFormatter();
	180	CommandLine cmd = null;
	181
	182	try {
	183	cmd = parser.parse(options, args);
	184	}
	185	catch (ParseException e) {
	186	System.err.println(e.getMessage());
	187	print_usage(formatter,options);
	188	System.exit(1);
	189	}
	190
	191
	192	String verbosity_str = cmd.getOptionValue("verbosity","0");
	193	int verbosity = Integer.parseInt(verbosity_str);
	194
	195	String output_dir = cmd.getOptionValue("output-dir",null);
	196	String solr_url = cmd.getOptionValue("solr-url",null);
	197	boolean read_only = cmd.hasOption("read-only");
	198
	199	String[] filtered_args = cmd.getArgs();
	200
	201	if (filtered_args.length != 2) {
	202	print_usage(formatter,options);
	203	System.exit(1);
	204	}
	205
	206	if (!read_only && ((output_dir == null) && (solr_url==null))) {
	207	System.err.println("Need to specify either --solr-url or --output-dir otherwise generated files are not ingested/saved");
	208	print_usage(formatter,options);
	209	System.exit(1);
	210	}
	211	if (read_only) {
	212	// For this case, need to ensure solr-url and output-dir are null
	213	output_dir = null;
	214	solr_url = null;
	215	}
	216
	217	String input_dir = filtered_args[0];
	218	String json_list_filename = filtered_args[1];
	219
	220	ProcessForSolrIngest prep_for_ingest
	221	= new ProcessForSolrIngest(input_dir,json_list_filename,solr_url,output_dir,verbosity);
[31006]	222
	223	//prep_for_ingest.execPerVolume();
	224	prep_for_ingest.execPerPage();
[30998]	225	}
	226	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: