Context Navigation

ProcessForSerialSolrIngest.java@ 32103

Last change on this file since 32103 was 32103, checked in by davidb, 6 years ago
Tidy up of output
Property svn:executable set to *
File size: 8.8 KB

Rev	Line
[32101]	1	package org.hathitrust.extractedfeatures;
	2
	3	import java.io.BufferedInputStream;
	4	import java.io.File;
	5	import java.io.FileInputStream;
	6	import java.io.FileNotFoundException;
	7	import java.io.IOException;
	8	import java.nio.file.Files;
	9	import java.nio.file.Path;
	10	import java.nio.file.Paths;
	11	import java.util.ArrayList;
	12	import java.util.stream.Stream;
	13
	14	import org.apache.commons.cli.*;
	15	import org.apache.hadoop.io.Text;
	16
	17	public class ProcessForSerialSolrIngest
	18	{
	19	//private static final long serialVersionUID = 1L;
	20
	21	protected String _input_file;
	22	protected String _solr_base_url;
	23	protected String _solr_collection;
	24
	25	protected String _whitelist_filename;
	26	protected String _langmap_directory;
	27
	28	//protected String _solr_url;
	29	protected String _output_dir;
	30
	31	protected int _verbosity;
	32
	33	public ProcessForSerialSolrIngest(String input_file, String solr_collection,
	34	String solr_base_url, String output_dir, int verbosity)
	35	{
	36	_input_file = input_file;
	37	_solr_collection = solr_collection;
	38
	39	boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
	40	_whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null;
	41
	42	boolean use_langmap = Boolean.getBoolean("wcsa-ef-ingest.use-langmap");
	43	_langmap_directory = (use_langmap) ? System.getProperty("wcsa-ef-ingest.langmap-directory") : null;
	44
	45
	46	_solr_base_url = solr_base_url;
	47	_output_dir = output_dir;
	48	_verbosity = verbosity;
	49	}
	50
	51	protected String generateAppName()
	52	{
	53	String app_name = "Extract Features: Process for Serial Solr Ingest";
	54	app_name += " [" + _solr_collection + "]";
	55
	56	if (_solr_base_url != null) {
	57	app_name += " solr_base_url=" + _solr_base_url;
	58	}
	59
	60	if (_output_dir != null) {
	61	app_name += " output_dir=" + _output_dir;
	62	}
	63
	64	return app_name;
	65	}
	66
	67	public ArrayList<String> extrapolateSolrEndpoints(String solr_collection)
	68	{
	69	ArrayList<String> solr_endpoints = new ArrayList<String>();
	70
	71	if (_solr_base_url != null) {
	72	String solr_url = _solr_base_url + "/" + solr_collection + "/update";
	73
	74	String solr_cloud_nodes = System.getProperty("wcsa-ef-ingest.solr-cloud-nodes",null);
	75	if (solr_cloud_nodes != null) {
	76	String [] cloud_nodes = solr_cloud_nodes.split(",");
	77	for (String cn : cloud_nodes) {
	78	String solr_endpoint = solr_url.replaceFirst("//.*?:\\d+/", "//"+cn+"/");
	79	solr_endpoints.add(solr_endpoint);
	80	}
	81	}
	82	else {
	83	solr_endpoints.add(solr_url);
	84	}
	85	}
	86
	87	return solr_endpoints;
	88	}
	89
	90	public ArrayList<String> readFileLines(Path list_path)
	91	{
	92	ArrayList<String> json_file_list = new ArrayList<String>();
	93
	94	try (Stream<String> list_lines = Files.lines(list_path)) {
	95	list_lines.forEach(line -> {
	96	json_file_list.add(line);
	97	});
	98	} catch (IOException e) {
	99	e.printStackTrace();
	100	}
	101
	102	return json_file_list;
	103
	104	}
	105
	106	public Text readJSONText(Path json_path)
	107	{
	108	File json_file = json_path.toFile();
	109
	110	String json_filename = json_file.toURI().toString();
	111	/*
	112	try {
	113	json_filename = json_file.getCanonicalPath();
	114	}
	115	catch (Exception e) {
	116	e.printStackTrace();
	117	}
	118	*/
	119	String text_string = ClusterFileIO.readTextFile(json_filename);
	120
	121	//ArrayList<String> text_lines = readFileLines(json_path);
	122
	123	//String text_string = String.join("\n",text_lines);
	124
	125	Text json_text = new Text(text_string);
	126	return json_text;
	127
	128	}
	129
	130	public void execPerVolumeSequenceFile()
	131	{
	132	String serial_app_name = generateAppName();
	133	System.out.println(serial_app_name);
	134
	135	Path json_filelist_path = Paths.get(_input_file);
	136
	137	// Read in text file
	138	ArrayList<String> json_file_list = readFileLines(json_filelist_path);
	139
	140	boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
	141	boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
	142
	143	ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(_solr_collection);
	144
	145	System.out.println("*** away to create PerVolumeJSON class, _langmap_directory = " + _langmap_directory);
	146	PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_file,_whitelist_filename, _langmap_directory,
	147	solr_endpoints,_output_dir,_verbosity,
	148	icu_tokenize,strict_file_io);
	149
	150	// Foreach file, call per_vol_json.call()
	151	long num_vol_ids = 0;
[32103]	152	long json_file_list_len = json_file_list.size();
[32101]	153	for (String json_filename : json_file_list) {
	154	//Path json_path = Paths.get("file://D:/cygwin64/home/davidb/research/code-managed/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/json-files",json_filename);
	155	Path json_path = Paths.get("json-files",json_filename);
	156
	157
[32103]	158	System.out.println("Processing jsonfile: " + json_path);
[32101]	159	Text json_text = readJSONText(json_path);
	160	try {
	161	per_vol_json.call(json_text);
	162	} catch (IOException e) {
	163	e.printStackTrace();
	164	}
	165	num_vol_ids++;
[32103]	166	System.out.println("+ Processed " + num_vol_ids + "/" + json_file_list_len);
[32101]	167	}
	168
	169
	170	System.out.println("");
	171	System.out.println("############");
	172	System.out.println("# Number of volume ids: " + num_vol_ids);
	173	System.out.println("############");
	174	System.out.println("");
	175
	176
	177
	178
	179	}
	180
	181
	182	public static void print_usage(HelpFormatter formatter, Options options)
	183	{
	184	formatter.printHelp("RUN.bash [options] input-file solr-collection", options);
	185	}
	186
	187	public static void main(String[] args) {
	188	Options options = new Options();
	189
	190	Option verbosity_opt = new Option("v", "verbosity", true,
	191	"Set to control the level of debugging output [0=none, 1=some, 2=lots]");
	192	verbosity_opt.setRequired(false);
	193	options.addOption(verbosity_opt);
	194
	195	Option properties_opt = new Option("p", "properties", true,
	196	"Read in the specified Java properties file");
	197	properties_opt.setRequired(false);
	198	options.addOption(properties_opt);
	199
	200	Option output_dir_opt = new Option("o", "output-dir", true,
	201	"If specified, save BZipped Solr JSON files to this directory");
	202	output_dir_opt.setRequired(false);
	203	options.addOption(output_dir_opt);
	204
	205	Option solr_base_url_opt = new Option("u", "solr-base-url", true,
	206	"If specified, the base URL to post the Solr JSON data to");
	207	solr_base_url_opt.setRequired(false);
	208	options.addOption(solr_base_url_opt);
	209
	210	Option read_only_opt = new Option("r", "read-only", false,
	211	"Used to initiate a run where the files are all read in, but nothing is ingested/saved");
	212	read_only_opt.setRequired(false);
	213	options.addOption(read_only_opt);
	214
	215	// Need to work with CLI v1.2 as this is the JAR that is bundled with Hadoop/Spark
	216	CommandLineParser parser = new GnuParser();
	217	//CommandLineParser parser = new DefaultParser(); // if working with CLI v1.3 and above
	218
	219	HelpFormatter formatter = new HelpFormatter();
	220	CommandLine cmd = null;
	221
	222	try {
	223	cmd = parser.parse(options, args);
	224	}
	225	catch (ParseException e) {
	226	System.err.println(e.getMessage());
	227	print_usage(formatter,options);
	228	System.exit(1);
	229	}
	230
	231
	232	String verbosity_str = cmd.getOptionValue("verbosity","1");
	233	int verbosity = Integer.parseInt(verbosity_str);
	234
	235	String property_filename = cmd.getOptionValue("properties",null);
	236
	237	String output_dir = cmd.getOptionValue("output-dir",null);
	238	String solr_base_url = cmd.getOptionValue("solr-base-url",null);
	239	boolean read_only = cmd.hasOption("read-only");
	240
	241	String[] filtered_args = cmd.getArgs();
	242
	243	if (filtered_args.length != 2) {
	244	print_usage(formatter,options);
	245	System.exit(1);
	246	}
	247
	248	if (property_filename != null) {
	249	try {
	250	FileInputStream fis = new FileInputStream(property_filename);
	251	BufferedInputStream bis = new BufferedInputStream(fis);
	252
	253	System.getProperties().load(bis);
	254	}
	255	catch (FileNotFoundException e) {
	256	// TODO Auto-generated catch block
	257	e.printStackTrace();
	258	System.err.println("File not found: '" + property_filename + "'. Skipping property file read");
	259	}
	260	catch (IOException e) {
	261	System.err.println("IO Exception for: '" + property_filename + "'. Malformed syntax? Skipping property file read");
	262	}
	263	}
	264
	265	if (!read_only && ((output_dir == null) && (solr_base_url==null))) {
	266	System.err.println("Need to specify either --solr-base-url or --output-dir otherwise generated files are not ingested/saved");
	267	print_usage(formatter,options);
	268	System.exit(1);
	269	}
	270	if (read_only) {
	271	// For this case, need to ensure solr-url and output-dir are null
	272	output_dir = null;
	273	solr_base_url = null;
	274	}
	275
	276	String input_file = filtered_args[0];
	277	String solr_collection = filtered_args[1];
	278
	279	ProcessForSerialSolrIngest prep_for_ingest
	280	= new ProcessForSerialSolrIngest(input_file,solr_collection,solr_base_url,output_dir,verbosity);
	281
	282	prep_for_ingest.execPerVolumeSequenceFile();
	283
	284	}
	285	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSerialSolrIngest.java@ 32103

Download in other formats: