Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSerialSolrIngest.java@ 32101

Last change on this file since 32101 was 32101, checked in by davidb, 6 years ago
Tweaks to allow serial ingest to run
Property svn:executable set to `*`
File size: 8.7 KB

Line
1	package org.hathitrust.extractedfeatures;
2
3	import java.io.BufferedInputStream;
4	import java.io.File;
5	import java.io.FileInputStream;
6	import java.io.FileNotFoundException;
7	import java.io.IOException;
8	import java.nio.file.Files;
9	import java.nio.file.Path;
10	import java.nio.file.Paths;
11	import java.util.ArrayList;
12	import java.util.stream.Stream;
13
14	import org.apache.commons.cli.*;
15	import org.apache.hadoop.io.Text;
16
17	public class ProcessForSerialSolrIngest
18	{
19	//private static final long serialVersionUID = 1L;
20
21	protected String _input_file;
22	protected String _solr_base_url;
23	protected String _solr_collection;
24
25	protected String _whitelist_filename;
26	protected String _langmap_directory;
27
28	//protected String _solr_url;
29	protected String _output_dir;
30
31	protected int _verbosity;
32
33	public ProcessForSerialSolrIngest(String input_file, String solr_collection,
34	String solr_base_url, String output_dir, int verbosity)
35	{
36	_input_file = input_file;
37	_solr_collection = solr_collection;
38
39	boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
40	_whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null;
41
42	boolean use_langmap = Boolean.getBoolean("wcsa-ef-ingest.use-langmap");
43	_langmap_directory = (use_langmap) ? System.getProperty("wcsa-ef-ingest.langmap-directory") : null;
44
45
46	_solr_base_url = solr_base_url;
47	_output_dir = output_dir;
48	_verbosity = verbosity;
49	}
50
51	protected String generateAppName()
52	{
53	String app_name = "Extract Features: Process for Serial Solr Ingest";
54	app_name += " [" + _solr_collection + "]";
55
56	if (_solr_base_url != null) {
57	app_name += " solr_base_url=" + _solr_base_url;
58	}
59
60	if (_output_dir != null) {
61	app_name += " output_dir=" + _output_dir;
62	}
63
64	return app_name;
65	}
66
67	public ArrayList<String> extrapolateSolrEndpoints(String solr_collection)
68	{
69	ArrayList<String> solr_endpoints = new ArrayList<String>();
70
71	if (_solr_base_url != null) {
72	String solr_url = _solr_base_url + "/" + solr_collection + "/update";
73
74	String solr_cloud_nodes = System.getProperty("wcsa-ef-ingest.solr-cloud-nodes",null);
75	if (solr_cloud_nodes != null) {
76	String [] cloud_nodes = solr_cloud_nodes.split(",");
77	for (String cn : cloud_nodes) {
78	String solr_endpoint = solr_url.replaceFirst("//.*?:\\d+/", "//"+cn+"/");
79	solr_endpoints.add(solr_endpoint);
80	}
81	}
82	else {
83	solr_endpoints.add(solr_url);
84	}
85	}
86
87	return solr_endpoints;
88	}
89
90	public ArrayList<String> readFileLines(Path list_path)
91	{
92	ArrayList<String> json_file_list = new ArrayList<String>();
93
94	try (Stream<String> list_lines = Files.lines(list_path)) {
95	list_lines.forEach(line -> {
96	json_file_list.add(line);
97	});
98	} catch (IOException e) {
99	e.printStackTrace();
100	}
101
102	return json_file_list;
103
104	}
105
106	public Text readJSONText(Path json_path)
107	{
108	File json_file = json_path.toFile();
109
110	String json_filename = json_file.toURI().toString();
111	/*
112	try {
113	json_filename = json_file.getCanonicalPath();
114	}
115	catch (Exception e) {
116	e.printStackTrace();
117	}
118	*/
119	String text_string = ClusterFileIO.readTextFile(json_filename);
120
121	//ArrayList<String> text_lines = readFileLines(json_path);
122
123	//String text_string = String.join("\n",text_lines);
124
125	Text json_text = new Text(text_string);
126	return json_text;
127
128	}
129
130	public void execPerVolumeSequenceFile()
131	{
132	String serial_app_name = generateAppName();
133	System.out.println(serial_app_name);
134
135	Path json_filelist_path = Paths.get(_input_file);
136
137	// Read in text file
138	ArrayList<String> json_file_list = readFileLines(json_filelist_path);
139
140	boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
141	boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");
142
143	ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(_solr_collection);
144
145	System.out.println("*** away to create PerVolumeJSON class, _langmap_directory = " + _langmap_directory);
146	PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_file,_whitelist_filename, _langmap_directory,
147	solr_endpoints,_output_dir,_verbosity,
148	icu_tokenize,strict_file_io);
149
150	// Foreach file, call per_vol_json.call()
151	long num_vol_ids = 0;
152	for (String json_filename : json_file_list) {
153	//Path json_path = Paths.get("file://D:/cygwin64/home/davidb/research/code-managed/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/json-files",json_filename);
154	Path json_path = Paths.get("json-files",json_filename);
155
156
157	System.out.println("*** Processing jsonfile: " + json_path);
158	Text json_text = readJSONText(json_path);
159	try {
160	per_vol_json.call(json_text);
161	} catch (IOException e) {
162	e.printStackTrace();
163	}
164	num_vol_ids++;
165	}
166
167
168	System.out.println("");
169	System.out.println("############");
170	System.out.println("# Number of volume ids: " + num_vol_ids);
171	System.out.println("############");
172	System.out.println("");
173
174
175
176
177	}
178
179
180	public static void print_usage(HelpFormatter formatter, Options options)
181	{
182	formatter.printHelp("RUN.bash [options] input-file solr-collection", options);
183	}
184
185	public static void main(String[] args) {
186	Options options = new Options();
187
188	Option verbosity_opt = new Option("v", "verbosity", true,
189	"Set to control the level of debugging output [0=none, 1=some, 2=lots]");
190	verbosity_opt.setRequired(false);
191	options.addOption(verbosity_opt);
192
193	Option properties_opt = new Option("p", "properties", true,
194	"Read in the specified Java properties file");
195	properties_opt.setRequired(false);
196	options.addOption(properties_opt);
197
198	Option output_dir_opt = new Option("o", "output-dir", true,
199	"If specified, save BZipped Solr JSON files to this directory");
200	output_dir_opt.setRequired(false);
201	options.addOption(output_dir_opt);
202
203	Option solr_base_url_opt = new Option("u", "solr-base-url", true,
204	"If specified, the base URL to post the Solr JSON data to");
205	solr_base_url_opt.setRequired(false);
206	options.addOption(solr_base_url_opt);
207
208	Option read_only_opt = new Option("r", "read-only", false,
209	"Used to initiate a run where the files are all read in, but nothing is ingested/saved");
210	read_only_opt.setRequired(false);
211	options.addOption(read_only_opt);
212
213	// Need to work with CLI v1.2 as this is the JAR that is bundled with Hadoop/Spark
214	CommandLineParser parser = new GnuParser();
215	//CommandLineParser parser = new DefaultParser(); // if working with CLI v1.3 and above
216
217	HelpFormatter formatter = new HelpFormatter();
218	CommandLine cmd = null;
219
220	try {
221	cmd = parser.parse(options, args);
222	}
223	catch (ParseException e) {
224	System.err.println(e.getMessage());
225	print_usage(formatter,options);
226	System.exit(1);
227	}
228
229
230	String verbosity_str = cmd.getOptionValue("verbosity","1");
231	int verbosity = Integer.parseInt(verbosity_str);
232
233	String property_filename = cmd.getOptionValue("properties",null);
234
235	String output_dir = cmd.getOptionValue("output-dir",null);
236	String solr_base_url = cmd.getOptionValue("solr-base-url",null);
237	boolean read_only = cmd.hasOption("read-only");
238
239	String[] filtered_args = cmd.getArgs();
240
241	if (filtered_args.length != 2) {
242	print_usage(formatter,options);
243	System.exit(1);
244	}
245
246	if (property_filename != null) {
247	try {
248	FileInputStream fis = new FileInputStream(property_filename);
249	BufferedInputStream bis = new BufferedInputStream(fis);
250
251	System.getProperties().load(bis);
252	}
253	catch (FileNotFoundException e) {
254	// TODO Auto-generated catch block
255	e.printStackTrace();
256	System.err.println("File not found: '" + property_filename + "'. Skipping property file read");
257	}
258	catch (IOException e) {
259	System.err.println("IO Exception for: '" + property_filename + "'. Malformed syntax? Skipping property file read");
260	}
261	}
262
263	if (!read_only && ((output_dir == null) && (solr_base_url==null))) {
264	System.err.println("Need to specify either --solr-base-url or --output-dir otherwise generated files are not ingested/saved");
265	print_usage(formatter,options);
266	System.exit(1);
267	}
268	if (read_only) {
269	// For this case, need to ensure solr-url and output-dir are null
270	output_dir = null;
271	solr_base_url = null;
272	}
273
274	String input_file = filtered_args[0];
275	String solr_collection = filtered_args[1];
276
277	ProcessForSerialSolrIngest prep_for_ingest
278	= new ProcessForSerialSolrIngest(input_file,solr_collection,solr_base_url,output_dir,verbosity);
279
280	prep_for_ingest.execPerVolumeSequenceFile();
281
282	}
283	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: