source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSerialSolrIngest.java@32103

Last change on this file since 32103 was 32103, checked in by davidb, 6 years ago

Tidy up of output

  • Property svn:executable set to *
File size: 8.8 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.stream.Stream;

import org.apache.commons.cli.*;
import org.apache.hadoop.io.Text;

public class ProcessForSerialSolrIngest
{
    //private static final long serialVersionUID = 1L;

    protected String _input_file;
    protected String _solr_base_url;
    protected String _solr_collection;

    protected String _whitelist_filename;
    protected String _langmap_directory;

    //protected String _solr_url;
    protected String _output_dir;

    protected int _verbosity;

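    // The constructor below pulls optional behaviour from Java system properties
    // (via Boolean.getBoolean() and System.getProperty()). A hypothetical
    // invocation showing the properties it consults -- the values here are
    // illustrative only, not taken from the original source:
    //
    //   java -Dwcsa-ef-ingest.use-whitelist=true \
    //        -Dwcsa-ef-ingest.whitelist-filename=ids-whitelist.txt \
    //        -Dwcsa-ef-ingest.use-langmap=true \
    //        -Dwcsa-ef-ingest.langmap-directory=langmap-dir \
    //        org.hathitrust.extractedfeatures.ProcessForSerialSolrIngest ...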
    public ProcessForSerialSolrIngest(String input_file, String solr_collection,
                                      String solr_base_url, String output_dir, int verbosity)
    {
        _input_file = input_file;
        _solr_collection = solr_collection;

        // Note: Boolean.getBoolean() reads the named system property, not a CLI argument
        boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
        _whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null;

        boolean use_langmap = Boolean.getBoolean("wcsa-ef-ingest.use-langmap");
        _langmap_directory = (use_langmap) ? System.getProperty("wcsa-ef-ingest.langmap-directory") : null;

        _solr_base_url = solr_base_url;
        _output_dir = output_dir;
        _verbosity = verbosity;
    }

    protected String generateAppName()
    {
        String app_name = "Extract Features: Process for Serial Solr Ingest";
        app_name += " [" + _solr_collection + "]";

        if (_solr_base_url != null) {
            app_name += " solr_base_url=" + _solr_base_url;
        }

        if (_output_dir != null) {
            app_name += " output_dir=" + _output_dir;
        }

        return app_name;
    }

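    // extrapolateSolrEndpoints() rewrites the host:port part of the base update
    // URL once per configured SolrCloud node. A hypothetical walk-through (the
    // URL, collection name, and node names are illustrative only):
    //
    //   _solr_base_url = "http://localhost:8983/solr"
    //   wcsa-ef-ingest.solr-cloud-nodes = "node1:8983,node2:8983"
    //
    //   => http://node1:8983/solr/my-collection/update
    //      http://node2:8983/solr/my-collection/update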
    public ArrayList<String> extrapolateSolrEndpoints(String solr_collection)
    {
        ArrayList<String> solr_endpoints = new ArrayList<String>();

        if (_solr_base_url != null) {
            String solr_url = _solr_base_url + "/" + solr_collection + "/update";

            String solr_cloud_nodes = System.getProperty("wcsa-ef-ingest.solr-cloud-nodes", null);
            if (solr_cloud_nodes != null) {
                // Substitute each cloud node's host:port into the base update URL
                String[] cloud_nodes = solr_cloud_nodes.split(",");
                for (String cn : cloud_nodes) {
                    String solr_endpoint = solr_url.replaceFirst("//.*?:\\d+/", "//" + cn + "/");
                    solr_endpoints.add(solr_endpoint);
                }
            }
            else {
                solr_endpoints.add(solr_url);
            }
        }

        return solr_endpoints;
    }

    public ArrayList<String> readFileLines(Path list_path)
    {
        ArrayList<String> json_file_list = new ArrayList<String>();

        try (Stream<String> list_lines = Files.lines(list_path)) {
            list_lines.forEach(json_file_list::add);
        } catch (IOException e) {
            e.printStackTrace();
        }

        return json_file_list;
    }

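    // readJSONText() hands back the volume's JSON wrapped in a Hadoop Text
    // object -- presumably so this serial driver can share PerVolumeJSON.call()
    // with the cluster-based ingest code path. (That motivation is an
    // inference; it is not stated in the original source.)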
    public Text readJSONText(Path json_path)
    {
        File json_file = json_path.toFile();

        String json_filename = json_file.toURI().toString();
        /*
        try {
            json_filename = json_file.getCanonicalPath();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        */
        String text_string = ClusterFileIO.readTextFile(json_filename);

        //ArrayList<String> text_lines = readFileLines(json_path);
        //String text_string = String.join("\n", text_lines);

        Text json_text = new Text(text_string);
        return json_text;
    }

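    // execPerVolumeSequenceFile() is the serial driver: it reads the list of
    // per-volume JSON filenames, then feeds each file through
    // PerVolumeJSON.call(). A hypothetical input file (one filename per line,
    // resolved relative to a local "json-files" directory; names illustrative):
    //
    //   mdp.39015012345678.json
    //   uc1.b000123456.json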
    public void execPerVolumeSequenceFile()
    {
        String serial_app_name = generateAppName();
        System.out.println(serial_app_name);

        Path json_filelist_path = Paths.get(_input_file);

        // Read in the text file listing the per-volume JSON files
        ArrayList<String> json_file_list = readFileLines(json_filelist_path);

        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");

        ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(_solr_collection);

        System.out.println("*** away to create PerVolumeJSON class, _langmap_directory = " + _langmap_directory);
        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_file, _whitelist_filename, _langmap_directory,
                                                       solr_endpoints, _output_dir, _verbosity,
                                                       icu_tokenize, strict_file_io);

        // For each file in the list, call per_vol_json.call()
        long num_vol_ids = 0;
        long json_file_list_len = json_file_list.size();
        for (String json_filename : json_file_list) {
            //Path json_path = Paths.get("file://D:/cygwin64/home/davidb/research/code-managed/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/json-files",json_filename);
            Path json_path = Paths.get("json-files", json_filename);

            System.out.println("Processing JSON file: " + json_path);
            Text json_text = readJSONText(json_path);
            try {
                per_vol_json.call(json_text);
            } catch (IOException e) {
                e.printStackTrace();
            }
            num_vol_ids++;
            System.out.println("+ Processed " + num_vol_ids + "/" + json_file_list_len);
        }

        System.out.println("");
        System.out.println("############");
        System.out.println("# Number of volume ids: " + num_vol_ids);
        System.out.println("############");
        System.out.println("");
    }

    public static void print_usage(HelpFormatter formatter, Options options)
    {
        formatter.printHelp("RUN.bash [options] input-file solr-collection", options);
    }

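    // A hypothetical end-to-end invocation (the filenames, URL, and collection
    // name are illustrative only):
    //
    //   RUN.bash -p ef-ingest.properties -u http://localhost:8983/solr \
    //            json-file-list.txt my-ef-collection
    //
    // This processes each volume listed in json-file-list.txt and posts the
    // generated Solr JSON to the "my-ef-collection" collection.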
    public static void main(String[] args) {
        Options options = new Options();

        Option verbosity_opt = new Option("v", "verbosity", true,
                "Set to control the level of debugging output [0=none, 1=some, 2=lots]");
        verbosity_opt.setRequired(false);
        options.addOption(verbosity_opt);

        Option properties_opt = new Option("p", "properties", true,
                "Read in the specified Java properties file");
        properties_opt.setRequired(false);
        options.addOption(properties_opt);

        Option output_dir_opt = new Option("o", "output-dir", true,
                "If specified, save BZipped Solr JSON files to this directory");
        output_dir_opt.setRequired(false);
        options.addOption(output_dir_opt);

        Option solr_base_url_opt = new Option("u", "solr-base-url", true,
                "If specified, the base URL to post the Solr JSON data to");
        solr_base_url_opt.setRequired(false);
        options.addOption(solr_base_url_opt);

        Option read_only_opt = new Option("r", "read-only", false,
                "Used to initiate a run where the files are all read in, but nothing is ingested/saved");
        read_only_opt.setRequired(false);
        options.addOption(read_only_opt);

        // Need to work with CLI v1.2, as this is the JAR bundled with Hadoop/Spark
        CommandLineParser parser = new GnuParser();
        //CommandLineParser parser = new DefaultParser(); // if working with CLI v1.3 and above

        HelpFormatter formatter = new HelpFormatter();
        CommandLine cmd = null;

        try {
            cmd = parser.parse(options, args);
        }
        catch (ParseException e) {
            System.err.println(e.getMessage());
            print_usage(formatter, options);
            System.exit(1);
        }

        String verbosity_str = cmd.getOptionValue("verbosity", "1");
        int verbosity = Integer.parseInt(verbosity_str);

        String property_filename = cmd.getOptionValue("properties", null);

        String output_dir = cmd.getOptionValue("output-dir", null);
        String solr_base_url = cmd.getOptionValue("solr-base-url", null);
        boolean read_only = cmd.hasOption("read-only");

        String[] filtered_args = cmd.getArgs();

        if (filtered_args.length != 2) {
            print_usage(formatter, options);
            System.exit(1);
        }

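        // Example contents of a properties file passed via -p (the keys are the
        // ones this program consults; the values shown are hypothetical):
        //
        //   wcsa-ef-ingest.use-whitelist=true
        //   wcsa-ef-ingest.whitelist-filename=ids-whitelist.txt
        //   wcsa-ef-ingest.icu-tokenize=true
        //   wcsa-ef-ingest.strict-file-io=false
        //   wcsa-ef-ingest.solr-cloud-nodes=node1:8983,node2:8983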
        if (property_filename != null) {
            // try-with-resources ensures the streams are closed after loading
            try (FileInputStream fis = new FileInputStream(property_filename);
                 BufferedInputStream bis = new BufferedInputStream(fis)) {

                System.getProperties().load(bis);
            }
            catch (FileNotFoundException e) {
                System.err.println("File not found: '" + property_filename + "'. Skipping property file read");
            }
            catch (IOException e) {
                System.err.println("IO Exception for: '" + property_filename + "'. Malformed syntax? Skipping property file read");
            }
        }

        if (!read_only && ((output_dir == null) && (solr_base_url == null))) {
            System.err.println("Need to specify either --solr-base-url or --output-dir, otherwise generated files are not ingested/saved");
            print_usage(formatter, options);
            System.exit(1);
        }
        if (read_only) {
            // For this case, ensure solr-base-url and output-dir are null
            output_dir = null;
            solr_base_url = null;
        }

        String input_file = filtered_args[0];
        String solr_collection = filtered_args[1];

        ProcessForSerialSolrIngest prep_for_ingest
            = new ProcessForSerialSolrIngest(input_file, solr_collection, solr_base_url, output_dir, verbosity);

        prep_for_ingest.execPerVolumeSequenceFile();
    }
}