source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSerialSolrIngest.java@32101

Last change on this file since 32101 was 32101, checked in by davidb, 6 years ago

Tweaks to allow serial ingest to run

  • Property svn:executable set to *
File size: 8.7 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.stream.Stream;

import org.apache.commons.cli.*;
import org.apache.hadoop.io.Text;

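/**
 * Serial (non-Spark) driver that ingests HathiTrust Extracted Features
 * volumes into Solr: it reads a list of per-volume JSON files and processes
 * each one through PerVolumeJSON, posting to Solr and/or saving the
 * generated files to an output directory.
 */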
public class ProcessForSerialSolrIngest
{
    //private static final long serialVersionUID = 1L;

    protected String _input_file;
    protected String _solr_base_url;
    protected String _solr_collection;

    protected String _whitelist_filename;
    protected String _langmap_directory;

    //protected String _solr_url;
    protected String _output_dir;

    protected int _verbosity;

    public ProcessForSerialSolrIngest(String input_file, String solr_collection,
                                      String solr_base_url, String output_dir, int verbosity)
    {
        _input_file = input_file;
        _solr_collection = solr_collection;

        // Whitelist and language-map support are switched on through JVM system properties
        boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
        _whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null;

        boolean use_langmap = Boolean.getBoolean("wcsa-ef-ingest.use-langmap");
        _langmap_directory = (use_langmap) ? System.getProperty("wcsa-ef-ingest.langmap-directory") : null;

        _solr_base_url = solr_base_url;
        _output_dir = output_dir;
        _verbosity = verbosity;
    }

    protected String generateAppName()
    {
        String app_name = "Extract Features: Process for Serial Solr Ingest";
        app_name += " [" + _solr_collection + "]";

        if (_solr_base_url != null) {
            app_name += " solr_base_url=" + _solr_base_url;
        }

        if (_output_dir != null) {
            app_name += " output_dir=" + _output_dir;
        }

        return app_name;
    }

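    /**
     * Builds the list of Solr update endpoints to post to: one per node listed
     * in the wcsa-ef-ingest.solr-cloud-nodes system property, or just the
     * single base-URL endpoint when no cloud nodes are configured.
     */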
    public ArrayList<String> extrapolateSolrEndpoints(String solr_collection)
    {
        ArrayList<String> solr_endpoints = new ArrayList<String>();

        if (_solr_base_url != null) {
            String solr_url = _solr_base_url + "/" + solr_collection + "/update";

            String solr_cloud_nodes = System.getProperty("wcsa-ef-ingest.solr-cloud-nodes", null);
            if (solr_cloud_nodes != null) {
                String[] cloud_nodes = solr_cloud_nodes.split(",");
                for (String cn : cloud_nodes) {
                    // Swap the host:port of the base URL for each cloud node in turn
                    String solr_endpoint = solr_url.replaceFirst("//.*?:\\d+/", "//" + cn + "/");
                    solr_endpoints.add(solr_endpoint);
                }
            }
            else {
                solr_endpoints.add(solr_url);
            }
        }

        return solr_endpoints;
    }

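    /**
     * Reads the given text file into memory, one list entry per line.
     */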
    public ArrayList<String> readFileLines(Path list_path)
    {
        ArrayList<String> json_file_list = new ArrayList<String>();

        try (Stream<String> list_lines = Files.lines(list_path)) {
            list_lines.forEach(json_file_list::add);
        } catch (IOException e) {
            e.printStackTrace();
        }

        return json_file_list;
    }

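    /**
     * Reads the JSON file for a single volume and wraps its contents as Hadoop Text.
     */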
    public Text readJSONText(Path json_path)
    {
        File json_file = json_path.toFile();

        // ClusterFileIO expects a URI-style filename (e.g. file://...)
        String json_filename = json_file.toURI().toString();

        String text_string = ClusterFileIO.readTextFile(json_filename);

        Text json_text = new Text(text_string);
        return json_text;
    }

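    /**
     * Serial driver loop: reads the list of per-volume JSON filenames and
     * processes each volume in turn through PerVolumeJSON.call().
     */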
    public void execPerVolumeSequenceFile()
    {
        String serial_app_name = generateAppName();
        System.out.println(serial_app_name);

        Path json_filelist_path = Paths.get(_input_file);

        // Read in the list of per-volume JSON filenames
        ArrayList<String> json_file_list = readFileLines(json_filelist_path);

        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");

        ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(_solr_collection);

        System.out.println("*** About to create PerVolumeJSON class, _langmap_directory = " + _langmap_directory);
        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_file, _whitelist_filename, _langmap_directory,
                                                       solr_endpoints, _output_dir, _verbosity,
                                                       icu_tokenize, strict_file_io);

        // For each file, call per_vol_json.call()
        long num_vol_ids = 0;
        for (String json_filename : json_file_list) {
            Path json_path = Paths.get("json-files", json_filename);

            System.out.println("*** Processing jsonfile: " + json_path);
            Text json_text = readJSONText(json_path);
            try {
                per_vol_json.call(json_text);
            } catch (IOException e) {
                e.printStackTrace();
            }
            num_vol_ids++;
        }

        System.out.println("");
        System.out.println("############");
        System.out.println("# Number of volume ids: " + num_vol_ids);
        System.out.println("############");
        System.out.println("");
    }

    public static void print_usage(HelpFormatter formatter, Options options)
    {
        formatter.printHelp("RUN.bash [options] input-file solr-collection", options);
    }

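    /**
     * Command-line entry point. An illustrative invocation (the hostname, port
     * and filenames here are made-up examples, not values from the project):
     *
     *   RUN.bash -u http://solr-host:8983/solr -v 1 volume-list.txt my-collection
     */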
    public static void main(String[] args) {
        Options options = new Options();

        Option verbosity_opt = new Option("v", "verbosity", true,
                "Set to control the level of debugging output [0=none, 1=some, 2=lots]");
        verbosity_opt.setRequired(false);
        options.addOption(verbosity_opt);

        Option properties_opt = new Option("p", "properties", true,
                "Read in the specified Java properties file");
        properties_opt.setRequired(false);
        options.addOption(properties_opt);

        Option output_dir_opt = new Option("o", "output-dir", true,
                "If specified, save BZipped Solr JSON files to this directory");
        output_dir_opt.setRequired(false);
        options.addOption(output_dir_opt);

        Option solr_base_url_opt = new Option("u", "solr-base-url", true,
                "If specified, the base URL to post the Solr JSON data to");
        solr_base_url_opt.setRequired(false);
        options.addOption(solr_base_url_opt);

        Option read_only_opt = new Option("r", "read-only", false,
                "Used to initiate a run where the files are all read in, but nothing is ingested/saved");
        read_only_opt.setRequired(false);
        options.addOption(read_only_opt);

        // Need to work with CLI v1.2 as this is the JAR that is bundled with Hadoop/Spark
        CommandLineParser parser = new GnuParser();
        //CommandLineParser parser = new DefaultParser(); // if working with CLI v1.3 and above

        HelpFormatter formatter = new HelpFormatter();
        CommandLine cmd = null;

        try {
            cmd = parser.parse(options, args);
        }
        catch (ParseException e) {
            System.err.println(e.getMessage());
            print_usage(formatter, options);
            System.exit(1);
        }

        String verbosity_str = cmd.getOptionValue("verbosity", "1");
        int verbosity = Integer.parseInt(verbosity_str);

        String property_filename = cmd.getOptionValue("properties", null);

        String output_dir = cmd.getOptionValue("output-dir", null);
        String solr_base_url = cmd.getOptionValue("solr-base-url", null);
        boolean read_only = cmd.hasOption("read-only");

        String[] filtered_args = cmd.getArgs();

        if (filtered_args.length != 2) {
            print_usage(formatter, options);
            System.exit(1);
        }

        if (property_filename != null) {
            // Merge properties from the given file into the JVM's system properties
            try (FileInputStream fis = new FileInputStream(property_filename);
                 BufferedInputStream bis = new BufferedInputStream(fis)) {
                System.getProperties().load(bis);
            }
            catch (FileNotFoundException e) {
                System.err.println("File not found: '" + property_filename + "'. Skipping property file read");
            }
            catch (IOException e) {
                System.err.println("IO Exception for: '" + property_filename + "'. Malformed syntax? Skipping property file read");
            }
        }

        if (!read_only && ((output_dir == null) && (solr_base_url == null))) {
            System.err.println("Need to specify either --solr-base-url or --output-dir, otherwise generated files are not ingested/saved");
            print_usage(formatter, options);
            System.exit(1);
        }
        if (read_only) {
            // For a read-only run, ensure solr-base-url and output-dir are null
            output_dir = null;
            solr_base_url = null;
        }

        String input_file = filtered_args[0];
        String solr_collection = filtered_args[1];

        ProcessForSerialSolrIngest prep_for_ingest
            = new ProcessForSerialSolrIngest(input_file, solr_collection, solr_base_url, output_dir, verbosity);

        prep_for_ingest.execPerVolumeSequenceFile();
    }
}