source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/ProcessForSerialSolrIngest.java@32103

Last change on this file since 32103 was 32103, checked in by davidb, 6 years ago

Tidy up of output

  • Property svn:executable set to *
File size: 8.8 KB
package org.hathitrust.extractedfeatures;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.stream.Stream;

import org.apache.commons.cli.*;
import org.apache.hadoop.io.Text;

public class ProcessForSerialSolrIngest
{
    //private static final long serialVersionUID = 1L;

    protected String _input_file;
    protected String _solr_base_url;
    protected String _solr_collection;

    protected String _whitelist_filename;
    protected String _langmap_directory;

    //protected String _solr_url;
    protected String _output_dir;

    protected int _verbosity;

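    // The constructor below pulls optional behaviour from Java system properties
    // (via Boolean.getBoolean() and System.getProperty()). A hypothetical
    // invocation showing the properties it consults -- the values here are
    // illustrative only, not taken from the original source:
    //
    //   java -Dwcsa-ef-ingest.use-whitelist=true \
    //        -Dwcsa-ef-ingest.whitelist-filename=ids-whitelist.txt \
    //        -Dwcsa-ef-ingest.use-langmap=true \
    //        -Dwcsa-ef-ingest.langmap-directory=langmap-dir \
    //        org.hathitrust.extractedfeatures.ProcessForSerialSolrIngest ...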
    public ProcessForSerialSolrIngest(String input_file, String solr_collection,
                                      String solr_base_url, String output_dir, int verbosity)
    {
        _input_file = input_file;
        _solr_collection = solr_collection;

        // Note: Boolean.getBoolean() reads the named system property, not a CLI argument
        boolean use_whitelist = Boolean.getBoolean("wcsa-ef-ingest.use-whitelist");
        _whitelist_filename = (use_whitelist) ? System.getProperty("wcsa-ef-ingest.whitelist-filename") : null;

        boolean use_langmap = Boolean.getBoolean("wcsa-ef-ingest.use-langmap");
        _langmap_directory = (use_langmap) ? System.getProperty("wcsa-ef-ingest.langmap-directory") : null;

        _solr_base_url = solr_base_url;
        _output_dir = output_dir;
        _verbosity = verbosity;
    }

    protected String generateAppName()
    {
        String app_name = "Extract Features: Process for Serial Solr Ingest";
        app_name += " [" + _solr_collection + "]";

        if (_solr_base_url != null) {
            app_name += " solr_base_url=" + _solr_base_url;
        }

        if (_output_dir != null) {
            app_name += " output_dir=" + _output_dir;
        }

        return app_name;
    }

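    // extrapolateSolrEndpoints() rewrites the host:port part of the base update
    // URL once per configured SolrCloud node. A hypothetical walk-through (the
    // URL, collection name, and node names are illustrative only):
    //
    //   _solr_base_url = "http://localhost:8983/solr"
    //   wcsa-ef-ingest.solr-cloud-nodes = "node1:8983,node2:8983"
    //
    //   => http://node1:8983/solr/my-collection/update
    //      http://node2:8983/solr/my-collection/update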
    public ArrayList<String> extrapolateSolrEndpoints(String solr_collection)
    {
        ArrayList<String> solr_endpoints = new ArrayList<String>();

        if (_solr_base_url != null) {
            String solr_url = _solr_base_url + "/" + solr_collection + "/update";

            String solr_cloud_nodes = System.getProperty("wcsa-ef-ingest.solr-cloud-nodes", null);
            if (solr_cloud_nodes != null) {
                // Substitute each cloud node's host:port into the base update URL
                String[] cloud_nodes = solr_cloud_nodes.split(",");
                for (String cn : cloud_nodes) {
                    String solr_endpoint = solr_url.replaceFirst("//.*?:\\d+/", "//" + cn + "/");
                    solr_endpoints.add(solr_endpoint);
                }
            }
            else {
                solr_endpoints.add(solr_url);
            }
        }

        return solr_endpoints;
    }

    public ArrayList<String> readFileLines(Path list_path)
    {
        ArrayList<String> json_file_list = new ArrayList<String>();

        try (Stream<String> list_lines = Files.lines(list_path)) {
            list_lines.forEach(json_file_list::add);
        } catch (IOException e) {
            e.printStackTrace();
        }

        return json_file_list;
    }

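    // readJSONText() hands back the volume's JSON wrapped in a Hadoop Text
    // object -- presumably so this serial driver can share PerVolumeJSON.call()
    // with the cluster-based ingest code path. (That motivation is an
    // inference; it is not stated in the original source.)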
    public Text readJSONText(Path json_path)
    {
        File json_file = json_path.toFile();

        String json_filename = json_file.toURI().toString();
        /*
        try {
            json_filename = json_file.getCanonicalPath();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        */
        String text_string = ClusterFileIO.readTextFile(json_filename);

        //ArrayList<String> text_lines = readFileLines(json_path);
        //String text_string = String.join("\n", text_lines);

        Text json_text = new Text(text_string);
        return json_text;
    }

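    // execPerVolumeSequenceFile() is the serial driver: it reads the list of
    // per-volume JSON filenames, then feeds each file through
    // PerVolumeJSON.call(). A hypothetical input file (one filename per line,
    // resolved relative to a local "json-files" directory; names illustrative):
    //
    //   mdp.39015012345678.json
    //   uc1.b000123456.json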
    public void execPerVolumeSequenceFile()
    {
        String serial_app_name = generateAppName();
        System.out.println(serial_app_name);

        Path json_filelist_path = Paths.get(_input_file);

        // Read in the text file listing the per-volume JSON files
        ArrayList<String> json_file_list = readFileLines(json_filelist_path);

        boolean icu_tokenize = Boolean.getBoolean("wcsa-ef-ingest.icu-tokenize");
        boolean strict_file_io = Boolean.getBoolean("wcsa-ef-ingest.strict-file-io");

        ArrayList<String> solr_endpoints = extrapolateSolrEndpoints(_solr_collection);

        System.out.println("*** away to create PerVolumeJSON class, _langmap_directory = " + _langmap_directory);
        PerVolumeJSON per_vol_json = new PerVolumeJSON(_input_file, _whitelist_filename, _langmap_directory,
                                                       solr_endpoints, _output_dir, _verbosity,
                                                       icu_tokenize, strict_file_io);

        // For each file in the list, call per_vol_json.call()
        long num_vol_ids = 0;
        long json_file_list_len = json_file_list.size();
        for (String json_filename : json_file_list) {
            //Path json_path = Paths.get("file://D:/cygwin64/home/davidb/research/code-managed/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/json-files",json_filename);
            Path json_path = Paths.get("json-files", json_filename);

            System.out.println("Processing JSON file: " + json_path);
            Text json_text = readJSONText(json_path);
            try {
                per_vol_json.call(json_text);
            } catch (IOException e) {
                e.printStackTrace();
            }
            num_vol_ids++;
            System.out.println("+ Processed " + num_vol_ids + "/" + json_file_list_len);
        }

        System.out.println("");
        System.out.println("############");
        System.out.println("# Number of volume ids: " + num_vol_ids);
        System.out.println("############");
        System.out.println("");
    }

    public static void print_usage(HelpFormatter formatter, Options options)
    {
        formatter.printHelp("RUN.bash [options] input-file solr-collection", options);
    }

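    // A hypothetical end-to-end invocation (the filenames, URL, and collection
    // name are illustrative only):
    //
    //   RUN.bash -p ef-ingest.properties -u http://localhost:8983/solr \
    //            json-file-list.txt my-ef-collection
    //
    // This processes each volume listed in json-file-list.txt and posts the
    // generated Solr JSON to the "my-ef-collection" collection.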
    public static void main(String[] args) {
        Options options = new Options();

        Option verbosity_opt = new Option("v", "verbosity", true,
                "Set to control the level of debugging output [0=none, 1=some, 2=lots]");
        verbosity_opt.setRequired(false);
        options.addOption(verbosity_opt);

        Option properties_opt = new Option("p", "properties", true,
                "Read in the specified Java properties file");
        properties_opt.setRequired(false);
        options.addOption(properties_opt);

        Option output_dir_opt = new Option("o", "output-dir", true,
                "If specified, save BZipped Solr JSON files to this directory");
        output_dir_opt.setRequired(false);
        options.addOption(output_dir_opt);

        Option solr_base_url_opt = new Option("u", "solr-base-url", true,
                "If specified, the base URL to post the Solr JSON data to");
        solr_base_url_opt.setRequired(false);
        options.addOption(solr_base_url_opt);

        Option read_only_opt = new Option("r", "read-only", false,
                "Used to initiate a run where the files are all read in, but nothing is ingested/saved");
        read_only_opt.setRequired(false);
        options.addOption(read_only_opt);

        // Need to work with CLI v1.2, as this is the JAR bundled with Hadoop/Spark
        CommandLineParser parser = new GnuParser();
        //CommandLineParser parser = new DefaultParser(); // if working with CLI v1.3 and above

        HelpFormatter formatter = new HelpFormatter();
        CommandLine cmd = null;

        try {
            cmd = parser.parse(options, args);
        }
        catch (ParseException e) {
            System.err.println(e.getMessage());
            print_usage(formatter, options);
            System.exit(1);
        }

        String verbosity_str = cmd.getOptionValue("verbosity", "1");
        int verbosity = Integer.parseInt(verbosity_str);

        String property_filename = cmd.getOptionValue("properties", null);

        String output_dir = cmd.getOptionValue("output-dir", null);
        String solr_base_url = cmd.getOptionValue("solr-base-url", null);
        boolean read_only = cmd.hasOption("read-only");

        String[] filtered_args = cmd.getArgs();

        if (filtered_args.length != 2) {
            print_usage(formatter, options);
            System.exit(1);
        }

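        // Example contents of a properties file passed via -p (the keys are the
        // ones this program consults; the values shown are hypothetical):
        //
        //   wcsa-ef-ingest.use-whitelist=true
        //   wcsa-ef-ingest.whitelist-filename=ids-whitelist.txt
        //   wcsa-ef-ingest.icu-tokenize=true
        //   wcsa-ef-ingest.strict-file-io=false
        //   wcsa-ef-ingest.solr-cloud-nodes=node1:8983,node2:8983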
        if (property_filename != null) {
            // try-with-resources ensures the streams are closed after loading
            try (FileInputStream fis = new FileInputStream(property_filename);
                 BufferedInputStream bis = new BufferedInputStream(fis)) {

                System.getProperties().load(bis);
            }
            catch (FileNotFoundException e) {
                System.err.println("File not found: '" + property_filename + "'. Skipping property file read");
            }
            catch (IOException e) {
                System.err.println("IO Exception for: '" + property_filename + "'. Malformed syntax? Skipping property file read");
            }
        }

        if (!read_only && ((output_dir == null) && (solr_base_url == null))) {
            System.err.println("Need to specify either --solr-base-url or --output-dir, otherwise generated files are not ingested/saved");
            print_usage(formatter, options);
            System.exit(1);
        }
        if (read_only) {
            // For this case, ensure solr-base-url and output-dir are null
            output_dir = null;
            solr_base_url = null;
        }

        String input_file = filtered_args[0];
        String solr_collection = filtered_args[1];

        ProcessForSerialSolrIngest prep_for_ingest
            = new ProcessForSerialSolrIngest(input_file, solr_collection, solr_base_url, output_dir, verbosity);

        prep_for_ingest.execPerVolumeSequenceFile();
    }
}