source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeUtil.java@32109

Last change on this file since 32109 was 32109, checked in by davidb, 6 years ago

Changes made after testing through YARN

  • Property svn:executable set to *
File size: 9.2 KB
package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;

import org.apache.hadoop.io.Text;
import org.json.JSONArray;
import org.json.JSONObject;

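/**
 * Processes one HathiTrust Extracted Features volume record (a JSON document
 * passed in as a Hadoop Text value) and posts a top-level metadata document,
 * plus one document per page, to a Solr endpoint chosen from a configured
 * list. The class is Serializable so that a single instance can be shipped
 * to the executors of a Spark job and applied per record.
 */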
public class PerVolumeUtil implements Serializable
{
    private static final long serialVersionUID = 1L;

    protected String _input_dir;
    protected String _whitelist_filename;
    protected String _langmap_directory;

    protected final ArrayList<String> _solr_endpoints;
    protected final int _solr_endpoints_len;

    protected String _output_dir;

    protected int _verbosity;

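    // Heavyweight lookup structures: left null here and constructed lazily,
    // on first use, in call() below.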
    protected WhitelistBloomFilter _whitelist_bloomfilter;
    protected UniversalPOSLangMap _universal_langmap;

    boolean _icu_tokenize;
    boolean _strict_file_io;

    public PerVolumeUtil(String input_dir, String whitelist_filename, String langmap_directory,
                         ArrayList<String> solr_endpoints, String output_dir, int verbosity,
                         boolean icu_tokenize, boolean strict_file_io)
    {
        System.out.println("*** PerVolumeUtil Constructor, langmap_directory = " + langmap_directory);

        _input_dir = input_dir;
        _whitelist_filename = whitelist_filename;
        _langmap_directory = langmap_directory;

        _solr_endpoints = solr_endpoints;
        _solr_endpoints_len = solr_endpoints.size();

        _output_dir = output_dir;
        _verbosity = verbosity;

        _icu_tokenize = icu_tokenize;
        _strict_file_io = strict_file_io;

        _whitelist_bloomfilter = null;
        _universal_langmap = null;
    }

    public String getInputDir()
    {
        return _input_dir;
    }

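    /**
     * Processes a single volume's Extracted Features JSON record: posts a
     * volume-level metadata document to Solr, followed by one document per
     * page. Returns the number of pages in the volume, or 0 if the record
     * could not be processed and strict file I/O is disabled.
     */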
    public Integer call(Text json_text) throws IOException
    {
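        // Lazily construct the (relatively expensive) whitelist Bloom filter
        // and POS language map the first time a record is processed in this
        // JVM. The filename/directory fields double as lock objects here; a
        // dedicated lock object would be more robust, since synchronizing on
        // interned Strings risks sharing locks with unrelated code.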
        if (_whitelist_filename != null) {
            synchronized (_whitelist_filename) {
                if (_whitelist_bloomfilter == null) {
                    _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename, true);
                }
            }
        }

        if (_langmap_directory != null) {
            synchronized (_langmap_directory) {
                if (_universal_langmap == null) {
                    _universal_langmap = new UniversalPOSLangMap(_langmap_directory);
                }
            }
        }

        int ef_num_pages = 0;

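        // Balance load by picking one of the configured Solr endpoints at
        // random for this volume.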
        String solr_url = null;
        if (_solr_endpoints_len > 0) {
            int random_choice = (int) (_solr_endpoints_len * Math.random());
            solr_url = _solr_endpoints.get(random_choice);
        }

        try {
            JSONObject extracted_feature_record = new JSONObject(json_text.toString());

            if (extracted_feature_record != null) {
                String volume_id = extracted_feature_record.getString("id");

                JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");

                //
                // Top-level metadata Solr doc
                //
                JSONObject solr_add_metadata_doc_json
                    = SolrDocJSON.generateToplevelMetadataSolrDocJSON(volume_id, ef_metadata);
                if (solr_add_metadata_doc_json != null) {

                    if (_verbosity >= 2) {
                        System.out.println("==================");
                        System.out.println("Metadata JSON: " + solr_add_metadata_doc_json.toString());
                        System.out.println("==================");
                    }

                    if (solr_url != null) {
                        if (_verbosity >= 2) {
                            System.out.println("==================");
                            System.out.println("Posting to: " + solr_url);
                            System.out.println("==================");
                        }
                        SolrDocJSON.postSolrDoc(solr_url, solr_add_metadata_doc_json, volume_id, "top-level-metadata");
                    }
                }

                //
                // Now move on to POS extracted features per-page
                //
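                // index_pages is a hard-wired switch (presumably kept as a
                // debugging aid) for turning the per-page indexing below off
                // wholesale.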
                boolean index_pages = true;
                if (index_pages) {

                    JSONObject ef_features = extracted_feature_record.getJSONObject("features");

                    int ef_page_count = ef_features.getInt("pageCount");

                    if (_verbosity >= 1) {
                        System.out.println("Processing: " + volume_id);
                        System.out.println(" pageCount = " + ef_page_count);
                    }

                    JSONArray ef_pages = ef_features.getJSONArray("pages");
                    ef_num_pages = ef_pages.length();

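                    // Assumes "pageCount" matches the length of the "pages"
                    // array; if pageCount were larger, getJSONObject(i) below
                    // would throw, and the exception would be handled by the
                    // catch clause at the end of this method.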
                    for (int i = 0; i < ef_page_count; i++) {
                        String formatted_i = String.format("page-%06d", i);
                        String page_id = volume_id + "." + formatted_i;

                        if (_verbosity >= 2) {
                            System.out.println(" Page: " + page_id);
                        }

                        JSONObject ef_page = ef_pages.getJSONObject(i);

                        if (ef_page != null) {
                            // Convert to Solr add form
                            JSONObject solr_add_doc_json
                                = SolrDocJSON.generateSolrDocJSON(volume_id, page_id,
                                                                  ef_metadata, ef_page,
                                                                  _whitelist_bloomfilter, _universal_langmap, _icu_tokenize);

                            if ((_verbosity >= 2) && (i == 20)) {
                                System.out.println("==================");
                                System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
                                System.out.println("==================");
                            }

                            if (solr_url != null) {
                                SolrDocJSON.postSolrDoc(solr_url, solr_add_doc_json,
                                                        volume_id, page_id);
                            }
                        }
                        else {
                            System.err.println("Skipping: " + page_id);
                        }
                    }
                }
                else {
                    System.err.println("Skipping per-page POS text indexing");
                }
            }
        }
        catch (Exception e) {
            if (_strict_file_io) {
                throw e;
            }
            else {
                e.printStackTrace();
            }
        }

        return ef_num_pages;
    }
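    /*
     * What follows is the earlier, file-based version of call(), retained
     * (commented out) for reference: it read each volume's bzipped JSON from
     * the input directory rather than receiving it as a Text value, and could
     * additionally save the generated per-page Solr documents under
     * _output_dir.
     */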
    /*
    //public void call(String json_file_in) throws IOException
    public Integer call(String json_file_in) throws IOException
    {
        if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
            _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename, true);
        }

        int ef_num_pages = 0;

        ArrayList<String> ids = new ArrayList<String>(); // want it to be non-null so can return valid iterator

        String full_json_file_in = _input_dir + "/" + json_file_in;
        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);

        if (extracted_feature_record != null) {
            String volume_id = extracted_feature_record.getString("id");

            //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
            //String title= ef_metadata.getString("title");

            JSONObject ef_features = extracted_feature_record.getJSONObject("features");

            int ef_page_count = ef_features.getInt("pageCount");

            if (_verbosity >= 1) {
                System.out.println("Processing: " + json_file_in);
                System.out.println(" pageCount = " + ef_page_count);
            }

            JSONArray ef_pages = ef_features.getJSONArray("pages");
            ef_num_pages = ef_pages.length();

            // Make directory for page-level JSON output
            String json_dir = ClusterFileIO.removeSuffix(json_file_in, ".json.bz2");
            String page_json_dir = json_dir + "/pages";

            if (_output_dir != null) {
                ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
            }

            ids = new ArrayList<String>(ef_num_pages);
            for (int i = 0; i < ef_page_count; i++) {
                String formatted_i = String.format("page-%06d", i);
                String page_id = volume_id + "." + formatted_i;

                if (_verbosity >= 2) {
                    System.out.println(" Page: " + page_id);
                }

                String output_json_bz2 = page_json_dir + "/" + formatted_i + ".json.bz2";
                ids.add(page_id);

                if (_verbosity >= 2) {
                    if (i == 0) {
                        System.out.println("Sample output JSON page file [i=0]: " + output_json_bz2);
                    }
                }
                JSONObject ef_page = ef_pages.getJSONObject(i);

                if (ef_page != null) {
                    // Convert to Solr add form
                    JSONObject solr_add_doc_json
                        = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);

                    if ((_verbosity >= 2) && (i == 20)) {
                        System.out.println("==================");
                        System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
                        System.out.println("==================");
                    }

                    if (_solr_url != null) {
                        if ((_verbosity >= 2) && (i == 20)) {
                            System.out.println("==================");
                            System.out.println("Posting to: " + _solr_url);
                            System.out.println("==================");
                        }
                        SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
                    }

                    if (_output_dir != null) {
                        if ((_verbosity >= 2) && (i == 20)) {
                            System.out.println("==================");
                            System.out.println("Saving to: " + _output_dir);
                            System.out.println("==================");
                        }
                        SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);
                    }
                }
                else {
                    System.err.println("Skipping: " + page_id);
                }
            }
        }
        else {
            // File did not exist, or could not be parsed
            String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
            if (_strict_file_io) {
                throw new IOException(mess);
            }
            else {
                System.err.println("Warning: " + mess);
                System.out.println("Warning: " + mess);
            }
        }

        return ef_num_pages;
    }
    */
}
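
// A minimal usage sketch (not part of the original file) of how an instance
// might be driven from a Spark job, assuming the Extracted Features JSON
// records arrive as the values of a Hadoop sequence file; the variable names
// and input layout here are illustrative only:
//
//   JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("EF Solr Ingest"));
//   PerVolumeUtil per_vol = new PerVolumeUtil(input_dir, whitelist_filename, langmap_directory,
//                                             solr_endpoints, output_dir, verbosity,
//                                             icu_tokenize, strict_file_io);
//   jsc.sequenceFile(input_dir, Text.class, Text.class)
//      .values()
//      .map(per_vol::call)        // one call() per volume record; failures follow _strict_file_io
//      .reduce((a, b) -> a + b);  // total number of pages processed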