source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeJSON.java@ 31372

Last change on this file since 31372 was 31372, checked in by davidb, 7 years ago

Reworked to use sequenceFiles

  • Property svn:executable set to *
File size: 7.5 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import java.util.ArrayList;
5import java.util.Iterator;
6
7import org.apache.hadoop.io.Text;
8import org.apache.spark.api.java.function.FlatMapFunction;
9import org.apache.spark.api.java.function.Function;
10import org.apache.spark.api.java.function.VoidFunction;
11import org.apache.spark.util.DoubleAccumulator;
12import org.json.JSONArray;
13import org.json.JSONObject;
14
15/*
16class PagedJSON implements Function<String, Boolean> {
17
18 private static final long serialVersionUID = 1L;
19
20 public Boolean call(String s) { return s.contains("a"); }
21}
22 */
23
24
25//public class PerVolumeJSON implements VoidFunction<String>
26public class PerVolumeJSON implements Function<Text,Integer>
27{
28 private static final long serialVersionUID = 1L;
29 protected String _input_dir;
30 protected String _whitelist_filename;
31
32 protected String _solr_url;
33 protected String _output_dir;
34
35 protected int _verbosity;
36
37 protected WhitelistBloomFilter _whitelist_bloomfilter;
38
39
40
41 boolean _icu_tokenize;
42 boolean _strict_file_io;
43
44 public PerVolumeJSON(String input_dir, String whitelist_filename,
45 String solr_url, String output_dir, int verbosity,
46 boolean icu_tokenize, boolean strict_file_io)
47 {
48 _input_dir = input_dir;
49 _whitelist_filename = whitelist_filename;
50
51 _solr_url = solr_url;
52 _output_dir = output_dir;
53 _verbosity = verbosity;
54
55 _icu_tokenize = icu_tokenize;
56 _strict_file_io = strict_file_io;
57
58 _whitelist_bloomfilter = null;
59 }
60
61
62 public Integer call(Text json_text) throws IOException
63
64 {
65 if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
66 _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
67 }
68
69 int ef_num_pages = 0;
70
71 try {
72
73
74 JSONObject extracted_feature_record = new JSONObject(json_text.toString());
75
76 if (extracted_feature_record != null) {
77 String volume_id = extracted_feature_record.getString("id");
78
79 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
80 //String title= ef_metadata.getString("title");
81
82 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
83
84 int ef_page_count = ef_features.getInt("pageCount");
85
86 if (_verbosity >= 1) {
87 System.out.println("Processing: " + volume_id);
88 System.out.println(" pageCount = " + ef_page_count);
89 }
90
91 JSONArray ef_pages = ef_features.getJSONArray("pages");
92 ef_num_pages = ef_pages.length();
93
94
95 for (int i = 0; i < ef_page_count; i++) {
96 String formatted_i = String.format("page-%06d", i);
97 String page_id = volume_id + "." + formatted_i;
98
99 if (_verbosity >= 2) {
100 System.out.println(" Page: " + page_id);
101 }
102
103
104 JSONObject ef_page = ef_pages.getJSONObject(i);
105
106 if (ef_page != null) {
107 // Convert to Solr add form
108 JSONObject solr_add_doc_json
109 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);
110
111
112 if ((_verbosity >=2) && (i==20)) {
113 System.out.println("==================");
114 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
115 System.out.println("==================");
116 }
117
118
119 if (_solr_url != null) {
120 if ((_verbosity >=2) && (i==20)) {
121 System.out.println("==================");
122 System.out.println("Posting to: " + _solr_url);
123 System.out.println("==================");
124 }
125 SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
126 }
127
128
129 }
130 else {
131 System.err.println("Skipping: " + page_id);
132 }
133
134 }
135 }
136 }
137 catch (Exception e) {
138 if (_strict_file_io) {
139 throw e;
140 }
141 else {
142 e.printStackTrace();
143 }
144 }
145
146 return ef_num_pages;
147
148 }
149
150 /*
151 //public void call(String json_file_in) throws IOException
152 public Integer call(String json_file_in) throws IOException
153
154 {
155 if ((_whitelist_filename != null) && (_whitelist_bloomfilter == null)) {
156 _whitelist_bloomfilter = new WhitelistBloomFilter(_whitelist_filename,true);
157 }
158
159 int ef_num_pages = 0;
160
161 ArrayList<String> ids = new ArrayList<String>(); // want it to be non-null so can return valid iterator
162
163 String full_json_file_in = _input_dir + "/" + json_file_in;
164 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
165
166 if (extracted_feature_record != null) {
167 String volume_id = extracted_feature_record.getString("id");
168
169 //JSONObject ef_metadata = extracted_feature_record.getJSONObject("metadata");
170 //String title= ef_metadata.getString("title");
171
172 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
173
174 int ef_page_count = ef_features.getInt("pageCount");
175
176 if (_verbosity >= 1) {
177 System.out.println("Processing: " + json_file_in);
178 System.out.println(" pageCount = " + ef_page_count);
179 }
180
181 JSONArray ef_pages = ef_features.getJSONArray("pages");
182 ef_num_pages = ef_pages.length();
183
184 // Make directory for page-level JSON output
185 String json_dir = ClusterFileIO.removeSuffix(json_file_in,".json.bz2");
186 String page_json_dir = json_dir + "/pages";
187
188 if (_output_dir != null) {
189 ClusterFileIO.createDirectoryAll(_output_dir + "/" + page_json_dir);
190 }
191
192 ids = new ArrayList<String>(ef_num_pages);
193 for (int i = 0; i < ef_page_count; i++) {
194 String formatted_i = String.format("page-%06d", i);
195 String page_id = volume_id + "." + formatted_i;
196
197 if (_verbosity >= 2) {
198 System.out.println(" Page: " + page_id);
199 }
200
201 String output_json_bz2 = page_json_dir +"/" + formatted_i + ".json.bz2";
202 ids.add(page_id);
203
204 if (_verbosity >=2) {
205 if (i==0) {
206 System.out.println("Sample output JSON page file [i=0]: " + output_json_bz2);
207 }
208 }
209 JSONObject ef_page = ef_pages.getJSONObject(i);
210
211 if (ef_page != null) {
212 // Convert to Solr add form
213 JSONObject solr_add_doc_json
214 = SolrDocJSON.generateSolrDocJSON(volume_id, page_id, ef_page, _whitelist_bloomfilter, _icu_tokenize);
215
216
217 if ((_verbosity >=2) && (i==20)) {
218 System.out.println("==================");
219 System.out.println("Sample output Solr add JSON [page 20]: " + solr_add_doc_json.toString());
220 System.out.println("==================");
221 }
222
223
224 if (_solr_url != null) {
225 if ((_verbosity >=2) && (i==20)) {
226 System.out.println("==================");
227 System.out.println("Posting to: " + _solr_url);
228 System.out.println("==================");
229 }
230 SolrDocJSON.postSolrDoc(_solr_url, solr_add_doc_json);
231 }
232
233 if (_output_dir != null) {
234 if ((_verbosity >=2) && (i==20)) {
235 System.out.println("==================");
236 System.out.println("Saving to: " + _output_dir);
237 System.out.println("==================");
238 }
239 SolrDocJSON.saveSolrDoc(solr_add_doc_json, _output_dir + "/" + output_json_bz2);
240 }
241 }
242 else {
243 System.err.println("Skipping: " + page_id);
244 }
245
246 }
247 }
248 else {
249 // File did not exist, or could not be parsed
250 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
251 if (_strict_file_io) {
252 throw new IOException(mess);
253 }
254 else {
255 System.err.println("Warning: " + mess);
256 System.out.println("Warning: " + mess);
257 }
258 }
259
260 return ef_num_pages;
261
262 }
263 */
264}
265
Note: See TracBrowser for help on using the repository browser.