source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java@ 31317

Last change on this file since 31317 was 31317, checked in by davidb, 7 years ago

added debug statement

  • Property svn:executable set to *
File size: 8.0 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import java.io.Reader;
5import java.io.StringReader;
6import java.util.ArrayList;
7import java.util.Iterator;
8import java.util.List;
9import java.util.Set;
10
11import org.apache.lucene.analysis.TokenStream;
12import org.apache.lucene.analysis.core.LowerCaseFilter;
13import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
14import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
15import org.apache.spark.api.java.function.Function;
16import org.apache.spark.util.DoubleAccumulator;
17import org.bson.Document;
18import org.json.JSONArray;
19import org.json.JSONObject;
20
21import com.mongodb.MongoClient;
22import com.mongodb.MongoClientURI;
23import com.mongodb.MongoException;
24import com.mongodb.client.MongoCollection;
25import com.mongodb.client.MongoDatabase;
26
27
28class PerVolumeMongoDBDocumentsMap implements Function<String, Integer>
29{
30 private static final long serialVersionUID = 1L;
31
32 protected String _input_dir;
33 protected int _verbosity;
34
35 protected DoubleAccumulator _progress_accum;
36 protected double _progress_step;
37
38 boolean _strict_file_io;
39
40 public PerVolumeMongoDBDocumentsMap(String input_dir, int verbosity,
41 DoubleAccumulator progress_accum, double progress_step,
42 boolean strict_file_io)
43 {
44 _input_dir = input_dir;
45 _verbosity = verbosity;
46
47 _progress_accum = progress_accum;
48 _progress_step = progress_step;
49
50 _strict_file_io = strict_file_io;
51 }
52
53 protected void fixup_section(Document ef_count)
54 {
55
56 Set<String> key_set = ef_count.keySet();
57 String[] key_array = key_set.toArray(new String[key_set.size()]);
58
59
60 //Set<String> key_set = ef_count.keySet();
61 //for (String key : key_set) {
62
63 //Iterator<String> key_iterator = ef_count.keySet().iterator();
64 //while (key_iterator.hasNext()) {
65 for (int i=0; i<key_array.length; i++) {
66
67 String key = key_array[i];
68 //String key = key_iterator.next();
69 if (key.matches("\\.")) {
70 String new_key = key.replaceAll("\\.", "<PERIOD>");
71 System.out.println("**** old key:" + key + "=> new key:" + new_key);
72 ef_count.put(new_key, ef_count.get(key));
73 ef_count.remove(key);
74 key = new_key;
75 }
76
77 if (key.matches("\\$")) {
78 String new_key = key.replaceAll("\\$", "<DOLLAR>");
79 ef_count.put(new_key, ef_count.get(key));
80 ef_count.remove(key);
81 }
82
83 }
84 }
85
86 protected void fixup_page(String volume_id, String page_id, Document ef_page)
87 {
88 if (ef_page != null) {
89 String[] zone_keys = { "header", "body", "footer" };
90
91 for (String zone_key: zone_keys) {
92 Document ef_zone = (Document)ef_page.get(zone_key);
93 if (ef_zone != null) {
94 String[] count_keys = { "beginCharCounts", "endCharCount", "tokenPosCount" };
95
96 for (String count_key: count_keys) {
97 Document ef_sub_section = (Document)ef_zone.get(count_key);
98 if (ef_sub_section != null) {
99 fixup_section(ef_sub_section);
100
101 if (count_key.equals("tokenPosCount")) {
102 Set<String> key_set = ef_sub_section.keySet();
103 for (String key : key_set) {
104 Document token_section = (Document)ef_sub_section.get(key);
105 fixup_section(token_section);
106 }
107 }
108
109
110 }
111 }
112 }
113 }
114 }
115 else {
116 System.err.println("Warning: null page for '" + page_id + "'");
117 }
118
119 }
120 protected void fixup_volume(String json_file_in, Document extracted_feature_record)
121 {
122 String full_json_file_in = _input_dir + "/" + json_file_in;
123
124 if (extracted_feature_record != null) {
125 String volume_id = extracted_feature_record.getString("id");
126 extracted_feature_record.put("_id",volume_id);
127 extracted_feature_record.remove("id");
128
129 Document ef_features = (Document)extracted_feature_record.get("features");
130
131 int ef_page_count = ef_features.getInteger("pageCount");
132
133 if (_verbosity >= 1) {
134 System.out.println("Processing: " + json_file_in);
135 System.out.println(" pageCount = " + ef_page_count);
136 }
137
138 List<Document> ef_pages = (List<Document>)ef_features.get("pages");
139 int ef_num_pages = ef_pages.size();
140 if (ef_num_pages != ef_page_count) {
141 System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
142 +" does not match 'pageCount' metadata (" + ef_page_count + ")");
143 }
144
145 if (_verbosity >= 2) {
146 System.out.print(" Pages: ");
147 }
148
149 for (int i = 0; i < ef_page_count; i++) {
150 String formatted_i = String.format("page-%06d", i);
151 String page_id = volume_id + "." + formatted_i;
152
153 if (_verbosity >= 2) {
154 if (i>0) {
155 System.out.print(", ");
156 }
157 System.out.print(page_id);
158 }
159
160 if (i==(ef_page_count-1)) {
161 if (_verbosity >= 2) {
162 System.out.println();
163 }
164 }
165
166 Document ef_page = (Document)ef_pages.get(i);
167
168 if (ef_page != null) {
169
170 fixup_page(volume_id, page_id, ef_page);
171 }
172 else {
173 System.err.println("Skipping: " + page_id);
174 }
175 }
176 }
177 else {
178 // File did not exist, or could not be parsed
179 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
180
181 System.err.println("Warning: " + mess);
182 System.out.println("Warning: " + mess);
183
184 }
185 }
186
187 public Integer call(String json_file_in) throws IOException
188 {
189 try {
190 MongoClientURI mongo_url = new MongoClientURI("mongodb://gc3:27017,gc4:27017,gc5:27017");
191 MongoClient mongoClient = new MongoClient(mongo_url);
192
193 MongoDatabase database = mongoClient.getDatabase("htrc_ef");
194 MongoCollection<Document> collection = database.getCollection("volumes");
195
196 String full_json_file_in = _input_dir + "/" + json_file_in;
197 System.out.println("Processing: " + full_json_file_in);
198 String extracted_feature_json_doc = ClusterFileIO.readTextFile(full_json_file_in);
199
200 Document doc = Document.parse(extracted_feature_json_doc);
201
202 fixup_volume(json_file_in,doc);
203
204 collection.insertOne(doc);
205
206 /*
207 //Mongo mongo = new Mongo("localhost", 27017);
208 MongoClient mongo = new MongoClient( "localhost" , 27017 );
209
210 DB db = mongo.getDB("yourdb");
211 DBCollection coll = db.getCollection("dummyColl");
212
213 // convert JSON to DBObject directly
214 DBObject dbObject = (DBObject) JSON
215 .parse("{'name':'mkyong', 'age':30}");
216 coll.insert(dbObject);
217
218
219 DBCursor cursorDoc = coll.find();
220 while (cursorDoc.hasNext()) {
221 System.out.println(cursorDoc.next());
222 }
223
224 System.out.println("Done");
225*/
226 mongoClient.close();
227
228 } catch (MongoException e) {
229 e.printStackTrace();
230 }
231
232 return 1;
233 }
234 public Integer callPageCount(String json_file_in) throws IOException
235 {
236 Integer page_count = 0;
237
238 String full_json_file_in = _input_dir + "/" + json_file_in;
239 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
240
241 if (extracted_feature_record != null) {
242 String volume_id = extracted_feature_record.getString("id");
243
244 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
245
246 if (_verbosity >= 1) {
247 System.out.println("Processing: " + json_file_in);
248 }
249
250 if (ef_features != null) {
251 String page_count_str = ef_features.getString("pageCount");
252 if (!page_count_str.equals("")) {
253 page_count = Integer.parseInt(page_count_str);
254 }
255 else {
256 System.err.println("No 'pageCount' in 'features' in volume id '" + volume_id + "' => defaulting to 0");
257 }
258 }
259 else {
260 System.err.println("No 'features' section in JSON file => Skipping id: " + volume_id);
261 }
262
263 }
264 else {
265 // File did not exist, or could not be parsed
266 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
267 if (_strict_file_io) {
268 throw new IOException(mess);
269 }
270 else {
271 System.err.println("Warning: " + mess);
272 System.out.println("Warning: " + mess);
273 }
274 }
275
276 _progress_accum.add(_progress_step);
277
278 return page_count;
279 }
280
281
282}
283
Note: See TracBrowser for help on using the repository browser.