source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java@ 31315

Last change on this file since 31315 was 31315, checked in by davidb, 7 years ago

Further tweak

  • Property svn:executable set to *
File size: 7.9 KB
package org.hathitrust.extractedfeatures;

import java.io.IOException;
import java.util.List;
import java.util.Set;

import org.apache.spark.api.java.function.Function;
import org.apache.spark.util.DoubleAccumulator;
import org.bson.Document;
import org.json.JSONObject;

import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.MongoException;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;

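// Spark map function: reads one HathiTrust Extracted Features JSON file,
// rewrites any keys that MongoDB cannot store, and inserts the volume as a
// single document into the 'htrc_ef' database.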
class PerVolumeMongoDBDocumentsMap implements Function<String, Integer>
{
    private static final long serialVersionUID = 1L;

    protected String _input_dir;
    protected int _verbosity;

    protected DoubleAccumulator _progress_accum;
    protected double _progress_step;

    protected boolean _strict_file_io;

    public PerVolumeMongoDBDocumentsMap(String input_dir, int verbosity,
                                        DoubleAccumulator progress_accum, double progress_step,
                                        boolean strict_file_io)
    {
        _input_dir = input_dir;
        _verbosity = verbosity;

        _progress_accum = progress_accum;
        _progress_step = progress_step;

        _strict_file_io = strict_file_io;
    }

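    // MongoDB rejects field names that contain '.' or start with '$';
    // Extracted Features token keys are arbitrary text, so offending keys
    // are rewritten with placeholder markers before insertion.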
    protected void fixup_section(Document ef_count)
    {
        // Snapshot the key set as an array so entries can be replaced mid-loop
        Set<String> key_set = ef_count.keySet();
        String[] key_array = key_set.toArray(new String[key_set.size()]);

        for (int i = 0; i < key_array.length; i++) {
            String key = key_array[i];

            if (key.contains(".")) {
                String new_key = key.replaceAll("\\.", "<PERIOD>");
                ef_count.put(new_key, ef_count.get(key));
                ef_count.remove(key);
                key = new_key;
            }

            if (key.contains("$")) {
                String new_key = key.replaceAll("\\$", "<DOLLAR>");
                ef_count.put(new_key, ef_count.get(key));
                ef_count.remove(key);
            }
        }
    }

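    // Walk one page's header/body/footer zones and apply fixup_section() to
    // each count table, including the per-token POS maps nested inside
    // 'tokenPosCount'.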
    protected void fixup_page(String volume_id, String page_id, Document ef_page)
    {
        if (ef_page != null) {
            String[] zone_keys = { "header", "body", "footer" };

            for (String zone_key: zone_keys) {
                Document ef_zone = (Document)ef_page.get(zone_key);
                if (ef_zone != null) {
                    String[] count_keys = { "beginCharCounts", "endCharCount", "tokenPosCount" };

                    for (String count_key: count_keys) {
                        Document ef_sub_section = (Document)ef_zone.get(count_key);
                        if (ef_sub_section != null) {
                            fixup_section(ef_sub_section);

                            // 'tokenPosCount' maps each token to a nested
                            // POS-tag => count document, so fix those keys too
                            if (count_key.equals("tokenPosCount")) {
                                Set<String> key_set = ef_sub_section.keySet();
                                for (String key : key_set) {
                                    Document token_section = (Document)ef_sub_section.get(key);
                                    fixup_section(token_section);
                                }
                            }
                        }
                    }
                }
            }
        }
        else {
            System.err.println("Warning: null page for '" + page_id + "'");
        }
    }
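
    // Normalize one volume record: use the HathiTrust volume id as the
    // MongoDB '_id', then fix up the keys of every page's count tables.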
    protected void fixup_volume(String json_file_in, Document extracted_feature_record)
    {
        String full_json_file_in = _input_dir + "/" + json_file_in;

        if (extracted_feature_record != null) {
            // MongoDB uses '_id' as its primary key, so move the volume id there
            String volume_id = extracted_feature_record.getString("id");
            extracted_feature_record.put("_id", volume_id);
            extracted_feature_record.remove("id");

            Document ef_features = (Document)extracted_feature_record.get("features");

            int ef_page_count = ef_features.getInteger("pageCount");

            if (_verbosity >= 1) {
                System.out.println("Processing: " + json_file_in);
                System.out.println("  pageCount = " + ef_page_count);
            }

            List<Document> ef_pages = (List<Document>)ef_features.get("pages");
            int ef_num_pages = ef_pages.size();
            if (ef_num_pages != ef_page_count) {
                System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
                        + " does not match 'pageCount' metadata (" + ef_page_count + ")");
            }

            if (_verbosity >= 2) {
                System.out.print("  Pages: ");
            }

            // Iterate over the pages actually present, so a short 'pages'
            // array cannot trigger an IndexOutOfBoundsException
            for (int i = 0; i < ef_num_pages; i++) {
                String formatted_i = String.format("page-%06d", i);
                String page_id = volume_id + "." + formatted_i;

                if (_verbosity >= 2) {
                    if (i > 0) {
                        System.out.print(", ");
                    }
                    System.out.print(page_id);

                    if (i == (ef_num_pages - 1)) {
                        System.out.println();
                    }
                }

                Document ef_page = ef_pages.get(i);

                if (ef_page != null) {
                    fixup_page(volume_id, page_id, ef_page);
                }
                else {
                    System.err.println("Skipping: " + page_id);
                }
            }
        }
        else {
            // File did not exist, or could not be parsed
            String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";

            System.err.println("Warning: " + mess);
            System.out.println("Warning: " + mess);
        }
    }

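    // Spark 'map' entry point: one call per input JSON file. Connects to the
    // MongoDB replica set, parses and fixes up the volume, and inserts it as
    // a single document.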
    public Integer call(String json_file_in) throws IOException
    {
        MongoClient mongoClient = null;

        try {
            MongoClientURI mongo_url = new MongoClientURI("mongodb://gc3:27017,gc4:27017,gc5:27017");
            mongoClient = new MongoClient(mongo_url);

            MongoDatabase database = mongoClient.getDatabase("htrc_ef");
            MongoCollection<Document> collection = database.getCollection("volumes");

            String full_json_file_in = _input_dir + "/" + json_file_in;
            System.out.println("Processing: " + full_json_file_in);
            String extracted_feature_json_doc = ClusterFileIO.readTextFile(full_json_file_in);

            Document doc = Document.parse(extracted_feature_json_doc);

            fixup_volume(json_file_in, doc);

            collection.insertOne(doc);
        }
        catch (MongoException e) {
            e.printStackTrace();
        }
        finally {
            // Ensure the connection is released even if the insert fails
            if (mongoClient != null) {
                mongoClient.close();
            }
        }

        return 1;
    }
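
    // Reads a volume's JSON with org.json and returns its 'pageCount' value,
    // updating the shared progress accumulator; no MongoDB interaction.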
    public Integer callPageCount(String json_file_in) throws IOException
    {
        Integer page_count = 0;

        String full_json_file_in = _input_dir + "/" + json_file_in;
        JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);

        if (extracted_feature_record != null) {
            String volume_id = extracted_feature_record.getString("id");

            // optJSONObject() returns null when 'features' is missing,
            // whereas getJSONObject() would throw
            JSONObject ef_features = extracted_feature_record.optJSONObject("features");

            if (_verbosity >= 1) {
                System.out.println("Processing: " + json_file_in);
            }

            if (ef_features != null) {
                // 'pageCount' is stored as a number, not a string
                if (ef_features.has("pageCount")) {
                    page_count = ef_features.getInt("pageCount");
                }
                else {
                    System.err.println("No 'pageCount' in 'features' in volume id '" + volume_id + "' => defaulting to 0");
                }
            }
            else {
                System.err.println("No 'features' section in JSON file => Skipping id: " + volume_id);
            }
        }
        else {
            // File did not exist, or could not be parsed
            String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
            if (_strict_file_io) {
                throw new IOException(mess);
            }
            else {
                System.err.println("Warning: " + mess);
                System.out.println("Warning: " + mess);
            }
        }

        _progress_accum.add(_progress_step);

        return page_count;
    }
}
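
For context, a minimal driver sketch showing how this map function might be wired into a Spark job. This is not part of the repository file: the app name, input directory, file list, and progress-step value are all illustrative assumptions; only the PerVolumeMongoDBDocumentsMap constructor and call() signature come from the code above.

// Hypothetical driver sketch; lives in the same package because the
// mapped class is package-private.
package org.hathitrust.extractedfeatures;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.DoubleAccumulator;

public class IngestDriverSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("EF MongoDB Ingest");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // Hypothetical list of per-volume JSON files, relative to input_dir
        List<String> json_files = Arrays.asList("vol-0001.json.bz2", "vol-0002.json.bz2");
        JavaRDD<String> json_rdd = jsc.parallelize(json_files);

        DoubleAccumulator progress_accum = jsc.sc().doubleAccumulator("Progress");
        double progress_step = 100.0 / json_files.size();

        PerVolumeMongoDBDocumentsMap doc_map =
            new PerVolumeMongoDBDocumentsMap("hdfs:///user/htrc/json-files", 1,
                                             progress_accum, progress_step, false);

        // map() runs call() once per file; count() forces evaluation
        long num_processed = json_rdd.map(doc_map).count();

        System.out.println("Processed " + num_processed + " volumes");
        jsc.close();
    }
}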