Context Navigation

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java@ 31315

Last change on this file since 31315 was 31315, checked in by davidb, 7 years ago
Further tweak
Property svn:executable set to `*`
File size: 7.9 KB

Line
1	package org.hathitrust.extractedfeatures;
2
3	import java.io.IOException;
4	import java.io.Reader;
5	import java.io.StringReader;
6	import java.util.ArrayList;
7	import java.util.Iterator;
8	import java.util.List;
9	import java.util.Set;
10
11	import org.apache.lucene.analysis.TokenStream;
12	import org.apache.lucene.analysis.core.LowerCaseFilter;
13	import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
14	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
15	import org.apache.spark.api.java.function.Function;
16	import org.apache.spark.util.DoubleAccumulator;
17	import org.bson.Document;
18	import org.json.JSONArray;
19	import org.json.JSONObject;
20
21	import com.mongodb.MongoClient;
22	import com.mongodb.MongoClientURI;
23	import com.mongodb.MongoException;
24	import com.mongodb.client.MongoCollection;
25	import com.mongodb.client.MongoDatabase;
26
27
28	class PerVolumeMongoDBDocumentsMap implements Function<String, Integer>
29	{
30	private static final long serialVersionUID = 1L;
31
32	protected String _input_dir;
33	protected int _verbosity;
34
35	protected DoubleAccumulator _progress_accum;
36	protected double _progress_step;
37
38	boolean _strict_file_io;
39
40	public PerVolumeMongoDBDocumentsMap(String input_dir, int verbosity,
41	DoubleAccumulator progress_accum, double progress_step,
42	boolean strict_file_io)
43	{
44	_input_dir = input_dir;
45	_verbosity = verbosity;
46
47	_progress_accum = progress_accum;
48	_progress_step = progress_step;
49
50	_strict_file_io = strict_file_io;
51	}
52
53	protected void fixup_section(Document ef_count)
54	{
55
56	Set<String> key_set = ef_count.keySet();
57	String[] key_array = key_set.toArray(new String[key_set.size()]);
58
59
60	//Set<String> key_set = ef_count.keySet();
61	//for (String key : key_set) {
62
63	//Iterator<String> key_iterator = ef_count.keySet().iterator();
64	//while (key_iterator.hasNext()) {
65	for (int i=0; i<key_array.length; i++) {
66
67	String key = key_array[i];
68	//String key = key_iterator.next();
69	if (key.matches("\\.")) {
70	String new_key = key.replaceAll("\\.", "<PERIOD>");
71	ef_count.put(new_key, ef_count.get(key));
72	ef_count.remove(key);
73	key = new_key;
74	}
75
76	if (key.matches("\\$")) {
77	String new_key = key.replaceAll("\\$", "<DOLLAR>");
78	ef_count.put(new_key, ef_count.get(key));
79	ef_count.remove(key);
80	}
81
82	}
83	}
84
85	protected void fixup_page(String volume_id, String page_id, Document ef_page)
86	{
87	if (ef_page != null) {
88	String[] zone_keys = { "header", "body", "footer" };
89
90	for (String zone_key: zone_keys) {
91	Document ef_zone = (Document)ef_page.get(zone_key);
92	if (ef_zone != null) {
93	String[] count_keys = { "beginCharCounts", "endCharCount", "tokenPosCount" };
94
95	for (String count_key: count_keys) {
96	Document ef_sub_section = (Document)ef_zone.get(count_key);
97	if (ef_sub_section != null) {
98	fixup_section(ef_sub_section);
99
100	if (count_key.equals("tokenPostCount")) {
101	Set<String> key_set = ef_sub_section.keySet();
102	for (String key : key_set) {
103	Document token_section = (Document)ef_sub_section.get(key);
104	fixup_section(token_section);
105	}
106	}
107
108
109	}
110	}
111	}
112	}
113	}
114	else {
115	System.err.println("Warning: null page for '" + page_id + "'");
116	}
117
118	}
119	protected void fixup_volume(String json_file_in, Document extracted_feature_record)
120	{
121	String full_json_file_in = _input_dir + "/" + json_file_in;
122
123	if (extracted_feature_record != null) {
124	String volume_id = extracted_feature_record.getString("id");
125	extracted_feature_record.put("_id",volume_id);
126	extracted_feature_record.remove("id");
127
128	Document ef_features = (Document)extracted_feature_record.get("features");
129
130	int ef_page_count = ef_features.getInteger("pageCount");
131
132	if (_verbosity >= 1) {
133	System.out.println("Processing: " + json_file_in);
134	System.out.println(" pageCount = " + ef_page_count);
135	}
136
137	List<Document> ef_pages = (List<Document>)ef_features.get("pages");
138	int ef_num_pages = ef_pages.size();
139	if (ef_num_pages != ef_page_count) {
140	System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
141	+" does not match 'pageCount' metadata (" + ef_page_count + ")");
142	}
143
144	if (_verbosity >= 2) {
145	System.out.print(" Pages: ");
146	}
147
148	for (int i = 0; i < ef_page_count; i++) {
149	String formatted_i = String.format("page-%06d", i);
150	String page_id = volume_id + "." + formatted_i;
151
152	if (_verbosity >= 2) {
153	if (i>0) {
154	System.out.print(", ");
155	}
156	System.out.print(page_id);
157	}
158
159	if (i==(ef_page_count-1)) {
160	if (_verbosity >= 2) {
161	System.out.println();
162	}
163	}
164
165	Document ef_page = (Document)ef_pages.get(i);
166
167	if (ef_page != null) {
168
169	fixup_page(volume_id, page_id, ef_page);
170	}
171	else {
172	System.err.println("Skipping: " + page_id);
173	}
174	}
175	}
176	else {
177	// File did not exist, or could not be parsed
178	String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
179
180	System.err.println("Warning: " + mess);
181	System.out.println("Warning: " + mess);
182
183	}
184	}
185
186	public Integer call(String json_file_in) throws IOException
187	{
188	try {
189	MongoClientURI mongo_url = new MongoClientURI("mongodb://gc3:27017,gc4:27017,gc5:27017");
190	MongoClient mongoClient = new MongoClient(mongo_url);
191
192	MongoDatabase database = mongoClient.getDatabase("htrc_ef");
193	MongoCollection<Document> collection = database.getCollection("volumes");
194
195	String full_json_file_in = _input_dir + "/" + json_file_in;
196	System.out.println("Processing: " + full_json_file_in);
197	String extracted_feature_json_doc = ClusterFileIO.readTextFile(full_json_file_in);
198
199	Document doc = Document.parse(extracted_feature_json_doc);
200
201	fixup_volume(json_file_in,doc);
202
203	collection.insertOne(doc);
204
205	/*
206	//Mongo mongo = new Mongo("localhost", 27017);
207	MongoClient mongo = new MongoClient( "localhost" , 27017 );
208
209	DB db = mongo.getDB("yourdb");
210	DBCollection coll = db.getCollection("dummyColl");
211
212	// convert JSON to DBObject directly
213	DBObject dbObject = (DBObject) JSON
214	.parse("{'name':'mkyong', 'age':30}");
215	coll.insert(dbObject);
216
217
218	DBCursor cursorDoc = coll.find();
219	while (cursorDoc.hasNext()) {
220	System.out.println(cursorDoc.next());
221	}
222
223	System.out.println("Done");
224	*/
225	mongoClient.close();
226
227	} catch (MongoException e) {
228	e.printStackTrace();
229	}
230
231	return 1;
232	}
233	public Integer callPageCount(String json_file_in) throws IOException
234	{
235	Integer page_count = 0;
236
237	String full_json_file_in = _input_dir + "/" + json_file_in;
238	JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
239
240	if (extracted_feature_record != null) {
241	String volume_id = extracted_feature_record.getString("id");
242
243	JSONObject ef_features = extracted_feature_record.getJSONObject("features");
244
245	if (_verbosity >= 1) {
246	System.out.println("Processing: " + json_file_in);
247	}
248
249	if (ef_features != null) {
250	String page_count_str = ef_features.getString("pageCount");
251	if (!page_count_str.equals("")) {
252	page_count = Integer.parseInt(page_count_str);
253	}
254	else {
255	System.err.println("No 'pageCount' in 'features' in volume id '" + volume_id + "' => defaulting to 0");
256	}
257	}
258	else {
259	System.err.println("No 'features' section in JSON file => Skipping id: " + volume_id);
260	}
261
262	}
263	else {
264	// File did not exist, or could not be parsed
265	String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
266	if (_strict_file_io) {
267	throw new IOException(mess);
268	}
269	else {
270	System.err.println("Warning: " + mess);
271	System.out.println("Warning: " + mess);
272	}
273	}
274
275	_progress_accum.add(_progress_step);
276
277	return page_count;
278	}
279
280
281	}
282

Note: See TracBrowser for help on using the repository browser.

Download in other formats: