Context Navigation

source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java@ 31317

Last change on this file since 31317 was 31317, checked in by davidb, 7 years ago
added debug statement
Property svn:executable set to *
File size: 8.0 KB

Line
1	package org.hathitrust.extractedfeatures;
2
3	import java.io.IOException;
4	import java.io.Reader;
5	import java.io.StringReader;
6	import java.util.ArrayList;
7	import java.util.Iterator;
8	import java.util.List;
9	import java.util.Set;
10
11	import org.apache.lucene.analysis.TokenStream;
12	import org.apache.lucene.analysis.core.LowerCaseFilter;
13	import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
14	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
15	import org.apache.spark.api.java.function.Function;
16	import org.apache.spark.util.DoubleAccumulator;
17	import org.bson.Document;
18	import org.json.JSONArray;
19	import org.json.JSONObject;
20
21	import com.mongodb.MongoClient;
22	import com.mongodb.MongoClientURI;
23	import com.mongodb.MongoException;
24	import com.mongodb.client.MongoCollection;
25	import com.mongodb.client.MongoDatabase;
26
27
28	class PerVolumeMongoDBDocumentsMap implements Function<String, Integer>
29	{
30	private static final long serialVersionUID = 1L;
31
32	protected String _input_dir;
33	protected int _verbosity;
34
35	protected DoubleAccumulator _progress_accum;
36	protected double _progress_step;
37
38	boolean _strict_file_io;
39
40	public PerVolumeMongoDBDocumentsMap(String input_dir, int verbosity,
41	DoubleAccumulator progress_accum, double progress_step,
42	boolean strict_file_io)
43	{
44	_input_dir = input_dir;
45	_verbosity = verbosity;
46
47	_progress_accum = progress_accum;
48	_progress_step = progress_step;
49
50	_strict_file_io = strict_file_io;
51	}
52
53	protected void fixup_section(Document ef_count)
54	{
55
56	Set<String> key_set = ef_count.keySet();
57	String[] key_array = key_set.toArray(new String[key_set.size()]);
58
59
60	//Set<String> key_set = ef_count.keySet();
61	//for (String key : key_set) {
62
63	//Iterator<String> key_iterator = ef_count.keySet().iterator();
64	//while (key_iterator.hasNext()) {
65	for (int i=0; i<key_array.length; i++) {
66
67	String key = key_array[i];
68	//String key = key_iterator.next();
69	if (key.matches("\\.")) {
70	String new_key = key.replaceAll("\\.", "<PERIOD>");
71	System.out.println("**** old key:" + key + "=> new key:" + new_key);
72	ef_count.put(new_key, ef_count.get(key));
73	ef_count.remove(key);
74	key = new_key;
75	}
76
77	if (key.matches("\\$")) {
78	String new_key = key.replaceAll("\\$", "<DOLLAR>");
79	ef_count.put(new_key, ef_count.get(key));
80	ef_count.remove(key);
81	}
82
83	}
84	}
85
86	protected void fixup_page(String volume_id, String page_id, Document ef_page)
87	{
88	if (ef_page != null) {
89	String[] zone_keys = { "header", "body", "footer" };
90
91	for (String zone_key: zone_keys) {
92	Document ef_zone = (Document)ef_page.get(zone_key);
93	if (ef_zone != null) {
94	String[] count_keys = { "beginCharCounts", "endCharCount", "tokenPosCount" };
95
96	for (String count_key: count_keys) {
97	Document ef_sub_section = (Document)ef_zone.get(count_key);
98	if (ef_sub_section != null) {
99	fixup_section(ef_sub_section);
100
101	if (count_key.equals("tokenPosCount")) {
102	Set<String> key_set = ef_sub_section.keySet();
103	for (String key : key_set) {
104	Document token_section = (Document)ef_sub_section.get(key);
105	fixup_section(token_section);
106	}
107	}
108
109
110	}
111	}
112	}
113	}
114	}
115	else {
116	System.err.println("Warning: null page for '" + page_id + "'");
117	}
118
119	}
120	protected void fixup_volume(String json_file_in, Document extracted_feature_record)
121	{
122	String full_json_file_in = _input_dir + "/" + json_file_in;
123
124	if (extracted_feature_record != null) {
125	String volume_id = extracted_feature_record.getString("id");
126	extracted_feature_record.put("_id",volume_id);
127	extracted_feature_record.remove("id");
128
129	Document ef_features = (Document)extracted_feature_record.get("features");
130
131	int ef_page_count = ef_features.getInteger("pageCount");
132
133	if (_verbosity >= 1) {
134	System.out.println("Processing: " + json_file_in);
135	System.out.println(" pageCount = " + ef_page_count);
136	}
137
138	List<Document> ef_pages = (List<Document>)ef_features.get("pages");
139	int ef_num_pages = ef_pages.size();
140	if (ef_num_pages != ef_page_count) {
141	System.err.println("Warning: number of page elements in JSON (" + ef_num_pages + ")"
142	+" does not match 'pageCount' metadata (" + ef_page_count + ")");
143	}
144
145	if (_verbosity >= 2) {
146	System.out.print(" Pages: ");
147	}
148
149	for (int i = 0; i < ef_page_count; i++) {
150	String formatted_i = String.format("page-%06d", i);
151	String page_id = volume_id + "." + formatted_i;
152
153	if (_verbosity >= 2) {
154	if (i>0) {
155	System.out.print(", ");
156	}
157	System.out.print(page_id);
158	}
159
160	if (i==(ef_page_count-1)) {
161	if (_verbosity >= 2) {
162	System.out.println();
163	}
164	}
165
166	Document ef_page = (Document)ef_pages.get(i);
167
168	if (ef_page != null) {
169
170	fixup_page(volume_id, page_id, ef_page);
171	}
172	else {
173	System.err.println("Skipping: " + page_id);
174	}
175	}
176	}
177	else {
178	// File did not exist, or could not be parsed
179	String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
180
181	System.err.println("Warning: " + mess);
182	System.out.println("Warning: " + mess);
183
184	}
185	}
186
187	public Integer call(String json_file_in) throws IOException
188	{
189	try {
190	MongoClientURI mongo_url = new MongoClientURI("mongodb://gc3:27017,gc4:27017,gc5:27017");
191	MongoClient mongoClient = new MongoClient(mongo_url);
192
193	MongoDatabase database = mongoClient.getDatabase("htrc_ef");
194	MongoCollection<Document> collection = database.getCollection("volumes");
195
196	String full_json_file_in = _input_dir + "/" + json_file_in;
197	System.out.println("Processing: " + full_json_file_in);
198	String extracted_feature_json_doc = ClusterFileIO.readTextFile(full_json_file_in);
199
200	Document doc = Document.parse(extracted_feature_json_doc);
201
202	fixup_volume(json_file_in,doc);
203
204	collection.insertOne(doc);
205
206	/*
207	//Mongo mongo = new Mongo("localhost", 27017);
208	MongoClient mongo = new MongoClient( "localhost" , 27017 );
209
210	DB db = mongo.getDB("yourdb");
211	DBCollection coll = db.getCollection("dummyColl");
212
213	// convert JSON to DBObject directly
214	DBObject dbObject = (DBObject) JSON
215	.parse("{'name':'mkyong', 'age':30}");
216	coll.insert(dbObject);
217
218
219	DBCursor cursorDoc = coll.find();
220	while (cursorDoc.hasNext()) {
221	System.out.println(cursorDoc.next());
222	}
223
224	System.out.println("Done");
225	*/
226	mongoClient.close();
227
228	} catch (MongoException e) {
229	e.printStackTrace();
230	}
231
232	return 1;
233	}
234	public Integer callPageCount(String json_file_in) throws IOException
235	{
236	Integer page_count = 0;
237
238	String full_json_file_in = _input_dir + "/" + json_file_in;
239	JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
240
241	if (extracted_feature_record != null) {
242	String volume_id = extracted_feature_record.getString("id");
243
244	JSONObject ef_features = extracted_feature_record.getJSONObject("features");
245
246	if (_verbosity >= 1) {
247	System.out.println("Processing: " + json_file_in);
248	}
249
250	if (ef_features != null) {
251	String page_count_str = ef_features.getString("pageCount");
252	if (!page_count_str.equals("")) {
253	page_count = Integer.parseInt(page_count_str);
254	}
255	else {
256	System.err.println("No 'pageCount' in 'features' in volume id '" + volume_id + "' => defaulting to 0");
257	}
258	}
259	else {
260	System.err.println("No 'features' section in JSON file => Skipping id: " + volume_id);
261	}
262
263	}
264	else {
265	// File did not exist, or could not be parsed
266	String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
267	if (_strict_file_io) {
268	throw new IOException(mess);
269	}
270	else {
271	System.err.println("Warning: " + mess);
272	System.out.println("Warning: " + mess);
273	}
274	}
275
276	_progress_accum.add(_progress_step);
277
278	return page_count;
279	}
280
281
282	}
283

Note: See TracBrowser for help on using the repository browser.

Download in other formats: