source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java@ 31310

Last change on this file since 31310 was 31310, checked in by davidb, 7 years ago

Initial cut at files for working with MongoDB

  • Property svn:executable set to *
File size: 3.7 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import org.apache.spark.api.java.function.Function;
5import org.apache.spark.util.DoubleAccumulator;
6import org.bson.Document;
7import org.json.JSONObject;
8
9import com.mongodb.MongoClient;
10import com.mongodb.MongoClientURI;
11import com.mongodb.MongoException;
12import com.mongodb.client.MongoCollection;
13import com.mongodb.client.MongoDatabase;
14
15
16class PerVolumeMongoDBDocumentsMap implements Function<String, Integer>
17{
18 private static final long serialVersionUID = 1L;
19
20 protected String _input_dir;
21 protected int _verbosity;
22
23 protected DoubleAccumulator _progress_accum;
24 protected double _progress_step;
25
26 boolean _strict_file_io;
27
28 public PerVolumeMongoDBDocumentsMap(String input_dir, int verbosity,
29 DoubleAccumulator progress_accum, double progress_step,
30 boolean strict_file_io)
31 {
32 _input_dir = input_dir;
33 _verbosity = verbosity;
34
35 _progress_accum = progress_accum;
36 _progress_step = progress_step;
37
38 _strict_file_io = strict_file_io;
39 }
40
41 public Integer call(String json_file_in) throws IOException
42 {
43 try {
44 MongoClientURI mongo_url = new MongoClientURI("mongodb://gc3:27017,gc4:27017,gc5:27017");
45 MongoClient mongoClient = new MongoClient(mongo_url);
46
47 MongoDatabase database = mongoClient.getDatabase("htrc_ef");
48 MongoCollection<Document> collection = database.getCollection("volumes");
49
50 String full_json_file_in = _input_dir + "/" + json_file_in;
51 String extracted_feature_json_doc = ClusterFileIO.readTextFile(full_json_file_in);
52
53 Document doc = Document.parse(extracted_feature_json_doc);
54 collection.insertOne(doc);
55
56 /*
57 //Mongo mongo = new Mongo("localhost", 27017);
58 MongoClient mongo = new MongoClient( "localhost" , 27017 );
59
60 DB db = mongo.getDB("yourdb");
61 DBCollection coll = db.getCollection("dummyColl");
62
63 // convert JSON to DBObject directly
64 DBObject dbObject = (DBObject) JSON
65 .parse("{'name':'mkyong', 'age':30}");
66 coll.insert(dbObject);
67
68
69 DBCursor cursorDoc = coll.find();
70 while (cursorDoc.hasNext()) {
71 System.out.println(cursorDoc.next());
72 }
73
74 System.out.println("Done");
75*/
76 mongoClient.close();
77
78 } catch (MongoException e) {
79 e.printStackTrace();
80 }
81
82 return 1;
83 }
84 public Integer callPageCount(String json_file_in) throws IOException
85 {
86 Integer page_count = 0;
87
88 String full_json_file_in = _input_dir + "/" + json_file_in;
89 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
90
91 if (extracted_feature_record != null) {
92 String volume_id = extracted_feature_record.getString("id");
93
94 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
95
96 if (_verbosity >= 1) {
97 System.out.println("Processing: " + json_file_in);
98 }
99
100 if (ef_features != null) {
101 String page_count_str = ef_features.getString("pageCount");
102 if (!page_count_str.equals("")) {
103 page_count = Integer.parseInt(page_count_str);
104 }
105 else {
106 System.err.println("No 'pageCount' in 'features' in volume id '" + volume_id + "' => defaulting to 0");
107 }
108 }
109 else {
110 System.err.println("No 'features' section in JSON file => Skipping id: " + volume_id);
111 }
112
113 }
114 else {
115 // File did not exist, or could not be parsed
116 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
117 if (_strict_file_io) {
118 throw new IOException(mess);
119 }
120 else {
121 System.err.println("Warning: " + mess);
122 System.out.println("Warning: " + mess);
123 }
124 }
125
126 _progress_accum.add(_progress_step);
127
128 return page_count;
129 }
130
131
132}
133
Note: See TracBrowser for help on using the repository browser.