source: other-projects/hathitrust/wcsa/extracted-features-solr/trunk/solr-ingest/src/main/java/org/hathitrust/extractedfeatures/PerVolumeMongoDBDocumentsMap.java@ 31311

Last change on this file since 31311 was 31311, checked in by davidb, 7 years ago

Processing print statement added

  • Property svn:executable set to *
File size: 3.8 KB
Line 
1package org.hathitrust.extractedfeatures;
2
3import java.io.IOException;
4import org.apache.spark.api.java.function.Function;
5import org.apache.spark.util.DoubleAccumulator;
6import org.bson.Document;
7import org.json.JSONObject;
8
9import com.mongodb.MongoClient;
10import com.mongodb.MongoClientURI;
11import com.mongodb.MongoException;
12import com.mongodb.client.MongoCollection;
13import com.mongodb.client.MongoDatabase;
14
15
16class PerVolumeMongoDBDocumentsMap implements Function<String, Integer>
17{
18 private static final long serialVersionUID = 1L;
19
20 protected String _input_dir;
21 protected int _verbosity;
22
23 protected DoubleAccumulator _progress_accum;
24 protected double _progress_step;
25
26 boolean _strict_file_io;
27
28 public PerVolumeMongoDBDocumentsMap(String input_dir, int verbosity,
29 DoubleAccumulator progress_accum, double progress_step,
30 boolean strict_file_io)
31 {
32 _input_dir = input_dir;
33 _verbosity = verbosity;
34
35 _progress_accum = progress_accum;
36 _progress_step = progress_step;
37
38 _strict_file_io = strict_file_io;
39 }
40
41 public Integer call(String json_file_in) throws IOException
42 {
43 try {
44 MongoClientURI mongo_url = new MongoClientURI("mongodb://gc3:27017,gc4:27017,gc5:27017");
45 MongoClient mongoClient = new MongoClient(mongo_url);
46
47 MongoDatabase database = mongoClient.getDatabase("htrc_ef");
48 MongoCollection<Document> collection = database.getCollection("volumes");
49
50 String full_json_file_in = _input_dir + "/" + json_file_in;
51 System.out.println("Processing: " + full_json_file_in);
52 String extracted_feature_json_doc = ClusterFileIO.readTextFile(full_json_file_in);
53
54 Document doc = Document.parse(extracted_feature_json_doc);
55 collection.insertOne(doc);
56
57 /*
58 //Mongo mongo = new Mongo("localhost", 27017);
59 MongoClient mongo = new MongoClient( "localhost" , 27017 );
60
61 DB db = mongo.getDB("yourdb");
62 DBCollection coll = db.getCollection("dummyColl");
63
64 // convert JSON to DBObject directly
65 DBObject dbObject = (DBObject) JSON
66 .parse("{'name':'mkyong', 'age':30}");
67 coll.insert(dbObject);
68
69
70 DBCursor cursorDoc = coll.find();
71 while (cursorDoc.hasNext()) {
72 System.out.println(cursorDoc.next());
73 }
74
75 System.out.println("Done");
76*/
77 mongoClient.close();
78
79 } catch (MongoException e) {
80 e.printStackTrace();
81 }
82
83 return 1;
84 }
85 public Integer callPageCount(String json_file_in) throws IOException
86 {
87 Integer page_count = 0;
88
89 String full_json_file_in = _input_dir + "/" + json_file_in;
90 JSONObject extracted_feature_record = JSONClusterFileIO.readJSONFile(full_json_file_in);
91
92 if (extracted_feature_record != null) {
93 String volume_id = extracted_feature_record.getString("id");
94
95 JSONObject ef_features = extracted_feature_record.getJSONObject("features");
96
97 if (_verbosity >= 1) {
98 System.out.println("Processing: " + json_file_in);
99 }
100
101 if (ef_features != null) {
102 String page_count_str = ef_features.getString("pageCount");
103 if (!page_count_str.equals("")) {
104 page_count = Integer.parseInt(page_count_str);
105 }
106 else {
107 System.err.println("No 'pageCount' in 'features' in volume id '" + volume_id + "' => defaulting to 0");
108 }
109 }
110 else {
111 System.err.println("No 'features' section in JSON file => Skipping id: " + volume_id);
112 }
113
114 }
115 else {
116 // File did not exist, or could not be parsed
117 String mess = "Failed to read in bzipped JSON file '" + full_json_file_in + "'";
118 if (_strict_file_io) {
119 throw new IOException(mess);
120 }
121 else {
122 System.err.println("Warning: " + mess);
123 System.out.println("Warning: " + mess);
124 }
125 }
126
127 _progress_accum.add(_progress_step);
128
129 return page_count;
130 }
131
132
133}
134
Note: See TracBrowser for help on using the repository browser.