source: other-projects/maori-lang-detection/src/org/greenstone/atea/MongoDBAccess.java@ 33906

Last change on this file since 33906 was 33906, checked in by ak19, 4 years ago

Code is in an intermediate state. 1. Introduced a basicDomain field to MongoDB and recreated the MongoDB tables/collections; this will help discount duplicated domains under http and https, with and without www. Though webpage URLs may potentially still be unique and not duplicated across all 4 possible variants, I want them counted under the same base domain name. 2. Another issue noticed now is that some of the sites appear to be hosted on servers in multiple countries, and so slightly different country code counts and domain listings are returned. 3. So added code modifications (untested) to sort the domains alphabetically after stripping protocol and www, to allow comparing the old domainListing results of MongoDB's now renamed oldWebsites and oldWebpages collections to the new versions of these collections, and to then update the differences in manual counts.

File size: 27.7 KB
Line 
1package org.greenstone.atea;
2
3//import org.bson.BSONObject;
4
5import com.mongodb.client.AggregateIterable;
6import com.mongodb.client.MongoCollection;
7import com.mongodb.client.MongoDatabase;
8//import com.mongodb.client.MongoIterable;
9
10// to use collection.find() filters like eq(), regex() etc
11import static com.mongodb.client.model.Filters.*;
12// to use collection.find().projection() filters like include() etc
13import static com.mongodb.client.model.Projections.*;
14// to use aggregation functions like unwind(), match(), sort() etc
15import static com.mongodb.client.model.Aggregates.*;
16// to use functions like sum() and addToSet() within aggregation functions
17import static com.mongodb.client.model.Accumulators.*;
18
19//import org.bson.conversions.Bson;
20import com.mongodb.BasicDBObject;
21import com.mongodb.MongoClient;
22import com.mongodb.MongoCredential;
23import com.mongodb.ServerAddress;
24import com.mongodb.MongoClientOptions;
25
26import com.mongodb.Block;
27
28import org.bson.BsonArray;
29import org.bson.BsonString;
30import org.bson.Document;
31import org.bson.conversions.Bson;
32import org.bson.json.JsonMode;
33import org.bson.json.JsonWriterSettings;
34
35import com.mongodb.util.JSON;
36//import com.mongodb.DBObject;
37
38
39import com.google.gson.*; // for pretty printing
40
41import java.io.BufferedReader;
42import java.io.BufferedWriter;
43import java.io.File;
44import java.io.FileReader;
45import java.io.FileWriter;
46import java.io.IOException;
47import java.io.UncheckedIOException;
48import java.io.Writer;
49
50import java.util.Arrays;
51import java.util.ArrayList;
52import java.util.List;
53import java.util.Properties;
54import java.util.regex.Pattern;
55
56import org.apache.log4j.Logger;
57
58import org.greenstone.atea.morphia.*;
59import dev.morphia.*;
60
61import org.apache.commons.csv.*;
62
63/**
64 * https://www.tutorialspoint.com/mongodb/mongodb_java.htm
65 *
66 * TO COMPILE:
67 * maori-lang-detection/src$
68 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/MongoDBAccess.java
69 *
70 * TO RUN:
71 * java -cp ".:../conf:../lib/*" org.greenstone.atea.MongoDBAccess
72 *
73 * Manually connecting to mongodb from client:
74 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017' -u USERNAME -p
75 * Then after connecting with pwd, type:
76 * use DBNAME
77 *
78 * Or connect to mongodb and specify db in one statement:
79 * mongo 'mongodb://mongodb.cms.waikato.ac.nz:27017/DBNAME?authSource=admin' -u USERNAME -p
80 *
81 * Some links:
82 * - https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
83 * - https://docs.mongodb.com/manual/reference/glossary/ (particularly "collection")
84 * - https://tecadmin.net/tutorial/mongodb/drop-collection/
85 * IMPORTANT LINK:
86 * - https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
87 *
88 * API:
89 * - https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/MongoCollection.html#find--
90 * - examples: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
91 */
92public class MongoDBAccess implements AutoCloseable {
93
94 private static Logger logger = Logger.getLogger(org.greenstone.atea.MongoDBAccess.class.getName());
95
96 static final String PROPS_FILENAME = "config.properties";
97 public static final String WEBPAGES_COLLECTION = "Webpages";
98 public static final String WEBSITES_COLLECTION = "Websites";
99
100 public static final String NEWLINE = System.getProperty("line.separator");
101
102 /** mongodb filter types to execute */
103 public static final int IS_MRI = 0;
104 public static final int CONTAINS_MRI = 1;
105
106 /** Some reused fieldnames in the Websites collection */
107 private static final String FILTER_NUMPAGES_IN_MRI = "numPagesInMRI";
108 private static final String FILTER_NUMPAGES_CONTAINING_MRI = "numPagesContainingMRI";
109
110 // configuration details, some with fallback values
111 private String HOST = "localhost";
112 private int PORT = 27017; // mongodb port
113 private String USERNAME;
114 private String PASSWORD;
115 private String DB_NAME ="ateacrawldata";
116
117 private MongoClient mongo = null;
118 private MongoDatabase database = null;
119
120 /**
121 * Mongodb Client handle via morphia, which handles the ODM (object document mapper)
122 * for MongoDB
123 */
124 public Datastore datastore = null;
125
126 public MongoDBAccess() throws Exception {
127 boolean success = false;
128
129 // Read in the username and password from our props file
130 Properties props = new Properties();
131
132 //File propsFile = new File(PROPS_FILENAME);
133 //logger.debug("*** Conf props filename: " + propsFile.getAbsolutePath());
134 try {
135 props.load(getClass().getClassLoader().getResourceAsStream(PROPS_FILENAME));
136 } catch(Exception e) {
137 logger.error(e);
138 }
139
140
141 USERNAME = props.getProperty("mongodb.user", "");
142 if(USERNAME.equals("")) {
143 USERNAME = "root";
144 logger.warn("WARNING: No sensible value for mongodb.user specified in " + PROPS_FILENAME + ". Attempting to use: " + USERNAME);
145 }
146 PASSWORD = props.getProperty("mongodb.pwd");
147
148 logger.debug("Got pwd: " + PASSWORD);
149
150 if(PASSWORD != null && PASSWORD.equals("CHANGEME")) {
151
152 success = false;
153 throw new Exception("************ FATAL ERROR: Change DB password in properties file " + PROPS_FILENAME);
154 }
155
156 HOST = props.getProperty("mongodb.host", HOST);
157 String port = props.getProperty("mongodb.port", Integer.toString(PORT));
158 PORT = Integer.parseInt(port);
159 DB_NAME = props.getProperty("mongodb.dbname", DB_NAME);
160
161 logger.info("Connecting to mongodb with:");
162 logger.info(" - host: " + HOST);
163 logger.info(" - port: " + PORT);
164 logger.info(" - user: " + USERNAME);
165 logger.info(" - db name: " + DB_NAME);
166 }
167
168 /**
169 * Since we have only a single MongoClient, don't need to call close/disconnect on it as per
170 * https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection
171 */
172 public void connectToDB() throws Exception {
173
174 // Creating a Mongo client
175 mongo = new MongoClient( HOST, PORT );
176
177 // Creating Credentials
178 MongoCredential credential;
179 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
180 System.out.println("Connected to the database successfully");
181
182 // Accessing the database
183 this.database = mongo.getDatabase(DB_NAME);
184 logger.info("Credentials: "+ credential);
185
186 /*
187 MongoCredential credential;
188 credential = MongoCredential.createCredential(USERNAME, DB_NAME, PASSWORD.toCharArray());
189 logger.info("Credentials: "+ credential);
190
191 // Create our Mongo client
192 mongo = new MongoClient( new ServerAddress(HOST, PORT), credential, new MongoClientOptions.Builder().build());
193 System.out.println("Connected to the database successfully");
194
195 this.database = mongo.getDatabase(DB_NAME);
196 */
197
198 Morphia morphia = new Morphia();
199 morphia.mapPackage("com.greenstone.atea.morphia");
200 datastore = morphia.createDatastore(mongo, DB_NAME);
201 datastore.ensureIndexes();
202
203 }
204
205 // TODO: which fields should be indexed?
206
207 public void showCollections() {
208 //MongoIterable<String> colls = this.database.listCollectionNames();
209 for(String coll : this.database.listCollectionNames()) {
210 System.err.println("coll: " + coll);
211 }
212 }
213
214 /*
215 public void insertWebsiteInfo(WebsiteInfo website)
216 {
217 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
218 Document document = new Document("_id", website.id)
219 .append("siteFolderName", website.siteFolderName)
220 .append("domain", website.domain)
221 .append("basicDomain", website.basicDomain)
222 .append("totalPages", website.totalPages)
223 .append("numPagesWithBodyText", website.countOfWebPagesWithBodyText)
224 .append("numPagesInMRI", website.numPagesInMRI)
225 .append("siteCrawledTimestamp", website.siteCrawledTimestamp)
226 .append("siteCrawlUnfinished", website.siteCrawlUnfinished)
227 .append("redoCrawl", website.redoCrawl);
228
229 document.put("urlContainsLangCodeInpath", website.urlContainsLangCodeInpath);
230 if(website.geoLocationCountryCode != null && !website.geoLocationCountryCode.equals("")) {
231 document.put("countryCode", website.geoLocationCountryCode);
232 }
233
234 collection.insertOne(document);
235 logger.debug("Website info for " + website.id + "(" + website.siteFolderName + ")"
236 + " inserted successfully into " + WEBSITES_COLLECTION);
237 }
238 */
239
240 /**
241 * Inserts a web page into the mongodb. Besides page related metadata and full body text
242 * the language information per sentence and per 2 adjacent sentences also get stored
243 * into the mongodb.
244 */
245 /*
246 public void insertWebpageInfo(WebpageInfo webpage)
247 {
248 int mri_sentence_count = 0;
249
250 // load the webpages db 'table'
251 // in mongodb, the equivalent of db tables are called 'collections'
252 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
253
254 Document document = new Document("_id", webpage.webpageID)
255 .append("siteid", webpage.websiteID)
256 .append("url", webpage.URL)
257 .append("isMRI", webpage.isMRI)
258 .append("totalSentences", webpage.totalSentences)
259 .append("charEncoding", webpage.charEncoding)
260 .append("modTime", webpage.modifiedTime)
261 .append("fetchTime", webpage.fetchTime);
262
263 // INSTEAD, ARRAY OF OBJECTS TO BE INSERTED AS PER:
264 // https://stackoverflow.com/questions/15371839/how-to-add-an-array-to-a-mongodb-document-using-java
265 List<BasicDBObject> sentencesList = new ArrayList<>();
266 for(SentenceInfo sentenceInfo : webpage.singleSentences) {
267
268 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
269
270 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
271 bsonRecord.put("sentence", sentenceInfo.sentence);
272
273 sentencesList.add(bsonRecord);
274
275 if(sentenceInfo.langCode.equals(MaoriTextDetector.MAORI_3LETTER_CODE)) {
276 mri_sentence_count++;
277 }
278
279 }
280 document.put("singleSentences", sentencesList);
281
282 List<BasicDBObject> overlappingSentencesList = new ArrayList<>();
283 for(SentenceInfo sentenceInfo : webpage.overlappingSentences) {
284
285 BasicDBObject bsonRecord = new BasicDBObject("langCode", sentenceInfo.langCode);
286 bsonRecord.put("confidence", sentenceInfo.confidenceLevel);
287 bsonRecord.put("sentence", sentenceInfo.sentence);
288
289 overlappingSentencesList.add(bsonRecord);
290 }
291 document.put("overlappingSentences", overlappingSentencesList);
292
293 // also put the full text in there
294 document.put("text", webpage.text);
295
296 // also store the count of sentences in MRI
297 webpage.setMRISentenceCount(mri_sentence_count);
298 document.put("mriSentenceCount", mri_sentence_count);
299
300
301 collection.insertOne(document);
302 logger.debug("\nwebpage info for " + webpage.webpageID + " inserted successfully into " + WEBPAGES_COLLECTION);
303 }
304 */
305
306 public ArrayList<String> queryAllMatchingIsMRIURLs(String domain) {
307 return queryAllMatchingURLsFilteredBy(domain, IS_MRI);
308 }
309 public ArrayList<String> queryAllMatchingcontainsMRIURLs(String domain) {
310 return queryAllMatchingURLsFilteredBy(domain, CONTAINS_MRI);
311 }
312
313 /**
314 * Java mongodb find: https://mongodb.github.io/mongo-java-driver/3.4/driver/getting-started/quick-start/
315 * Java mongodb find filters: https://mongodb.github.io/mongo-java-driver/3.4/javadoc/?com/mongodb/client/model/Filters.html
316 * Java mongodb projection: https://stackoverflow.com/questions/44894497/retrieving-data-with-mongodb-java-driver-3-4-using-find-method-with-projection
317 * mongodb projection: https://docs.mongodb.com/v3.2/reference/method/db.collection.find/#db.collection.find
318 *
319 * Parse MongoDB query into Java: https://stackoverflow.com/questions/17326747/parsing-strings-to-mongodb-query-documents-with-operators-in-java
320 * Maybe also https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
321 * https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java
322 * http://pingax.com/trick-convert-mongo-shell-query-equivalent-java-objects/
323 */
324 public ArrayList<String> queryAllMatchingURLsFilteredBy(String domain, int filterType) {
325
326 final ArrayList<String> urlsList = new ArrayList<String>();
327
328 // remove any http(s)://(www.) from the start of URL first
329 // since it goes into a regex
330 domain = Utility.stripProtocolAndWWWFromURL(domain);
331
332 // load the "webpages" db table
333 // in mongodb, the equivalent of db tables are called 'collections'
334 MongoCollection<Document> collection = this.database.getCollection(WEBPAGES_COLLECTION);
335
336 // code we'll execute in Iterable.forEach() below
337 // see also https://www.baeldung.com/foreach-java
338 Block<Document> storeURL = new Block<Document>() {
339 @Override
340 public void apply(final Document document) {
341 //System.out.println(document.toJson());
342 String url = document.getString("URL");
343 // add to our urlsList
344 //System.out.println(url);
345 urlsList.add(url);
346 }
347 };
348
349
350 // Run the following mongodb query:
351 // db.getCollection('Webpages').find({URL: /domain/, isMRI: true}, {URL: 1, _id: 0})
352
353 // 1. One way that works:
354 //collection.find(and(eq("isMRI", true), regex("URL", pattern))).projection(fields(include("URL"), excludeId())).forEach(storeURL);
355
356 // 2. Another way:
357 //String query = "{URL: /DOMAIN/, isMRI: true}";
358 String query = "{URL: /DOMAIN/, ";
359 if(filterType == IS_MRI) {
360 query += "isMRI: true}";
361 } else if(filterType == CONTAINS_MRI) {
362 query += "containsMRI: true}";
363 }
364
365 domain = domain.replace(".", "\\."); // escape dots in domain for regex
366 query = query.replace("DOMAIN", domain);
367
368 //System.err.println("Executing find query: " + query);
369
370 BasicDBObject findObj = BasicDBObject.parse(query);
371 BasicDBObject projectionObj = BasicDBObject.parse("{URL: 1, _id: 0}");
372
373
374 collection.find(findObj).projection(projectionObj).forEach(storeURL);
375
376 return urlsList;
377 }
378
379 /**
380 * RUNNING A MONGODB COLLECTION.AGGREGATE() in JAVA:
381 *
382 * https://stackoverflow.com/questions/31643109/mongodb-aggregation-with-java-driver
383 * https://stackoverflow.com/questions/48000891/parse-mongodb-json-query-in-java-with-multiple-criteria
384 * Not Java: https://stackoverflow.com/questions/39060221/a-pipeline-stage-specification-object-must-contain-exactly-one-field-with-php-mo
385 *
386 * (https://stackoverflow.com/questions/55029222/parse-mongodb-query-to-java)
387 * https://www.programcreek.com/java-api-examples/?api=com.mongodb.client.model.Aggregates
388 * On using group(TExpression) inside collection.aggregate().
389 *
390 * For forEach lamba expressions, see also https://www.baeldung.com/foreach-java
391 * and https://www.javatpoint.com/java-8-foreach
392 * and https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
393 *
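 * Count of NZ websites (geolocated in NZ, or with .nz in the domain) containing a
 * positive number of pages in MRI or containing MRI, depending on the filter type,
 * listing all the base domain strings (no protocol or www), which are meant to be
 * written out in ALPHABETICAL ORDER.
 * Note that the pipeline below only counts the matching sites and collects their
 * domains: it does not total numPagesInMRI and numPagesContainingMRI.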
398 *
399 * The mongodb aggregate() we want to run this time:
400 *
401 db.Websites.aggregate([
402 {
403 $match: {
404 $and: [
405 {numPagesContainingMRI: {$gt: 0}},
406 {$or: [{geoLocationCountryCode:"NZ"},{domain: /\.nz/}]}
407 ]
408 }
409 },
410 { $unwind: "$geoLocationCountryCode" },
411 {
412 $group: {
413 _id: "nz",
414 count: { $sum: 1 },
415 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
416 }
417 },
418 { $sort : { count : -1} }
419 ]);
420 */
421 public void aggregateContainsMRIForNZ(Writer writer, int filterType) throws IOException {
422 // working with the WebSites collection, not WebPages collection!
423 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
424
425 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
426
427 Bson orQuery = or(
428 BasicDBObject.parse("{geoLocationCountryCode: \"NZ\"}"),
429 BasicDBObject.parse("{domain: /\\.nz/}")
430 );
431 Bson andQuery = and(
432 BasicDBObject.parse(mriFilterString),
433 orQuery);
434
435 // Hopefully the lambda expression (forEach()) at end means
436 // we write out each result Document as we get it
437 collection.aggregate(Arrays.asList(
438 match(andQuery),
439 unwind("$geoLocationCountryCode"),
440 group("NZ", Arrays.asList(sum("count", 1),
441 addToSet("domain", "$basicDomain"))),
442 sort(BasicDBObject.parse("{count : -1}"))
443 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
444
445 // should only have one doc for NZ since it's a count by geolocation.
446
447 return;
448 }
449
450 /**
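 * Count by country code of non-NZ websites (not geolocated in NZ and without .nz in
 * the domain) containing a positive number of pages in MRI or containing MRI,
 * depending on the filter type, listing all the base domain strings (no protocol or
 * www), which are meant to be written out in ALPHABETICAL ORDER.
 * Note that the pipeline below only counts the matching sites and collects their
 * domains: it does not total numPagesInMRI and numPagesContainingMRI.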
455 *
456 * The aggregate() we want to run this time:
457 *
458 db.Websites.aggregate([
459 {
460 $match: {
461 $and: [
462 {geoLocationCountryCode: {$ne: "NZ"}},
463 {domain: {$not: /\.nz/}},
464 {numPagesContainingMRI: {$gt: 0}},
465 {$or: [{geoLocationCountryCode: "AU"}, {urlContainsLangCodeInPath: false}]}
466 ]
467 }
468 },
469 { $unwind: "$geoLocationCountryCode" },
470 {
471 $group: {
472 _id: {$toLower: '$geoLocationCountryCode'},
473 count: { $sum: 1 },
474 domain: { $addToSet: '$basicDomain' } // domain: {$push: "$basicDomain" }
475 }
476 },
477 { $sort : { count : -1} }
478 ]);
479 */
480 public void aggregateContainsMRIForOverseas(Writer writer, int filterType,
481 boolean isMiInURLPath) throws UncheckedIOException
482 {
483 // working with the WebSites collection, not WebPages collection!
484 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
485
486 String mriFilterString = (filterType == CONTAINS_MRI) ? "{numPagesContainingMRI: {$gt: 0}}" : "{numPagesInMRI: {$gt: 0}}";
487
488 Bson orQuery = or(
489 BasicDBObject.parse("{geoLocationCountryCode: \"AU\"}"),
490 BasicDBObject.parse("{urlContainsLangCodeInPath: "+ isMiInURLPath +"}")
491 // e.g. "{urlContainsLangCodeInPath: false}"
492 );
493 Bson andQuery = and(
494 BasicDBObject.parse("{geoLocationCountryCode: {$ne: \"NZ\"}}"),
495 BasicDBObject.parse("{domain: {$not: /\\.nz/}}"),
496 BasicDBObject.parse(mriFilterString),
497 orQuery);
498
499 collection.aggregate(Arrays.asList(
500 match(andQuery), //match(BasicDBObject.parse(matchQuery))
501 // match((List<DBObject>)JSON.parse(matchQuery)),
502 unwind("$geoLocationCountryCode"),
503 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1),
504 addToSet("domain", "$basicDomain"))),
505 sort(BasicDBObject.parse("{count : -1}"))
506 )).forEach((Block<Document>)doc -> writeDoc(doc, writer));
507
508 // casting to Block<Document> necessary because otherwise we see the error at
509 // https://stackoverflow.com/questions/47979978/ambiguous-reference-to-foreach-when-listing-mongodbs-database-in-java
510
511 // Less efficient way is to keep all the results in memory and then
512 // write them out one at a time
513 /*
514 AggregateIterable<Document> output
515 = collection.aggregate(Arrays.asList(
516 match(andQuery), //match(BasicDBObject.parse(matchQuery))
517 // match((List<DBObject>)JSON.parse(matchQuery)),
518 unwind("$geoLocationCountryCode"),
519 group("$geoLocationCountryCode", Arrays.asList(sum("count", 1), addToSet("domain", "$domain"))),
520 sort(BasicDBObject.parse("{count : -1}"))
521 ));
522
523
524 for (Document doc : output) {
525 //System.out.println(doc);
526 System.out.println(doc.toJson());
527
528 }
529 */
530 return;
531 }
532
533 /** Do the aggregates for writing out tables.
534 Table1:
535
536 */
537 public void writeTables(File outFolder) {
538 // In this function, we're always dealing with the Websites mongodb collection.
539 MongoCollection<Document> collection = this.database.getCollection(WEBSITES_COLLECTION);
540
541 String[] tableNames = { "", "1table_allCrawledSites", "2table_sitesWithPagesInMRI"};
542 for (int tableNum = 1; tableNum < tableNames.length; tableNum++) {
543 File outFile = new File(outFolder, tableNames[tableNum] + ".json");
544 File csvFile = new File(outFolder, tableNames[tableNum] + ".csv");
545 try (
546 Writer writer = new BufferedWriter(new FileWriter(outFile));
547 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(csvFile), CSVFormat.DEFAULT);
548 ) {
549
550 // Write out the CSV column headings
551 // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVPrinter.html
552 csvWriter.printRecord("countryCode", "siteCount",
553 "numPagesInMRI count","numPagesContainingMRICount"/*, "domain"*/);
554
555 AggregateIterable<Document> output = getTable(collection, tableNum); //doTable1().forEach((Block<Document>)doc -> writeDoc(doc, writer));
556
557 int docNum = 0;
558 for (Document doc : output) {
559 //System.out.println(doc);
560 writeDocAsJsonRecord(++docNum, doc, writer);
561 writeDocAsCSVRecord(++docNum, doc, csvWriter);
562 }
563 logger.info("@@@ Wrote out table into file: " + Utility.getFilePath(outFile) + " and .csv");
564 } catch(UncheckedIOException ioe) {
565 logger.error("Caught UncheckedIOException: " + ioe.getMessage(), ioe);
566 }
567 catch(Exception e) {
568 logger.error("Could not write table to file " + outFile + " or .csv equivalent" , e);
569 }
570 }
571 }
572
573 public AggregateIterable<Document> getTable(MongoCollection<Document> collection, int tableNum) {
574
575 AggregateIterable<Document> output = null;
576
577 switch(tableNum) {
578
579 case 1:
580 /* 1table_allCrawledSites -
581
582 db.Websites.aggregate([
583 { $unwind: "$geoLocationCountryCode" },
584 {
585 $group: {
586 _id: "$geoLocationCountryCode",
587 count: { $sum: 1 },
588 //domain: { $addToSet: '$domain' },
589 numPagesInMRICount: { $sum: '$numPagesInMRI' },
590 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
591 }
592 },
593 { $sort : { count : -1} }
594 ]);
595 */
596 output = collection.aggregate(Arrays.asList(
597 //match(BasicDBObject.parse("{urlContainsLangCodeInPath:true}")),
598 unwind("$geoLocationCountryCode"),
599 group("$geoLocationCountryCode", Arrays.asList(
600 sum("count", 1),
601 /*addToSet("domain", "$domain"),*/
602 sum("numPagesInMRICount", "$numPagesInMRI"),
603 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
604 sort(BasicDBObject.parse("{count : -1}"))
605 ));
606 break;
607
608 case 2:
609 /*
610 db.Websites.aggregate([
611 { $match: { numPagesInMRI: {$gt: 0} } },
612 { $unwind: "$geoLocationCountryCode" },
613 {
614 $group: {
615 _id: {$toLower: '$geoLocationCountryCode'}, // ignore toLower
616 count: { $sum: 1 },
617 //domain: { $addToSet: '$domain' },
618 numPagesInMRICount: { $sum: '$numPagesInMRI' },
619 numPagesContainingMRICount: { $sum: '$numPagesContainingMRI' }
620 }
621 },
622 { $sort : { count : -1} }
623 ]);
624 */
625 output = collection.aggregate(Arrays.asList(
626 match(BasicDBObject.parse("{ numPagesInMRI: {$gt: 0} }")),
627 unwind("$geoLocationCountryCode"),
628 group("$geoLocationCountryCode", Arrays.asList(
629 sum("count", 1),
630 /*addToSet("domain", "$domain"),*/
631 sum("numPagesInMRICount", "$numPagesInMRI"),
632 sum("numPagesContainingMRICount", "$numPagesContainingMRI"))),
633 sort(BasicDBObject.parse("{count : -1}"))
634 ));
635 break;
636
637 default: logger.error("Unknown table number: " + tableNum);
638
639 }
640
641 return output;
642
643 }
644
645
646
647 /**
648 * called by lambda forEach() call on Document objects to write them out to a file.
649 * Have to deal with unreported exceptions here that can't be dealt with when doing
650 * the actual forEach(). See
651 * https://stackoverflow.com/questions/39090292/how-to-cleanly-deal-with-unreported-exception-ioexception-in-stream-foreach
652 */
653 public void writeDoc(Document doc, Writer writer) throws UncheckedIOException {
654
655 // If there's a domain field in the json Doc, sort this domain listing alphabetically
656 Object domainList = doc.remove("domain");
657 if(domainList != null) {
658 doc.put("domain", sortAlphabetically(domainList));
659 }
660
661 //OLD WAY: writer.write(doc.toJson(new JsonWriterSettings(JsonMode.STRICT, true)) + NEWLINE);
662 // Can't control json output to add newlines after each array element,
663 // no matter which JsonMode is used.
664
665 // https://mongodb.github.io/mongo-java-driver/3.9/javadoc/index.html?org/bson/json/JsonWriterSettings.html
666 // Still can't control array element output,
667 // but this way uses newer mongo java driver 3.9(.1). Tried its various JsonModes too:
668 //JsonWriterSettings writeSettings = new JsonWriterSettings();
669 //writeSettings.builder().outputMode(JsonMode.SHELL).indent(true).build();
670 //writer.write(doc.toJson(writeSettings) + NEWLINE);
671
672 // Not the JsonWriter of mongodb java driver:
673 // https://stackoverflow.com/questions/54746814/jsonwriter-add-a-new-line
674
675 // Have to use gson's pretty print to produce a json string that contains
676 // newlines after every array element in the json:
677
678 String jsonStr = prettyPrintJson(doc.toJson());
679 //System.err.println(jsonStr);
680 try {
681 writer.write(jsonStr + NEWLINE);
682 } catch (IOException ex) {
683 //throw ex;
684 throw new UncheckedIOException(ex);
685 }
686 }
687
688 private List sortAlphabetically(Object list) {
689 BsonArray domainList = (BsonArray)list;
690 //for(String domain : domainList) {
691 for(int i = domainList.size() - 1; i >= 0; i--) {
692 BsonString domain = domainList.get(i).asString();
693 String domainStr = Utility.stripProtocolAndWWWFromURL(domain.toString());
694 domainList.set(i, new BsonString(domainStr));
695 }
696
697 return domainList;
698 }
699
700 public void writeDocAsJsonRecord(int docNum, Document doc, Writer writer) throws UncheckedIOException {
701 String jsonStr = prettyPrintJson(doc.toJson());
702 //System.err.println(jsonStr);
703 try {
704 writer.write("/* " + docNum + " */\n" + jsonStr + NEWLINE);
705 } catch (IOException ex) {
706 //throw ex;
707 throw new UncheckedIOException(ex);
708 }
709 }
710
711 // TODO
712 //public void writeDocToJsonAndCSV(int docNum, Document doc, Writer writer, CSVPrinter csvWriter) throws UncheckedIOException {
713 public void writeDocAsCSVRecord(int docNum, Document doc, CSVPrinter csvWriter) throws UncheckedIOException {
714 String jsonStr = doc.toJson();
715 JsonParser parser = new JsonParser();
716 JsonElement json = parser.parse(jsonStr);
717
718 JsonObject jsonObj = (JsonObject)json;
719
720 String countryCode = jsonObj.get("_id").getAsString();
721 int siteCount = jsonObj.get("count").getAsInt();
722 int numPagesInMRICount = jsonObj.get("numPagesInMRICount").getAsInt();
723 int numPagesContainingMRICount = jsonObj.get("numPagesContainingMRICount").getAsInt();
724
725 //System.err.println(jsonStr);
726 try {
727 //writer.write("/* " + docNum + " */\n" + prettyPrintJson(jsonStr) + NEWLINE);
728 csvWriter.printRecord(countryCode, siteCount, numPagesInMRICount, numPagesContainingMRICount);
729 } catch (IOException ex) {
730 //throw ex;
731 throw new UncheckedIOException(ex);
732 }
733 }
734
735 public String prettyPrintJson(String jsonStr) {
736 Gson gson = new GsonBuilder().setPrettyPrinting().create();
737 JsonParser jp = new JsonParser();
738 JsonElement je = jp.parse(jsonStr);
739 String prettyJsonString = gson.toJson(je);
740 return prettyJsonString;
741 }
742
743
744 /** https://stackoverflow.com/questions/19938153/do-i-need-to-explicitly-close-connection */
745 public void close() {}
746
747
748 // TODO:
749 // In the database, need to ensure we have else
750 // create collection (table in RDBMS) websites, create collection webpages.
751 // The webpages collection will have sentences embedded based on my decisions from
752 // reading the series
753 // https://www.mongodb.com/blog/post/6-rules-of-thumb-for-mongodb-schema-design-part-1
754 // Then need functions:
755 // insertWebsiteDocument()
756 // insertWebpageDocument()
757
758 public static void main(String args[]) {
759 try {
760 MongoDBAccess mongodbCon = new MongoDBAccess();
761 mongodbCon.connectToDB();
762 mongodbCon.showCollections();
763
764 } catch(Exception e) {
765 e.printStackTrace();
766 }
767 }
768}
Note: See TracBrowser for help on using the repository browser.